diff --git a/src/CUDA/cell_cuda.h b/src/CUDA/cell_cuda.h index e08ec133031bc4b87d124b2607f758826d450a3a..5307a2d6ea126213ce634772d9557dadc3712a2b 100644 --- a/src/CUDA/cell_cuda.h +++ b/src/CUDA/cell_cuda.h @@ -45,6 +45,9 @@ struct cell_cuda { /* IS split? */ int split; + + float h_max_old; + float dx_max_old; }; diff --git a/src/CUDA/runner_cuda_main.cu b/src/CUDA/runner_cuda_main.cu index 8d333322ee63eda2ec2a6f49afc6e8533da5fe8a..9c0095ae79c09b0643a4fe0bc7606ddd259110b2 100644 --- a/src/CUDA/runner_cuda_main.cu +++ b/src/CUDA/runner_cuda_main.cu @@ -115,6 +115,36 @@ __device__ __constant__ float cuda_h_tolerance; __device__ __constant__ float cuda_eta_neighbours; __device__ __constant__ int engine_step_cuda; +__device__ __constant__ int cuda_sortlistID[27] = { + /* ( -1 , -1 , -1 ) */ 0, + /* ( -1 , -1 , 0 ) */ 1, + /* ( -1 , -1 , 1 ) */ 2, + /* ( -1 , 0 , -1 ) */ 3, + /* ( -1 , 0 , 0 ) */ 4, + /* ( -1 , 0 , 1 ) */ 5, + /* ( -1 , 1 , -1 ) */ 6, + /* ( -1 , 1 , 0 ) */ 7, + /* ( -1 , 1 , 1 ) */ 8, + /* ( 0 , -1 , -1 ) */ 9, + /* ( 0 , -1 , 0 ) */ 10, + /* ( 0 , -1 , 1 ) */ 11, + /* ( 0 , 0 , -1 ) */ 12, + /* ( 0 , 0 , 0 ) */ 0, + /* ( 0 , 0 , 1 ) */ 12, + /* ( 0 , 1 , -1 ) */ 11, + /* ( 0 , 1 , 0 ) */ 10, + /* ( 0 , 1 , 1 ) */ 9, + /* ( 1 , -1 , -1 ) */ 8, + /* ( 1 , -1 , 0 ) */ 7, + /* ( 1 , -1 , 1 ) */ 6, + /* ( 1 , 0 , -1 ) */ 5, + /* ( 1 , 0 , 0 ) */ 4, + /* ( 1 , 0 , 1 ) */ 3, + /* ( 1 , 1 , -1 ) */ 2, + /* ( 1 , 1 , 0 ) */ 1, + /* ( 1 , 1 , 1 ) */ 0}; + + int firstrun = 0; __device__ double atomicAdd(double* address, double val) { @@ -229,6 +259,27 @@ __device__ float cuda_pow_dimension_minus_one(float x) { #endif*/ } +__device__ __inline__ int cuda_space_getsid(struct cell_cuda *ci, struct cell_cuda *cj){ + const int periodic = 1; //TODO Assuming periodicity for now + double dx[3], shift[3]; + for(int k = 0; k < 3; k++){ + dx[k] = cj->loc[k] - ci->loc[k]; + if(periodic && dx[k] < dim[k] / 2) + shift[k] = dim[k]; + else if (periodic && dx[k] > dim[k] / 2) + shift[k] = -dim[k]; + else + shift[k] = 0.0; + dx[k] += shift[k]; + } + int sid = 0; + for(int k = 0; k < 3; k++){ + sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1)); + } + sid = cuda_sortlistID[sid]; + return sid; +} + __device__ __inline__ void cuda_kernel_deval(float u, float *restrict W, float *restrict dW_dx) { /* Go to the range [0,1] from [0,H] */ @@ -269,6 +320,8 @@ __device__ void load_cell(int cell_index) { if(threadIdx.x == 0){ cell->ti_end_min = cpu_cell->ti_end_min; cell->ti_end_max = cpu_cell->ti_end_max; + cell->h_max_old = cpu_cell->h_max_old; + cell->dx_max_old = cpu_cell->dx_max_old; } for (i = threadIdx.x; i < cell->part_count; i += blockDim.x) { struct part *current = &parts[i]; @@ -658,6 +711,220 @@ __device__ void dopair_density(struct cell_cuda *ci, struct cell_cuda *cj) { } // Loop over ci. } +__device__ int cuda_cell_can_recurse_in_pair_task(struct cell_cuda *c){ +return c->split && ((kernel_gamma * c->h_max_old + c->dx_max_old) < 0.5f * c->dmin); +} + +/*Subpair task, is asymmetric , subcells of ci interact with subcells of cj ONLY*/ +__device__ void dosubpair_density(struct cell_cuda *ci, struct cell_cuda *cj) { + + int sid; + sid = cuda_space_getsid(ci, cj); + + if(cuda_cell_can_recurse_in_pair_task(ci) && cuda_cell_can_recurse_in_pair_task(cj)){ + + /* Different types of flags. */ + switch (sid) { + + /* Regular sub-cell interactions of a single cell. */ + case 0: /* ( 1 , 1 , 1 ) */ + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + break; + + case 1: /* (1, 1, 0) */ + if (ci->progeny[6] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[1]]); + break; + + case 2: /* 1, 1, -1*/ + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + break; + + case 3: /* 1 0 1*/ + if (ci->progeny[5] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[5] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[7] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[2]]); + break; + + case 4: /* 1 0 0 */ + if (ci->progeny[4] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[4] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[4] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[4] >= 0 && cj->progeny[3] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[3]]); + if (ci->progeny[5] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[5] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[5] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[5] >= 0 && cj->progeny[3] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[3]]); + if (ci->progeny[6] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[6] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[6] >= 0 && cj->progeny[3] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[3]]); + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[7] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[7] >= 0 && cj->progeny[3] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[3]]); + break; + + case 5: /* 1 0 -1 */ + if (ci->progeny[4] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[4] >= 0 && cj->progeny[3] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[3]]); + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[6] >= 0 && cj->progeny[3] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[3]]); + break; + + case 6: /* 1 -1 1*/ + if (ci->progeny[5] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[2]]); + break; + + case 7: /* 1 -1 0*/ + if (ci->progeny[4] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[4] >= 0 && cj->progeny[3] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[3]]); + if (ci->progeny[5] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[5] >= 0 && cj->progeny[3] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[3]]); + break; + + case 8: /* 1 -1 -1*/ + if (ci->progeny[4] >= 0 && cj->progeny[3] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[3]]); + break; + + case 9: /* 0 1 1*/ + if (ci->progeny[3] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[3] >= 0 && cj->progeny[4] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[4] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[4]]); + break; + + case 10: /* 0 1 0 */ + if (ci->progeny[2] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[2] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[2] >= 0 && cj->progeny[4] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[2] >= 0 && cj->progeny[5] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[5]]); + if (ci->progeny[3] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[3] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[3] >= 0 && cj->progeny[4] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[3] >= 0 && cj->progeny[5] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[5]]); + if (ci->progeny[6] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[6] >= 0 && cj->progeny[4] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[6] >= 0 && cj->progeny[5] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[5]]); + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[7] >= 0 && cj->progeny[4] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[7] >= 0 && cj->progeny[5] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[5]]); + break; + + case 11: /* 0 1 -1*/ + if (ci->progeny[2] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[2] >= 0 && cj->progeny[5] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[5]]); + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[6] >= 0 && cj->progeny[5] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[5]]); + break; + + case 12: /* 0 0 1 */ + if (ci->progeny[1] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[1]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[1] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[1]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[1] >= 0 && cj->progeny[4] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[1]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[1] >= 0 && cj->progeny[6] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[1]], &cells_cuda[cj->progeny[6]]); + if (ci->progeny[3] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[3] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[3] >= 0 && cj->progeny[4] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[3] >= 0 && cj->progeny[6] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[6]]); + if (ci->progeny[5] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[5] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[5] >= 0 && cj->progeny[4] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[5] >= 0 && cj->progeny[6] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[6]]); + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[2] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[7] >= 0 && cj->progeny[4] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[7] >= 0 && cj->progeny[6] >= 0) + dosubpair_density(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[6]]); + break; + } + + }else{ + dopair_density(ci, cj); + } +} + + /* Task function to perform a self force task. No symmetry */ __device__ void doself_force(struct cell_cuda *ci) { /* Is the cell active? */ @@ -976,6 +1243,216 @@ __device__ void dopair_force(struct cell_cuda *ci, struct cell_cuda *cj) { } // Loop over cell ci. } +__device__ void dosubpair_force(struct cell_cuda *ci, struct cell_cuda *cj){ + + int sid; + sid = cuda_space_getsid(ci, cj); + + if(cuda_cell_can_recurse_in_pair_task(ci) && cuda_cell_can_recurse_in_pair_task(cj)){ + + /* Different types of flags. */ + switch (sid) { + + /* Regular sub-cell interactions of a single cell. */ + case 0: /* ( 1 , 1 , 1 ) */ + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + break; + + case 1: /* (1, 1, 0) */ + if (ci->progeny[6] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[1]]); + break; + + case 2: /* 1, 1, -1*/ + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + break; + + case 3: /* 1 0 1*/ + if (ci->progeny[5] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[5] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[7] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[2]]); + break; + + case 4: /* 1 0 0 */ + if (ci->progeny[4] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[4] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[4] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[4] >= 0 && cj->progeny[3] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[3]]); + if (ci->progeny[5] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[5] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[5] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[5] >= 0 && cj->progeny[3] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[3]]); + if (ci->progeny[6] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[6] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[6] >= 0 && cj->progeny[3] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[3]]); + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[7] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[7] >= 0 && cj->progeny[3] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[3]]); + break; + + case 5: /* 1 0 -1 */ + if (ci->progeny[4] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[4] >= 0 && cj->progeny[3] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[3]]); + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[6] >= 0 && cj->progeny[3] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[3]]); + break; + + case 6: /* 1 -1 1*/ + if (ci->progeny[5] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[2]]); + break; + + case 7: /* 1 -1 0*/ + if (ci->progeny[4] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[4] >= 0 && cj->progeny[3] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[3]]); + if (ci->progeny[5] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[5] >= 0 && cj->progeny[3] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[3]]); + break; + + case 8: /* 1 -1 -1*/ + if (ci->progeny[4] >= 0 && cj->progeny[3] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[4]], &cells_cuda[cj->progeny[3]]); + break; + + case 9: /* 0 1 1*/ + if (ci->progeny[3] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[3] >= 0 && cj->progeny[4] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[4] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[4]]); + break; + + case 10: /* 0 1 0 */ + if (ci->progeny[2] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[2] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[2] >= 0 && cj->progeny[4] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[2] >= 0 && cj->progeny[5] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[5]]); + if (ci->progeny[3] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[3] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[3] >= 0 && cj->progeny[4] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[3] >= 0 && cj->progeny[5] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[5]]); + if (ci->progeny[6] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[6] >= 0 && cj->progeny[4] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[6] >= 0 && cj->progeny[5] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[5]]); + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[7] >= 0 && cj->progeny[4] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[7] >= 0 && cj->progeny[5] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[5]]); + break; + + case 11: /* 0 1 -1*/ + if (ci->progeny[2] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[2] >= 0 && cj->progeny[5] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[2]], &cells_cuda[cj->progeny[5]]); + if (ci->progeny[6] >= 0 && cj->progeny[1] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[1]]); + if (ci->progeny[6] >= 0 && cj->progeny[5] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[6]], &cells_cuda[cj->progeny[5]]); + break; + + case 12: /* 0 0 1 */ + if (ci->progeny[1] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[1]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[1] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[1]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[1] >= 0 && cj->progeny[4] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[1]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[1] >= 0 && cj->progeny[6] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[1]], &cells_cuda[cj->progeny[6]]); + if (ci->progeny[3] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[3] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[3] >= 0 && cj->progeny[4] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[3] >= 0 && cj->progeny[6] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[3]], &cells_cuda[cj->progeny[6]]); + if (ci->progeny[5] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[5] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[5] >= 0 && cj->progeny[4] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[5] >= 0 && cj->progeny[6] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[5]], &cells_cuda[cj->progeny[6]]); + if (ci->progeny[7] >= 0 && cj->progeny[0] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[0]]); + if (ci->progeny[7] >= 0 && cj->progeny[2] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[2]]); + if (ci->progeny[7] >= 0 && cj->progeny[4] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[4]]); + if (ci->progeny[7] >= 0 && cj->progeny[6] >= 0) + dosubpair_force(&cells_cuda[ci->progeny[7]], &cells_cuda[cj->progeny[6]]); + break; + } + + }else{ + dopair_force(ci, cj); + } +} + + + /* Device function to finish the density calculation. */ __device__ void hydro_end_density(int pid) { @@ -1435,6 +1912,50 @@ __device__ int runner_cuda_gettask(struct queue_cuda *q) { return tid; } +/* Task function to execute a sub-self-density task.*/ +__device__ void dosubself_density( struct cell_cuda *ci ){ + + if(ci->split){ + /* Loop over all progeny */ + for(int k = 0; k < 8; k++){ + if(ci->progeny[k] >= 0){ + dosubself_density(&cells_cuda[ci->progeny[k]]); + for(int j = k+1; j < 8; j++){ + if(ci->progeny[j] >= 0){ + //Pair tasks are asymmetric + dosubpair_density(&cells_cuda[ci->progeny[k]],&cells_cuda[ci->progeny[j]]); + dosubpair_density(&cells_cuda[ci->progeny[j]],&cells_cuda[ci->progeny[k]]); + } + } + } + } + }else{ + doself_density(ci); + } +} + +__device__ void dosubself_force(struct cell_cuda *ci){ + if(ci->split){ + /* Loop over all progeny */ + for(int k = 0; k < 8; k++){ + if(ci->progeny[k] >= 0){ + dosubself_force(&cells_cuda[ci->progeny[k]]); + for(int j = k+1; j < 8; j++){ + if(ci->progeny[j] >= 0){ + //Pair tasks are asymmetric + dosubpair_force(&cells_cuda[ci->progeny[k]],&cells_cuda[ci->progeny[j]]); + dosubpair_force(&cells_cuda[ci->progeny[j]],&cells_cuda[ci->progeny[k]]); + } + } + } + } + }else{ + doself_force(ci); + } + +} + + /* The main kernel. */ __global__ void swift_device_kernel() { __shared__ volatile int tid; @@ -1492,7 +2013,7 @@ __global__ void swift_device_kernel() { } else if (type == type_unload) { unload_cell(tasks[tid].ci); - }else if (type == task_type_pair || type == task_type_sub_pair) { + }else if (type == task_type_pair) { if (subtype == task_subtype_density) { struct cell_cuda *ci = &cells_cuda[tasks[tid].ci]; struct cell_cuda *cj = &cells_cuda[tasks[tid].cj]; @@ -1504,7 +2025,19 @@ __global__ void swift_device_kernel() { dopair_force(ci, cj); dopair_force(cj, ci); } - } else if (type == task_type_self || type == task_type_sub_self) { + } else if(type == task_type_sub_pair) { + if(subtype == task_subtype_density) { + struct cell_cuda *ci = &cells_cuda[tasks[tid].ci]; + struct cell_cuda *cj = &cells_cuda[tasks[tid].cj]; + dosubpair_density(ci,cj); + dosubpair_density(cj,ci); + }else if (subtype == task_subtype_force){ + struct cell_cuda *ci = &cells_cuda[tasks[tid].ci]; + struct cell_cuda *cj = &cells_cuda[tasks[tid].cj]; + dosubpair_force(ci,cj); + dosubpair_force(cj,ci); + } + }else if (type == task_type_self ) { if (subtype == task_subtype_density) { struct cell_cuda *ci = &cells_cuda[tasks[tid].ci]; doself_density(ci); @@ -1512,7 +2045,15 @@ __global__ void swift_device_kernel() { struct cell_cuda *ci = &cells_cuda[tasks[tid].ci]; doself_force(ci); } - } else if (type == task_type_ghost) { + } else if (type == task_type_sub_self){ + if(subtype == task_subtype_density) { + struct cell_cuda *ci = &cells_cuda[tasks[tid].ci]; + dosubself_density(ci); + }else if (subtype == task_subtype_force){ + struct cell_cuda *ci = &cells_cuda[tasks[tid].ci]; + dosubself_force(ci); + } + }else if (type == task_type_ghost) { if(! tasks[tid].implicit){ struct cell_cuda *ci = &cells_cuda[tasks[tid].ci]; @@ -1944,6 +2485,26 @@ __host__ void update_tasks(struct engine *e) { } } } + +/* Relies on assumption implicit unloads are always before unloads in host_tasks, which i believe to be true by conscruction. + for(int i = 0; i < nr_gpu_tasks; i++){ + if(host_tasks[i].type == type_unload && host_tasks[i].type == type_implicit_unload){ + if(host_tasks[i].wait==1){ + host_tasks[i].skip = 1; + task_count--; + struct task_cuda *temp_t = &host_tasks[i]; + int *unlocks = host_unlock_copy + (temp_t->unlocks-host_unlock_pointer); + for(int ii = 0; ii < temp_t->nr_unlock_tasks; ii++){ + if(!host_tasks[unlocks[ii].skip) + host_tasks[unlock[ii]].wait--; + } + *Find the corresponding load task* + Have to search the cells for this at the moment. + + } + + } + }*/ cudaErrCheck(cudaMemcpyToSymbol(tot_num_tasks, &task_count, sizeof(int))); /* Reset the queue data.*/