diff --git a/examples/test_bh_2.cu b/examples/test_bh_2.cu index be5dcd4ba73a655bd6d43cb27befa0cdc06aaaf8..259a31f7d1a5fd8aa732f1d274f86aca523bec45 100644 --- a/examples/test_bh_2.cu +++ b/examples/test_bh_2.cu @@ -34,8 +34,20 @@ #include <fenv.h> /* Local includes. */ +extern "C"{ #include "quicksched.h" +#include "res.h" +} +#include "cuda_queue.h" +/** Task types. */ +enum task_type { + task_type_self = 0, + task_type_pair, + task_type_self_pc, + task_type_com, + task_type_count +}; struct cell{ @@ -45,16 +57,19 @@ double h; int count; unsigned short int split, sorted; int parts, firstchild, sibling; -int res, com_tid; +int res, resz, resm, com_tid; }__attribute__((aligned(64))); + +#define const_G 1 /* Requred variables to obtain cells. */ +#define cell_maxparts 128 #define CELL_STRETCH 2 #define INITIAL_CELLS 256 -struct *cell cell_pool=NULL; +struct cell *cell_pool = NULL; int used_cells=0; -int num_cells = INITIAL_CELLS; +int num_cells = 0; int cell_size = INITIAL_CELLS*sizeof(struct cell); /* Device locations for the particle values. */ @@ -63,7 +78,9 @@ __device__ double *parts_pos_z; __device__ float4 *parts_a_m; __device__ double2 *com_xy; __device__ double *com_z; -__device__ float com_mass; +__device__ float *com_mass; + +__device__ struct cell *cells; /* Host locations for the particle values. */ @@ -72,14 +89,339 @@ double *parts_pos_z_host; float4 *parts_a_m_host; double2 *com_xy_host; double *com_z_host; -float com_mass_host; +float *com_mass_host; + + + + +__device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *cj) { + int i, j, k; + int count_i = ci->count, count_j = cj->count; + int parts_i = ci->parts, parts_j = cj->parts; + double xi[3]; + float dx[3], ai[3], mi, mj, r2, w, ir; + __shared__ double2 parts_xy[cell_maxparts]; + __shared__ double parts_z[cell_maxparts]; + __shared__ float4 parts_am[cell_maxparts]; + + /* Load particles of cell j into shared memory */ + for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) { + parts_xy[j] = parts_pos_xy[k]; + parts_z[j] = parts_pos_z[k]; + parts_am[j] = parts_a_m[k]; + } + + /* Loop over cell i.*/ + for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) { + xi[0] = parts_xy[i].x; + xi[1] = parts_xy[i].y; + xi[2] = parts_z[i]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0f; + } + mi = parts_a_m[i].w; + + for(j = 0; j < count_j; j++) { + r2 = 0.0f; + dx[0] = xi[0] - parts_xy[j].x; + dx[1] = xi[1] - parts_xy[j].y; + dx[2] = xi[2] - parts_z[j]; + r2 += dx[0] * dx[0]; + r2 += dx[1] * dx[1]; + r2 += dx[2] * dx[2]; + + +// ir = 1.0f / sqrtf(r2); + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_am[j].w; + for(k = 0; k < 3; k++) { + ai[k] -= dx[k] * mj * w; + } + } + + atomicAdd(&parts_a_m[i].x, ai[0]); + atomicAdd(&parts_a_m[i].y, ai[1]); + atomicAdd(&parts_a_m[i].z, ai[2]); + + } + + /* Load particles of cell i into shared memory */ + for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) { + parts_xy[j] = parts_pos_xy[k]; + parts_z[j] = parts_pos_z[k]; + parts_am[j] = parts_a_m[k]; + } + /*Loop over cell j. 
*/ + for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) { + xi[0] = parts_xy[i].x; + xi[1] = parts_xy[i].y; + xi[2] = parts_z[i]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0f; + } + mi = parts_a_m[i].w; + + for(j = 0; j < count_j; j++) { + r2 = 0.0f; + dx[0] = xi[0] - parts_xy[j].x; + dx[1] = xi[1] - parts_xy[j].y; + dx[2] = xi[2] - parts_z[j]; + r2 += dx[0] * dx[0]; + r2 += dx[1] * dx[1]; + r2 += dx[2] * dx[2]; + + + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_am[j].w; + for(k = 0; k < 3; k++) { + ai[k] -= dx[k] * mj * w; + } + } + + atomicAdd(&parts_a_m[i].x, ai[0]); + atomicAdd(&parts_a_m[i].y, ai[1]); + atomicAdd(&parts_a_m[i].z, ai[2]); + + } + +} + +/*__device__ void iact_pair(int celli, int cellj) { + + struct cell *ci, *cj; + ci = &cells[celli]; + cj = &cells[cellj]; + + if(Check if neighbours0) + { + if(ci->split && cj->split) { + //Split both cells and do all possible pairs. + + }else { + iact_pair_direct(ci, cj); + } + + } + +}*/ + +__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) { + + int i, k; + double2 j_com_xy; + double j_com_z; + float j_com_mass; + int count = leaf->count; + int parts = leaf->parts; + int cell_j = cj - cells; + float r2, dx[3], ir, w; + + + /* Init the com's data.*/ + j_com_xy = com_xy[cell_j]; + j_com_z = com_z[cell_j]; + j_com_mass = com_mass[cell_j]; + + for(i = parts; i < parts+count; i++) { + + r2 = 0.0; + dx[0] = j_com_xy.x - parts_pos_xy[i].x; + r2 += dx[0] * dx[0]; + dx[1] = j_com_xy.y - parts_pos_xy[i].y; + r2 += dx[1] * dx[1]; + dx[2] = j_com_z - parts_pos_z[i]; + r2 += dx[2] * dx[2]; + + ir = rsqrtf(r2); + w = j_com_mass * const_G * ir * ir * ir; + + parts_a_m[i].x += w * dx[0]; + parts_a_m[i].y += w * dx[1]; + parts_a_m[i].z += w * dx[2]; + } +} + +/** + * @brief Checks whether the cells are direct neighbours ot not + */ +__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) { + + int k; + float dx[3]; + double cih = ci->h, cjh = cj->h; + + float min_dist = 0.5*(cih+cjh); + + float center_i = ci->loc_xy.x + 0.5*cih; + float center_j = cj->loc_xy.x + 0.5*cjh; + dx[0] = fabsf(center_i - center_j); + center_i = ci->loc_xy.y + 0.5*cih; + center_j = cj->loc_xy.y + 0.5*cjh; + dx[1] = fabsf(center_i - center_j); + center_i = ci->loc_z + 0.5*cih; + center_j = cj->loc_z + 0.5*cjh; + dx[2] = fabsf(center_i - center_j); + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} + +__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) { + int k; + float dx[3]; + float min_dist = ci->h; + float center_i = ci->loc_xy.x; + float center_j = cj->loc_xy.x; + dx[0] = fabsf(center_i - center_j); + center_i = ci->loc_xy.y; + center_j = cj->loc_xy.y; + dx[1] = fabsf(center_i - center_j); + center_i = ci->loc_z; + center_j = cj->loc_z; + dx[2] = fabsf(center_i - center_j); + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} + +__device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) { + return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count); +} + +__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) { + + struct cell *cp ,*cps; + + for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { + if(is_inside(leaf, cp)) break; + } + + if(are_neighbours_different_size(cp, cj)) { + for(cps = &cells[cj->firstchild]; cps != &cells[ci->sibling]; cps = &cells[cps->sibling]) { + if(are_neighbours(cp, cps)) 
{ + if(cp->split && cps->split) { + iact_pair_pc(cp, cps, leaf); + } + } else { + make_interact_pc(leaf, cps); + } + } + } + + +} + +/** + * @brief Compute the interactions between all particles in a leaf and + * and all the monopoles in the cell c + * + * @param c The #cell containing the monopoles + * @param leaf The #cell containing the particles + */ +__device__ void iact_self_pc(struct cell *c, struct cell *leaf) { + + struct cell *cp, *cps; + + /* Find the subcell of c the leaf is in.*/ + for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) { + if(is_inside(leaf, cp)) break; + } + + if(cp->split) { + + iact_self_pc(cp, leaf); + + for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) { + + if(cp != cps && cps->split) iact_pair_pc(cp,cps,leaf); + } + + } +} + + + +/** + * @brief Compute the interactions between all particles in a cell. + * + * @param cellID The cell ID to compute interactions on. + */ +__device__ void iact_self_direct(int cellID) { + struct cell *c = &cells[cellID]; + double xi[3] = {0.0, 0.0, 0.0}; + float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w; + __shared__ double2 parts_xy[cell_maxparts]; + __shared__ double parts_z[cell_maxparts]; + __shared__ float4 parts_am[cell_maxparts]; + int parts; + int count; + int i,j,k; + + + //If cell is split, interact each child with itself, and with each of its siblings. + /*if(c->split) { + //TODO + + } else {*/ + parts = c->parts; + count = c->count; + int z = threadIdx.x; + /* Load particle data into shared memory*/ + for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) { + parts_xy[z] = parts_pos_xy[k]; + parts_z[z] = parts_pos_z[k]; + parts_am[z] = parts_a_m[k]; + } + __syncthreads(); + for(i = threadIdx.x; i < count; i += blockDim.x) + { + xi[0] = parts_xy[i].x; + xi[1] = parts_xy[i].y; + xi[2] = parts_z[i]; + for(k = 0; k < 3; k++) { + ai[k] = 0.0; + } + mi = parts_a_m[i].w; + + //for(j = i+1; j!= i; j = (j+1)%count) + for(j = 0; j < count; j++) + { + if(i != j){ + + /* Compute the pairwise distance. */ + r2 = 0.0; + dx[0] = xi[0] - parts_pos_xy[j].x; + r2 += dx[0]*dx[0]; + dx[1] = xi[1] - parts_pos_xy[j].y; + r2 += dx[1]*dx[1]; + dx[2] = xi[2] - parts_pos_z[j]; + r2 += dx[2]*dx[2]; + + /* Apply the gravitational acceleration. */ + + //ir = 1.0f / sqrtf(r2); + ir = rsqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_am[j].w; + for(k = 0; k < 3; k++) { + ai[k] -= w * dx[k] * mj; + } + + } + } + //Update. + atomicAdd(&parts_a_m[i].x,ai[0] ); + atomicAdd(&parts_a_m[i].y,ai[1] ); + atomicAdd(&parts_a_m[i].z,ai[2] ); + } + + +} /** * @brief Checks whether the cells are direct neighbours ot not. 
Both cells have * to be of the same size */ -static inline int are_neighbours(struct cell *ci, struct cell *cj) { +static inline int are_neighbours_host(struct cell *ci, struct cell *cj) { int k; float dx[3]; @@ -101,49 +443,78 @@ static inline int are_neighbours(struct cell *ci, struct cell *cj) { center_j = loc2.y; dx[1] = fabs(center_i - center_j); center_i = ci->loc_z; - center_J = cj->loc_z; + center_j = cj->loc_z; dx[2] = fabs(center_i - center_j); return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); } -struct *cell cell_get() +struct cell *cell_get() { - struct *cell res; + struct cell *res; + if(num_cells == 0) + { + cell_pool = (struct cell*) calloc(INITIAL_CELLS, sizeof(struct cell)); + if(cell_pool == NULL) + error("Failed to allocate cell_pool"); + com_xy_host = (double2*) calloc(INITIAL_CELLS, sizeof(double2)); + if(com_xy_host == NULL) + error("Failed to allocate cell_pool"); + com_z_host = (double*) calloc(INITIAL_CELLS, sizeof(double)); + if(com_z_host == NULL) + error("Failed to allocate cell_pool"); + com_mass_host = (float*) calloc(INITIAL_CELLS, sizeof(float)); + if(com_mass_host == NULL) + error("Failed to allocate cell_pool"); + num_cells = INITIAL_CELLS; + } + if(used_cells >= num_cells) { /* Stretch */ - struct *cell new_pool; + struct cell *new_pool; cell_size *= CELL_STRETCH; - new_pool = (struct *cell) calloc(cell_size); + new_pool = (struct cell*) calloc(num_cells*CELL_STRETCH, sizeof(struct cell)); + if(new_pool == NULL) + error("Failed to allocate new_pool"); if(cell_pool != NULL) memcpy(new_pool, cell_pool, num_cells*sizeof(struct cell)); - double2 *temp = (*double2) calloc(num_cells*sizeof(double2)); - memcpy(temp, com_xy_host, sizeof(double2)*num_cells); + double2 *tempxy = (double2*) calloc(num_cells*CELL_STRETCH, sizeof(double2)); + if(tempxy == NULL) + error("Failed to allocate tempxy"); + memcpy(tempxy, com_xy_host, sizeof(double2)*num_cells); free(com_xy_host); - com_xy_host = temp; - double temp2 = (*double) calloc(num_cells*sizeof(double)); - memcpy(temp2, com_z_host, num_cells*sizeof(double)); + com_xy_host = tempxy; + double *tempz = (double*) calloc(num_cells*CELL_STRETCH, sizeof(double)); + if(tempz == NULL) + error("Failed to allocate tempz"); + memcpy(tempz, com_z_host, num_cells*sizeof(double)); free(com_z_host); - com_z_host = temp2; - float temp3 = (*float) calloc(num_cells*sizeof(float)); - memcpy(temp3, com_mass_host, num_cells*sizeof(float)); + com_z_host = tempz; + float *tempm = (float*) calloc(num_cells*CELL_STRETCH, sizeof(float)); + if(tempm == NULL) + error("Failed to allocate tempm"); + memcpy(tempm, com_mass_host, num_cells*sizeof(float)); free(com_mass_host); - com_mass_host = temp3; + com_mass_host = tempm; num_cells *= CELL_STRETCH; free(cell_pool); cell_pool = new_pool; + + message("Increased size of arrays"); } used_cells++; cell_pool[used_cells-1].sibling = -1; cell_pool[used_cells-1].firstchild = -1; - cell_poo[used_cells-1].res = qsched_res_none; + cell_pool[used_cells-1].res = qsched_res_none; + cell_pool[used_cells-1].resz = qsched_res_none; + cell_pool[used_cells-1].resm = qsched_res_none; return &cell_pool[used_cells-1]; } @@ -156,7 +527,7 @@ void comp_com(struct cell *c){ double com[3] = {0.0, 0.0, 0.0}, mass = 0.0; if(c->split) { - for(cp = &cell_pool[(cpi = c->firstchild)]; cp != &cell_pool[c->sibling]; &cell_pool[(cpi = cp->sibling)]) { + for(cp = &cell_pool[(cpi = c->firstchild)]; cpi != c->sibling; cp = &cell_pool[(cpi = cp->sibling)]) { float cp_mass = com_mass_host[cpi]; com[0] += 
com_xy_host[cpi].x * cp_mass; com[1] += com_xy_host[cpi].y * cp_mass; @@ -208,46 +579,73 @@ void comp_com(struct cell *c){ * @param N The total number of parts. * @param s The #sched to store the resources. */ -void cell_split(struct cell *c, struct qsched *s) { - int i, j, k, kk, count = c->count; - int parts = c->parts; - double2 temp; - double temp1; - float4 temp2; - struct cell *cp; +void cell_split(int c, struct qsched *s) { + int i, j, k, kk, count = cell_pool[c].count; + int parts = cell_pool[c].parts; + double2 tempxy; + double tempxy1; + float4 tempxy2; + struct cell *cp, *cell; int left[8], right[8]; double pivot[3]; - static struct cell *root = NULL; - struct cell *progenitors[8]; + static int root = -1; +// struct cell *progenitors[8]; + int progenitors[8]; + int c1 = c; + double2 *temp_xy; + double *temp_z; + float4 *temp_a_m; /* Set the root cell. */ - if (root == NULL) { + if (root < 0) { root = c; - c->sibling = 0; + cell_pool[c].sibling = -1; } - if(c->res == qsched_res_none) - error("Cell has no resource"); + if(cell_pool[c].res == qsched_res_none) + { + if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[c].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + printf("tempxy = %p\n", temp_xy); + cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy, + sizeof(double2) * cell_pool[c].count, parts_pos_xy + cell_pool[c].parts); + } + if(cell_pool[c].resz == qsched_res_none) + { + if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z, + sizeof(double) * cell_pool[c].count, parts_pos_z + cell_pool[c].parts); + } + if(cell_pool[c].resm == qsched_res_none) + { + if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m, + sizeof(float4) * cell_pool[c].count, parts_a_m + cell_pool[c].parts); + } + // error("Cell has no resource");*///TODO - if(c->count > cell_maxparts ) + if(count > cell_maxparts ) { - c->split = 1; + cell_pool[c].split = 1; for(k = 0; k < 8; k++) { - progenitors[k] = cp = cell_get(); - cp->loc_xy = c->loc_xy; - cp->loc_z = c->loc_z; - cp->h = c->h*0.5; + progenitors[k] = (cp = cell_get()) - cell_pool; + cell = &cell_pool[c]; + cp->loc_xy = cell->loc_xy; + cp->loc_z = cell->loc_z; + cp->h = cell->h*0.5; if(k & 4) cp->loc_xy.x += cp->h; if(k & 2) cp->loc_xy.y += cp->h; if(k & 1) cp->loc_z += cp->h; } /* Init the pivots.*/ - pivot[0] = c->loc_xy.x + c->h * 0.5; - pivot[1] = c->loc_xy.y + c->h * 0.5; - pivot[2] = c->loc_z + c->h * 0.5; + pivot[0] = cell->loc_xy.x + cell->h * 0.5; + pivot[1] = cell->loc_xy.y + cell->h * 0.5; + pivot[2] = cell->loc_z + cell->h * 0.5; /* Split along the x axis. 
*/ i = parts; @@ -257,22 +655,21 @@ void cell_split(struct cell *c, struct qsched *s) { while(i <= parts+count-1 && parts_pos_xy_host[i].x < pivot[0]) i += 1; while(j >= parts && parts_pos_xy_host[j].x >= pivot[0]) j -= 1; if(i < j){ - temp = parts_pos_xy_host[i]; - temp1 = parts_pos_z_host[i]; - temp2 = parts_a_m_host[i]; + tempxy = parts_pos_xy_host[i]; + tempxy1 = parts_pos_z_host[i]; + tempxy2 = parts_a_m_host[i]; parts_pos_xy_host[i] = parts_pos_xy_host[j]; parts_pos_z_host[i] = parts_pos_z_host[j]; parts_a_m_host[i] = parts_a_m_host[j]; - parts_pos_xy_host[j] = temp; - parts_pos_z_host[j] = temp1; - parts_a_m_host[j] = temp2; + parts_pos_xy_host[j] = tempxy; + parts_pos_z_host[j] = tempxy1; + parts_a_m_host[j] = tempxy2; } } left[1] = i; - right[1] parts+count-1; + right[1] = parts+count-1; left[0] = parts; right[0] = j; - /* Split along the y axis twice. */ for (k = 1; k >= 0; k--) { @@ -283,15 +680,15 @@ void cell_split(struct cell *c, struct qsched *s) { while(j >= left[k] && parts_pos_xy_host[j].y >= pivot[1]) j -= 1; if(i < j) { - temp = parts_pos_xy_host[i]; - temp1 = parts_pos_z_host[i]; - temp2 = parts_a_m_host[i]; + tempxy = parts_pos_xy_host[i]; + tempxy1 = parts_pos_z_host[i]; + tempxy2 = parts_a_m_host[i]; parts_pos_xy_host[i] = parts_pos_xy_host[j]; parts_pos_z_host[i] = parts_pos_z_host[j]; parts_a_m_host[i] = parts_a_m_host[j]; - parts_pos_xy_host[j] = temp; - parts_pos_z_host[j] = temp1; - parts_a_m_host[j] = temp2; + parts_pos_xy_host[j] = tempxy; + parts_pos_z_host[j] = tempxy1; + parts_a_m_host[j] = tempxy2; } } left[2*k+1] = i; @@ -299,7 +696,6 @@ void cell_split(struct cell *c, struct qsched *s) { left[2*k] = left[k]; right[2*k] = j; } - /* Split along the z axis four times.*/ for(k = 3; k >=0; k--) { @@ -307,18 +703,18 @@ void cell_split(struct cell *c, struct qsched *s) { j = right[k]; while(i <= j){ while(i <= right[k] && parts_pos_z_host[i] < pivot[2]) i += 1; - while(i >= left[k] && parts_post_z_host[i] >= pivot[2]) j -= 1; + while(j >= left[k] && parts_pos_z_host[j] >= pivot[2]) j -= 1; if(i < j) { - temp = parts_pos_xy_host[i]; - temp1 = parts_pos_z_host[i]; - temp2 = parts_a_m_host[i]; + tempxy = parts_pos_xy_host[i]; + tempxy1 = parts_pos_z_host[i]; + tempxy2 = parts_a_m_host[i]; parts_pos_xy_host[i] = parts_pos_xy_host[j]; parts_pos_z_host[i] = parts_pos_z_host[j]; parts_a_m_host[i] = parts_a_m_host[j]; - parts_pos_xy_host[j] = temp; - parts_pos_z_host[j] = temp1; - parts_a_m_host[j] = temp2; + parts_pos_xy_host[j] = tempxy; + parts_pos_z_host[j] = tempxy1; + parts_a_m_host[j] = tempxy2; } } left[2 * k + 1] = i; @@ -326,27 +722,38 @@ void cell_split(struct cell *c, struct qsched *s) { left[2 * k] = left[k]; right[2 * k] = j; } - + /* Store the counts and offsets. */ for(k = 0; k < 8; k++) { - progenitors[k]->count = right[k]-left[k]+1; - progenitors[k]->parts = left[k]; - //TODO ADD RESOURCES HERE. 
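The added lines below repeat the same registration pattern once per progenitor: look up the device alias of the pinned host sub-array with cudaHostGetDevicePointer, then hand it to the six-argument qsched_addres(s, owner, parent, host_ptr, size, device_ptr) call already used earlier in this file. As a rough sketch only (this helper is not part of the patch; it assumes the same parts_*_host globals, buffers allocated with cudaMallocHost, and device base pointers passed in explicitly instead of read from the __device__ symbols), the per-cell step could be factored like this:

    /* Hypothetical helper, not in this patch: register one cell's three
     * particle sub-arrays (positions xy, position z, accel+mass) as
     * QuickSched resources under the given parent resources. */
    static void cell_add_res(struct qsched *s, struct cell *c,
                             double2 *d_pos_xy, double *d_pos_z, float4 *d_a_m,
                             int parent_xy, int parent_z, int parent_m) {
      double2 *map_xy; double *map_z; float4 *map_am;
      /* Device aliases of the pinned (cudaMallocHost) host buffers. */
      if (cudaHostGetDevicePointer(&map_xy, &parts_pos_xy_host[c->parts], 0) != cudaSuccess ||
          cudaHostGetDevicePointer(&map_z, &parts_pos_z_host[c->parts], 0) != cudaSuccess ||
          cudaHostGetDevicePointer(&map_am, &parts_a_m_host[c->parts], 0) != cudaSuccess)
        error("Failed to get host device pointer.");
      /* One resource per array slice; the last argument is the slice's
       * location in the device-resident copy of the same array. */
      c->res  = qsched_addres(s, qsched_owner_none, parent_xy, map_xy,
                              sizeof(double2) * c->count, d_pos_xy + c->parts);
      c->resz = qsched_addres(s, qsched_owner_none, parent_z, map_z,
                              sizeof(double) * c->count, d_pos_z + c->parts);
      c->resm = qsched_addres(s, qsched_owner_none, parent_m, map_am,
                              sizeof(float4) * c->count, d_a_m + c->parts);
    }

Any task that later touches the cell then locks all three handles (res, resz, resm), which is what the qsched_addlock triples added further down do.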
+ cell_pool[progenitors[k]].count = right[k]-left[k]+1; + cell_pool[progenitors[k]].parts = left[k]; + if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy, + sizeof(double2) * cell_pool[progenitors[k]].count, parts_pos_xy + cell_pool[progenitors[k]].parts); + if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z, + sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z + cell_pool[progenitors[k]].parts); + if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) + error("Failed to get host device pointer."); + cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m, + sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m + cell_pool[progenitors[k]].parts); } /* Find the first non-empty progenitor */ for(k = 0; k < 8; k++) { - if(progenitors[k]->count > 0) + if(cell_pool[progenitors[k]].count > 0) { - c->firstchild = &progenitors[k]-cell_pool; + cell->firstchild = progenitors[k]; break; } } #ifdef SANITY_CHECKS - if(c->firstchild == -1) + if(cell->firstchild == -1) error("Cell has been split but all children have 0 parts"); #endif @@ -355,34 +762,35 @@ void cell_split(struct cell *c, struct qsched *s) { { /* Find the next non-empty sibling */ for(kk = k+1; kk < 8; ++kk){ - if(progenitors[kk]->count > 0){ - progenitors[k]->sibling = &progenitors[kk]-cell_pool; + if(cell_pool[progenitors[kk]].count > 0){ + cell_pool[progenitors[k]].sibling = progenitors[kk]; break; } } /* No non-empty sibling, go back a level.*/ - if(kk == 8) progenitors[k]->sibling = c->sibling; + if(kk == 8) cell_pool[progenitors[k]].sibling = cell->sibling; } /* Recurse */ for(k = 0; k < 8; k++) - if(progenitors[k]->count > 0) cell_split(progenitors[k], s); + if(cell_pool[progenitors[k]].count > 0) cell_split(progenitors[k], s); /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */ } else { - struct cell *data[2] = {root, c}; +// struct cell *data[2] = {root, c}; + int data[2] = {root, c}; int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data, - 2 * sizeof(struct cell *), 1); - qsched_addlock(s, tid, c->res); - //TODO Create task. - //TODO Deal with multiple resources. + 2 * sizeof(int), 1); + qsched_addlock(s, tid, cell_pool[c].res); + qsched_addlock(s, tid, cell_pool[c].resz); + qsched_addlock(s, tid, cell_pool[c].resm); } #ifndef COM_AS_TASK - comp_com(c); + comp_com(&cell_pool[c]); #endif } @@ -396,7 +804,7 @@ void cell_split(struct cell *c, struct qsched *s) { void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ qsched_task_t tid; - int *data[2]; + int data[2]; struct cell /**data[2],*/ *cp, *cps; int cpi; @@ -421,11 +829,13 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, ci->count*ci->count/2); qsched_addlock(s, tid, ci->res); + qsched_addlock(s, tid, ci->resz); + qsched_addlock(s, tid, ci->resm); } } /* Else its a pair!*/ else{ - if(are_neighbours(ci,cj){/* Cells are neighbours */ + if(are_neighbours_host(ci,cj)){/* Cells are neighbours */ /*Are both split? 
*/ if(ci->split && cj->split) { @@ -446,7 +856,11 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ /* Add the resources. */ qsched_addlock(s, tid, ci->res); + qsched_addlock(s, tid, ci->resz); + qsched_addlock(s, tid, ci->resm); qsched_addlock(s, tid, cj->res); + qsched_addlock(s, tid, cj->resz); + qsched_addlock(s, tid, cj->resm); } } @@ -456,6 +870,33 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ } + +__device__ void runner( int type , void *data ) { + + int *idata = (int *)data; + int i = idata[0]; + int j = idata[1]; + + switch ( type ) { + case task_type_self: + iact_self_direct(i); + break; + case task_type_pair: + iact_pair_direct(&cells[i], &cells[j]); + break; + case task_type_self_pc: + iact_self_pc( &cells[i], &cells[j] ); + break; + default: + asm("trap;"); + } + __syncthreads(); +} + +__device__ qsched_funtype function = runner; + + +qsched_funtype func; /** * @brief Set up and run a task-based Barnes-Hutt N-body solver. * @@ -472,7 +913,13 @@ void test_bh(int N, int runs, char *fileName) { struct qsched s; ticks tic, toc_run, tot_setup = 0, tot_run = 0; int countMultipoles = 0, countPairs = 0, countCoMs = 0; + double2 *parts_pos_xy_temp; + double *parts_pos_z_temp; + float4 *parts_a_m_temp; + cudaFree(0); + if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess) + error("Failed to copy function pointer from device"); /* Initialize the scheduler. */ qsched_init(&s, 1, qsched_flag_none); @@ -485,21 +932,40 @@ void test_bh(int N, int runs, char *fileName) { if( cudaMallocHost( &parts_a_m_host, sizeof(float4) * N) != cudaSuccess ) error("Failed to allocate parts array"); + if( cudaMalloc(&parts_pos_xy_temp, sizeof(double2) * N) != cudaSuccess) + error("Failed to allocate device parts array"); + printf("parts_pos_xy_temp = %p\n", parts_pos_xy_temp); + if( cudaMemcpyToSymbol(parts_pos_xy, &parts_pos_xy_temp, sizeof(double2*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to set device symbol for parts array"); + printf("parts_pos_xy = %p\n", parts_pos_xy); + if( cudaMalloc(&parts_pos_z_temp, sizeof(double) * N) != cudaSuccess) + error("Failed to allocate device parts array"); + if( cudaMemcpyToSymbol(parts_pos_z, &parts_pos_z_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to set device symbol for parts array"); + if( cudaMalloc(&parts_a_m_temp, sizeof(float4) * N) != cudaSuccess) + error("Failed to allocate device parts array"); + if( cudaMemcpyToSymbol(parts_a_m, &parts_a_m_temp, sizeof(float4*), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to set device symbol for parts array"); + + if(fileName == NULL || fileName[0] == 0) { for(k = 0; k < N; k++) { - parts_pos_xy_host.x = ((double)rand())/ RAND_MAX; - parts_pos_xy_host.y = ((double)rand())/ RAND_MAX; - parts_pos_z_host = ((double)rand())/ RAND_MAX; - parts_a_m_host.w = ((double)rand()) / RAND_MAX; + parts_pos_xy_host[k].x = ((double)rand())/ RAND_MAX; + parts_pos_xy_host[k].y = ((double)rand())/ RAND_MAX; + parts_pos_z_host[k] = ((double)rand())/ RAND_MAX; + parts_a_m_host[k].w = ((double)rand()) / RAND_MAX; } } else { file = fopen(fileName, "r"); + int tempxy; if(file) { for(k = 0; k < N; k++) { - if(fscanf(file, "%lf%lf%lf", &parts_pos_xy_host.x, &parts_pos_xy_host.y, &parts_pos_z_host) != 3) + if(fscanf(file, "%d", &tempxy) != 1) + error("Failed to read ID"); + if(fscanf(file, "%lf%lf%lf", &parts_pos_xy_host[k].x, &parts_pos_xy_host[k].y, &parts_pos_z_host[k]) != 3) error("Failed to 
read position of part %i.", k); - if(fscanf(file, "%f", &parts[k].mass != 1) + if(fscanf(file, "%f", &parts_a_m_host[k].w) != 1) error("Failed to read mass of part %i.", k); } fclose(file); @@ -514,21 +980,63 @@ void test_bh(int N, int runs, char *fileName) { root->h = 1.0; root->count = N; root->parts = 0; - cell_split(root, &s); - - int c = root - cell_pool; + root->sibling = -1; + int c = root-cell_pool; + cell_split(root - cell_pool, &s); + root = &cell_pool[c]; int nr_leaves = 0; + int maxparts=0, minparts=1000000; + int number = 0; while(c >= 0) { + if(cell_pool[c].count > 0) + { + number++; + if(cell_pool[c].res == qsched_res_none) + message("cell %i has no res", c); + if(cell_pool[c].resz == qsched_res_none) + message("cell %i has no resz", c); + if(cell_pool[c].resm == qsched_res_none) + message("cell %i has no resm", c); + } if(!cell_pool[c].split) { nr_leaves++; - c = c->sibling; + if(cell_pool[c].count > maxparts) + { + maxparts = cell_pool[c].count; + } + if(cell_pool[c].count < minparts) + { + minparts = cell_pool[c].count; + } + c = cell_pool[c].sibling; } else { - c = c->firstchild; + c = cell_pool[c].firstchild; } } - + message("root.sibling = %i, root.split = %i", root->sibling, root->split); + printf("nr_leaves = %i\n", nr_leaves); message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves)); + message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts); + create_tasks(&s, root, NULL); + + message("total number of tasks: %i.", s.count); + message("total number of cells: %i.", number); + message("total number of deps: %i.", s.count_deps); + message("total number of res: %i.", s.count_res); + message("total number of locks: %i.", s.count_locks); + + for(k = 0; k < runs; k++) { + for(i = 0; i < N; ++i) { + parts_a_m_host[i].x = 0.0; + parts_a_m_host[i].y = 0.0; + parts_a_m_host[i].z = 0.0; + } + + //Run code. + printf("gpu_data = %p\n", (int*)s.res[0].gpu_data); + qsched_run_CUDA( &s , func ); + } } @@ -573,7 +1081,7 @@ int main(int argc, char *argv[]) { printf("Size of cell: %zu bytes.\n", sizeof(struct cell)); /* Part information */ - printf("Size of part: %zu bytes.\n", sizeof(struct part)); +// printf("Size of part: %zu bytes.\n", sizeof(struct part)); /* Dump arguments. */ if (fileName[0] == 0) { @@ -587,5 +1095,5 @@ int main(int argc, char *argv[]) { } /* Run the BH test. 
 */
-  test_bh(N, nr_threads, runs, fileName);
+  test_bh(N, runs, fileName);
 }
diff --git a/src/CUDACompile.sh b/src/CUDACompile.sh
index df36d95939ff236fe45ef30ec5a0afa26bb8a7c2..0a3a0bf0b49b085d80024b34763e3a3369a6d48a 100755
--- a/src/CUDACompile.sh
+++ b/src/CUDACompile.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 FLAGS2="-Xcompiler=-fsanitize=address -Xcompiler=-fno-omit-frame-pointer"
 DEBUG_FLAGS="-G -DDEBUG_GPU"
-FLAGS="-O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
+FLAGS="-O3 -g -G -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
 # -DGPU_locks -Xptxas -dlcm=cg -Xptxas="-v""
 # -DNO_LOADS
@@ -33,6 +33,6 @@ cd ../examples
 /home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_hierarchy.o ../src/.libs/libquicksched_cuda.a -o test_heirarchy -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh.cu -lprofiler
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_2.cu -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh.o ../src/.libs/libquicksched_cuda.a -o test_heirarchy -lprofiler
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_2.o ../src/.libs/libquicksched_cuda.a -o test_bh_2 -lprofiler
diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu
index 971255fb4f2e72f9475d9ba5fc107a4f2fd43067..9beba72c9f2fe2189f12a7ccece603762d278888 100644
--- a/src/cuda_queue.cu
+++ b/src/cuda_queue.cu
@@ -739,8 +739,8 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in
     if(parent >= 0)
     {
         /* Create dependecy to parent. */
-        /*qsched_addunlock(s, task, s->res[parent].task );
-        qsched_addunlock(s, s->res[parent].utask, utask );*/
+        // qsched_addunlock(s, task, s->res[parent].task );
+        /// qsched_addunlock(s, s->res[parent].utask, utask );
     }
     for(i = sorted[ID]; i < sorted[ID+1]; i++)
     {
@@ -759,8 +759,8 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in
         /* If it has a parent then set the parents ghost task to be dependent on this.*/
         if(parent >= 0)
         {
-            /*qsched_addunlock(s, task, s->res[parent].task );
-            qsched_addunlock(s, s->res[parent].utask, utask);*/
+            // qsched_addunlock(s, task, s->res[parent].task );
+            // qsched_addunlock(s, s->res[parent].utask, utask);
         }
     }
 }
@@ -1306,6 +1306,10 @@ for(i = 0; i < s->count_res; i++ )
     int parent = s->res[ID].parent;
     struct res *resource = &s->res[ res[i] ];
+    printf("ID = %i, size = %i, numChild = %i, parent = %i\n", ID, size, numChildren, parent);
+    printf("task = %i, utask = %i\n", s->res[ID].task, s->res[ID].utask);
+    if(parent >= 0 && s->res[ID].task == s->res[parent].task)
+        continue;
     /* Loop through children if there are any. */
     if(numChildren > 0)
     {
@@ -1843,7 +1847,7 @@ toc_run = getticks();
     }
     /* Print all dependencies */
-    /*for(i = 0; i < count; i++ )
+    for(i = 0; i < count; i++ )
     {
         printf("Task ID: %i, type=%i, ", i, tasks[i].type);
         for(j = 0; j < tasks[i].nr_unlocks; j++)
@@ -1851,46 +1855,19 @@ toc_run = getticks();
             printf("%i ", tasks[i].unlocks[j]);
         }
         printf("\n");
-    }*/
+    }
     if ( k < count ) {
         printf("k = %i, wait = %i\n", tid[k-1], tasks[tid[k-1]].wait);
-        printf("tasks[0].nr_unlocks=%i\n", tasks[0].nr_unlocks);
-        for(i = 0; i < k-1; i++)
+        for(i = 0; i < count; i++)
         {
-            t = &tasks[tid[i]];
+            t = &tasks[i];
             for(j = 0; j < t->nr_unlocks; j++)
             {
                 if(t->unlocks[j] == tid[k-1])
-                    printf("%i ", tid[i]);
-
-            }
+                    printf("Task %i is unlocking task %i\n", i, tid[k-1]);
+            }
         }
-        printf("\n");
-        for(i = 0; i < tasks[tid[k-1]].nr_unlocks; i++)
-        {
-            printf("%i ", tasks[tid[k-1]].unlocks[i]);
-        }
-        printf("\n");
-        for(i = 0; i < tasks[tid[k-1]].nr_unlocks; i++)
-        {
-            printf("unlocks[%i] = %i wait = %i\n", i, tasks[tid[k-1]].unlocks[i], tasks[tasks[tid[k-1]].unlocks[i]].wait);
-            for(j = 0; j < tasks[tasks[tid[k-1]].unlocks[i]].nr_unlocks; j++)
-            {
-                printf("%i ", tasks[tasks[tid[k-1]].unlocks[i]].unlocks[j]);
-            }
-            printf("\n");
-        }
-        printf("\n\n");
-        for(i = 0; i < k; i++)
-        {
-            if(tasks[tid[i]].type == 0)
-            {
-                int* idata = (int*)&(s->data[tasks[tid[i]].data]);
-                printf("%i %i,", tid[i], idata[0]);
-            }
-        }
-        printf("\n");
         error( "Circular dependencies detected." );
     }
     /* Run through the topologically sorted tasks backwards and