From f0ab955c4be92d874ea9e13ccb33fa3645ff32bb Mon Sep 17 00:00:00 2001 From: aidan <aidan@gtx690.dur.ac.uk> Date: Mon, 2 Mar 2015 14:02:05 +0000 Subject: [PATCH] Code cleanup --- examples/test_bh_2.cu | 145 ++---------------- examples/test_bh_3.cu | 284 ++++++++++-------------------------- examples/test_gpu_simple.cu | 21 +-- examples/test_qr.cu | 158 ++------------------ src/CUDACompile.sh | 6 +- src/cuda_queue.cu | 279 +++-------------------------------- src/qsched.h | 3 +- 7 files changed, 126 insertions(+), 770 deletions(-) diff --git a/examples/test_bh_2.cu b/examples/test_bh_2.cu index 0cdae97..66ecbf5 100644 --- a/examples/test_bh_2.cu +++ b/examples/test_bh_2.cu @@ -40,6 +40,7 @@ extern "C"{ } #include "cuda_queue.h" + /** Task types. */ enum task_type { task_type_self = 0, @@ -59,12 +60,12 @@ unsigned short int split, sorted; int parts, firstchild, sibling; int res, resz, resm, com_tid; -};//__attribute__((aligned(64))); +}; #define const_G 1 /* Requred variables to obtain cells. */ -#define cell_maxparts 128 +#define cell_maxparts 256 #define CELL_STRETCH 2 #define INITIAL_CELLS 256 struct cell *cell_pool = NULL; @@ -101,20 +102,7 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c int count_i = ci->count, count_j = cj->count; int parts_i = ci->parts, parts_j = cj->parts; double xi[3]; - float dx[3], ai[3], mi, mj, r2, w, ir; - __shared__ double2 parts_xy[cell_maxparts]; - __shared__ double parts_z[cell_maxparts]; - __shared__ float4 parts_am[cell_maxparts]; - /*if(threadIdx.x == 0) - printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split, - cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/ - - /* Load particles of cell j into shared memory */ - /*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) { - parts_xy[j] = parts_pos_xy[k]; - parts_z[j] = parts_pos_z[k]; - parts_am[j] = parts_a_m[k]; - }*/ + float dx[3], ai[3], mj, r2, w, ir; /* Loop over cell i.*/ for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) { @@ -124,7 +112,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c for(k = 0; k < 3; k++) { ai[k] = 0.0f; } - mi = parts_a_m[i].w; for(j = parts_j; j < parts_j + count_j; j++) { r2 = 0.0f; @@ -136,16 +123,12 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c r2 += dx[2] * dx[2]; - // ir = 1.0f / sqrtf(r2); ir = rsqrtf(r2); w = const_G * ir * ir * ir; mj = parts_a_m[j].w; for(k = 0; k < 3; k++) { ai[k] -= dx[k] * mj * w; } - // atomicAdd(&parts_a_m[j].x, w*dx[0]*mi); - // atomicAdd(&parts_a_m[j].y, w*dx[1]*mi); - // atomicAdd(&parts_a_m[j].z, w*dx[2]*mi); } atomicAdd(&parts_a_m[i].x, ai[0]); atomicAdd(&parts_a_m[i].y, ai[1]); @@ -154,11 +137,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c } /* Load particles of cell i into shared memory */ - /*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) { - parts_xy[j] = parts_pos_xy[k]; - parts_z[j] = parts_pos_z[k]; - parts_am[j] = parts_a_m[k]; - }*/ /*Loop over cell j. 
*/ for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) { xi[0] = parts_pos_xy[i].x; @@ -167,7 +145,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c for(k = 0; k < 3; k++) { ai[k] = 0.0f; } - mi = parts_a_m[i].w; for(j = parts_i; j < parts_i + count_i; j++) { r2 = 0.0f; @@ -196,29 +173,15 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) { - int i, k; + int i; double2 j_com_xy; double j_com_z; float j_com_mass; int count = leaf->count; int parts = leaf->parts; int cell_j = cj - cells; - int temp; float r2, dx[3], ir, w; - // if(cell_j < 0) -// { -// if(threadIdx.x == 0) - // printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x); - // __syncthreads(); -// asm("trap;"); -// } - - // if(threadIdx.x == 0) - // printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z); - - - temp = cell_j; /* Init the com's data.*/ j_com_xy = com_xy[cell_j]; @@ -236,18 +199,10 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell ir = rsqrtf(r2); w = j_com_mass * const_G * ir * ir * ir; - /* __threadfence(); - if(!isfinite(w * dx[0])){ - printf("Error in make_interact_pc, j_com_mass = %f, cell_j = %i, temp = %i, i = %i, threadIdx.x=%i\n", j_com_mass, cell_j, temp, i, threadIdx.x); asm("trap;");} - if(!isfinite(w * dx[1])){ - printf("Error in make_interact_pc\n"); asm("trap;");} - if(!isfinite(w * dx[2])){ - printf("Error in make_interact_pc\n"); asm("trap;");}*/ atomicAdd( &parts_a_m[i].x , w * dx[0]); atomicAdd( &parts_a_m[i].y , w * dx[1]); atomicAdd( &parts_a_m[i].z , w * dx[2]); } -//__syncthreads(); } /** @@ -255,7 +210,6 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell */ __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) { - int k; float dx[3]; double cih = ci->h, cjh = cj->h; @@ -274,7 +228,7 @@ __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, st } __device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) { - int k; + float dx[3]; float min_dist = ci->h; float center_i = ci->loc_xy.x; @@ -296,13 +250,6 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) { __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) { struct cell *cp ,*cps; - int leafnum = leaf - cells; -//if(threadIdx.x == 0 && leafnum == 23) - // printf("cj = %i\n", cj - cells); -// printf("%i\n", leafnum); - - // if(threadIdx.x == 0) - /// printf("ci = %i, cj = %i, leaf = %i\n", ci - cells, cj - cells, leaf - cells); for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { if(is_inside(leaf, cp)) break; @@ -318,16 +265,12 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf } } else { make_interact_pc(leaf, cps); - // if(threadIdx.x == 0 && leafnum == 23) - // printf("leafnum = %i with cps = %i here\n", leafnum, cps - cells); __syncthreads(); } } }else{ for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) { - // if(threadIdx.x == 0 && leafnum == 23) - // printf("leafnum = %i with cps = %i\n", leafnum, cps - cells); make_interact_pc(leaf, cps); } @@ -347,17 +290,6 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) { struct cell *cp, *cps; - /*if(leaf->split) - { - printf("Leaf split = 1, oh dear."); - asm("trap;"); 
- } - if(c->split > 1) - { - printf("Cell had split > 1\n"); - asm("trap;"); - }*/ - /* Find the subcell of c the leaf is in.*/ /*cp = c; @@ -402,31 +334,13 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) { __device__ void iact_self_direct(int cellID) { struct cell *c = &cells[cellID]; double xi[3] = {0.0, 0.0, 0.0}; - float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w; - __shared__ double2 parts_xy[cell_maxparts]; - __shared__ double parts_z[cell_maxparts]; - __shared__ float4 parts_am[cell_maxparts]; + float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w; int parts; int count; int i,j,k; - //if(threadIdx.x == 0) - // printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split); - //If cell is split, interact each child with itself, and with each of its siblings. - /*if(c->split) { - //TODO - - } else {*/ parts = c->parts; count = c->count; - int z = threadIdx.x; - /* Load particle data into shared memory*/ - /*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) { - parts_xy[z] = parts_pos_xy[k]; - parts_z[z] = parts_pos_z[k]; - parts_am[z] = parts_a_m[k]; - } - __syncthreads();*/ for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x) { xi[0] = parts_pos_xy[i].x; @@ -435,9 +349,7 @@ __device__ void iact_self_direct(int cellID) { for(k = 0; k < 3; k++) { ai[k] = 0.0; } - mi = parts_a_m[i].w; - //for(j = i+1; j!= i; j = (j+1)%count) for(j = parts; j < parts+count; j++) { if(i != j){ @@ -453,7 +365,6 @@ __device__ void iact_self_direct(int cellID) { /* Apply the gravitational acceleration. */ - //ir = 1.0f / sqrtf(r2); ir = rsqrtf(r2); w = const_G * ir * ir * ir; mj = parts_a_m[j].w; @@ -479,7 +390,7 @@ __device__ void iact_self_direct(int cellID) { */ static inline int are_neighbours_host(struct cell *ci, struct cell *cj) { - int k; +// int k; float dx[3]; #ifdef SANITY_CHECKS @@ -508,7 +419,7 @@ static inline int are_neighbours_host(struct cell *ci, struct cell *cj) { struct cell *cell_get() { - struct cell *res; +// struct cell *res; if(num_cells == 0) { @@ -645,9 +556,7 @@ void cell_split(int c, struct qsched *s) { int left[8], right[8]; double pivot[3]; static int root = -1; -// struct cell *progenitors[8]; int progenitors[8]; - int c1 = c; double2 *temp_xy; double *temp_z; float4 *temp_a_m; @@ -662,7 +571,6 @@ void cell_split(int c, struct qsched *s) { { if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[c].parts], 0) != cudaSuccess ) error("Failed to get host device pointer."); -// printf("tempxy = %p\n", temp_xy); cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy, sizeof(double2) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts); } @@ -680,7 +588,6 @@ void cell_split(int c, struct qsched *s) { cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m, sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts); } - // error("Cell has no resource");*///TODO if(count > cell_maxparts ) { @@ -836,13 +743,9 @@ void cell_split(int c, struct qsched *s) { /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. 
*/ } else { -// struct cell *data[2] = {root, c}; int data[2] = {root, c}; int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data, 2 * sizeof(int), 3000); - /*qsched_adduse(s, tid, cell_pool[root].res); - qsched_adduse(s, tid, cell_pool[root].resz); - qsched_adduse(s, tid, cell_pool[root].resm);*/ qsched_addlock(s, tid, cell_pool[c].res); qsched_addlock(s, tid, cell_pool[c].resz); qsched_addlock(s, tid, cell_pool[c].resm); @@ -864,8 +767,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ qsched_task_t tid; int data[2]; - struct cell /**data[2],*/ *cp, *cps; - int cpi; + struct cell *cp, *cps; if(cj == NULL) @@ -967,11 +869,8 @@ qsched_funtype func; void test_bh(int N, int runs, char *fileName) { int i, k; struct cell *root; - struct part *parts; FILE *file; struct qsched s; - ticks tic, toc_run, tot_setup = 0, tot_run = 0; - int countMultipoles = 0, countPairs = 0, countCoMs = 0; struct cell *gpu_ptr_cells; cudaFree(0); @@ -1142,23 +1041,6 @@ float *comm_temp; if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess) error("Failed to copy com pointer to the GPU"); - /* for(i = 0; i < s.count; i++){ - int *idata = (int*)&s.data[s.tasks[i].data]; - if(s.tasks[i].type == task_type_self) - printf("Self task with data[0] = %i\n", idata[0]); - if(s.tasks[i].type == task_type_pair) { - printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]); - } - if(s.tasks[i].type == task_type_self_pc) { - printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]); - } - }*/ - - // printf("com_mass_host[152] = %f\n", com_mass_host[152]); - - - //Run code. -// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data); qsched_run_CUDA( &s , func ); qsched_print_cuda_timers(&s); @@ -1169,16 +1051,12 @@ struct task* tasks = qsched_get_timers( &s , s.count ); for(i = 0; i < s.count; i++) { printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); - // printf("\n"); } } /* Dump the particles to a file */ file = fopen("particle_dump.dat", "w"); -/* fprintf(file, - "# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x " - "a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/ for (k = 0; k < N; ++k) fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n", parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k], @@ -1228,9 +1106,6 @@ int main(int argc, char *argv[]) { /* Tree node information */ printf("Size of cell: %zu bytes.\n", sizeof(struct cell)); - /* Part information */ -// printf("Size of part: %zu bytes.\n", sizeof(struct part)); - /* Dump arguments. */ if (fileName[0] == 0) { message("Computing the N-body problem over %i random particles using %i " diff --git a/examples/test_bh_3.cu b/examples/test_bh_3.cu index a1adfbd..7b3df04 100644 --- a/examples/test_bh_3.cu +++ b/examples/test_bh_3.cu @@ -39,6 +39,9 @@ extern "C"{ #include "res.h" } #include "cuda_queue.h" +//#define double float +//#define double2 float2 + /** Task types. */ enum task_type { @@ -57,20 +60,19 @@ double h; int count; unsigned short int split, sorted; int parts, firstchild, sibling; -int res, /*resz, resm,*/ com_tid; - -};//__attribute__((aligned(64))); +int res, com_tid; +}; struct part{ double loc[3]; float a[3]; float m; -};//__attribute__((aligned(64))); +}; #define const_G 1 /* Requred variables to obtain cells. 
*/ -#define cell_maxparts 256 +#define cell_maxparts 64 #define CELL_STRETCH 2 #define INITIAL_CELLS 256 struct cell *cell_pool = NULL; @@ -79,9 +81,6 @@ int num_cells = 0; int cell_size = INITIAL_CELLS*sizeof(struct cell); /* Device locations for the particle values. */ -//__device__ double2 *parts_pos_xy; -//__device__ double *parts_pos_z; -//__device__ float4 *parts_a_m; __device__ double2 *com_xy; __device__ double *com_z; __device__ float *com_mass; @@ -90,9 +89,6 @@ __device__ struct cell *cells; /* Host locations for the particle values. */ -//double2 *parts_pos_xy_host; -//double *parts_pos_z_host; -//float4 *parts_a_m_host; double2 *com_xy_host; double *com_z_host; float *com_mass_host; @@ -109,20 +105,7 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c int count_i = ci->count, count_j = cj->count; int parts_i = ci->parts, parts_j = cj->parts; double xi[3]; - float dx[3], ai[3], mi, mj, r2, w, ir; - __shared__ double2 parts_xy[cell_maxparts]; - __shared__ double parts_z[cell_maxparts]; - __shared__ float4 parts_am[cell_maxparts]; - /*if(threadIdx.x == 0) - printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split, - cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/ - - /* Load particles of cell j into shared memory */ - /*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) { - parts_xy[j] = parts_pos_xy[k]; - parts_z[j] = parts_pos_z[k]; - parts_am[j] = parts_a_m[k]; - }*/ + float dx[3], ai[3], mj, r2, w, ir; /* Loop over cell i.*/ for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) { @@ -132,7 +115,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c for(k = 0; k < 3; k++) { ai[k] = 0.0f; } - mi = parts_cuda[i].m; for(j = parts_j; j < parts_j + count_j; j++) { r2 = 0.0f; @@ -144,16 +126,12 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c r2 += dx[2] * dx[2]; - // ir = 1.0f / sqrtf(r2); ir = rsqrtf(r2); w = const_G * ir * ir * ir; mj = parts_cuda[j].m; for(k = 0; k < 3; k++) { ai[k] -= dx[k] * mj * w; } - // atomicAdd(&parts_a_m[j].x, w*dx[0]*mi); - // atomicAdd(&parts_a_m[j].y, w*dx[1]*mi); - // atomicAdd(&parts_a_m[j].z, w*dx[2]*mi); } atomicAdd(&parts_cuda[i].a[0], ai[0]); atomicAdd(&parts_cuda[i].a[1], ai[1]); @@ -162,11 +140,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c } /* Load particles of cell i into shared memory */ - /*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) { - parts_xy[j] = parts_pos_xy[k]; - parts_z[j] = parts_pos_z[k]; - parts_am[j] = parts_a_m[k]; - }*/ /*Loop over cell j. 
*/ for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) { xi[0] = parts_cuda[i].loc[0]; @@ -175,7 +148,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c for(k = 0; k < 3; k++) { ai[k] = 0.0f; } - mi = parts_cuda[i].m; for(j = parts_i; j < parts_i + count_i; j++) { r2 = 0.0f; @@ -202,53 +174,30 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c } -__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) { +__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj, float3 *accels) { - int i, k; - double2 j_com_xy; - double j_com_z; - float j_com_mass; + int i; int count = leaf->count; int parts = leaf->parts; int cell_j = cj - cells; - int temp; float r2, dx[3], ir, w; - // if(cell_j < 0) -// { -// if(threadIdx.x == 0) - // printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x); - // __syncthreads(); -// asm("trap;"); -// } - - // if(threadIdx.x == 0) - // printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z); - - - temp = cell_j; - - /* Init the com's data.*/ - j_com_xy = com_xy[cell_j]; - j_com_z = com_z[cell_j]; - j_com_mass = com_mass[cell_j]; for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) { r2 = 0.0; - dx[0] = j_com_xy.x - parts_cuda[i].loc[0]; + dx[0] = com_xy[cell_j].x - parts_cuda[i].loc[0]; r2 += dx[0] * dx[0]; - dx[1] = j_com_xy.y - parts_cuda[i].loc[1]; + dx[1] = com_xy[cell_j].y - parts_cuda[i].loc[1]; r2 += dx[1] * dx[1]; - dx[2] = j_com_z - parts_cuda[i].loc[2]; + dx[2] = com_z[cell_j] - parts_cuda[i].loc[2]; r2 += dx[2] * dx[2]; ir = rsqrtf(r2); - w = j_com_mass * const_G * ir * ir * ir; - atomicAdd( &parts_cuda[i].a[0] , w * dx[0]); - atomicAdd( &parts_cuda[i].a[1] , w * dx[1]); - atomicAdd( &parts_cuda[i].a[2] , w * dx[2]); + w = com_mass[cell_j] * const_G * ir * ir * ir; + (*accels).x+= w*dx[0]; + (*accels).y+= w*dx[1]; + (*accels).z+= w*dx[2]; } -//__syncthreads(); } /** @@ -256,7 +205,6 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell */ __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) { - int k; float dx[3]; double cih = ci->h, cjh = cj->h; @@ -275,7 +223,6 @@ __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, st } __device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) { - int k; float dx[3]; float min_dist = ci->h; float center_i = ci->loc_xy.x; @@ -294,7 +241,7 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) { return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count); } -__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) { +__device__ __forceinline__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf, float3 *accels) { struct cell *cp ,*cps; @@ -311,8 +258,9 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf make interact pc */ - struct cell *loopend; + struct cell *loopend; cp = ci; + #pragma unroll while(cp->split) { for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { @@ -322,26 +270,32 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf if(are_neighbours_different_size(cp, cj)) { loopend = &cells[cj->sibling]; cps = &cells[cj->firstchild]; + #pragma unroll while(cps != loopend) { - if(!are_neighbours(cp, cps)){ - make_interact_pc(leaf, cps); - 
//__syncthreads(); - cps = &cells[cps->sibling]; - }else{ - if(cps->split) - cps = &cells[cps->firstchild]; - else + // if(are_neighbours_different_size(cps, cj)){ + if(!are_neighbours(cp, cps)){ + make_interact_pc(leaf, cps, accels); cps = &cells[cps->sibling]; - } + }else{ + if(cps->split) + cps = &cells[cps->firstchild]; + else + cps = &cells[cps->sibling]; + } + +// }else{ + // struct cell *temp; + // for(temp = &cells[cps->firstchild]; temp != &cells[cps->sibling]; temp = &cells[temp->sibling]) + // make_interact_pc(leaf, temp, accels); + // cps = &cells[cps->sibling]; + // } } }else{ for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) { - make_interact_pc(leaf, cps); - // __syncthreads(); - + make_interact_pc(leaf, cps, accels); } break; } @@ -354,7 +308,7 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf - /* for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { + /* for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { if(is_inside(leaf, cp)) break; } @@ -363,11 +317,11 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) { if(!are_neighbours(cp, cps)) { - make_interact_pc(leaf, cps); + make_interact_pc(leaf, cps, accels); __syncthreads(); } else { if(cp->split && cps->split) { - iact_pair_pc(cp, cps, leaf); + iact_pair_pc(cp, cps, leaf, accels); } } @@ -375,12 +329,11 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf }else{ for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) { - make_interact_pc(leaf, cps); + make_interact_pc(leaf, cps, accels); } - }*/ - -// __syncthreads(); + } +*/ } /** @@ -390,25 +343,21 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf * @param c The #cell containing the monopoles * @param leaf The #cell containing the particles */ -__device__ void iact_self_pc(struct cell *c, struct cell *leaf) { +__device__ __forceinline__ void iact_self_pc(struct cell *c, struct cell *leaf) { struct cell *cp, *cps; + float3 accelerations; - /*if(leaf->split) - { - printf("Leaf split = 1, oh dear."); - asm("trap;"); - } - if(c->split > 1) - { - printf("Cell had split > 1\n"); - asm("trap;"); - }*/ + int i; + accelerations.x = 0.0f; + accelerations.y = 0.0f; + accelerations.z = 0.0f; /* Find the subcell of c the leaf is in.*/ cp = c; cps = c; + #pragma unroll while(c->split) { for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){ @@ -417,12 +366,21 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) { if(cp->split){ for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) { - if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf); + if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf, &accelerations); } } c = cp; } + int parts = leaf->parts; + int count = leaf->count; + + for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) { + atomicAdd( &parts_cuda[i].a[0] , accelerations.x); + atomicAdd( &parts_cuda[i].a[1] , accelerations.y); + atomicAdd( &parts_cuda[i].a[2] , accelerations.z); + } + /*for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) { if(is_inside(leaf, cp)) break; } @@ -446,34 +404,16 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) { * * @param cellID The cell ID to 
compute interactions on. */ -__device__ void iact_self_direct(int cellID) { +__device__ __forceinline__ void iact_self_direct(int cellID) { struct cell *c = &cells[cellID]; double xi[3] = {0.0, 0.0, 0.0}; - float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w; - __shared__ double2 parts_xy[cell_maxparts]; - __shared__ double parts_z[cell_maxparts]; - __shared__ float4 parts_am[cell_maxparts]; + float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w; int parts; int count; int i,j,k; - //if(threadIdx.x == 0) - // printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split); - //If cell is split, interact each child with itself, and with each of its siblings. - /*if(c->split) { - //TODO - - } else {*/ parts = c->parts; count = c->count; - int z = threadIdx.x; - /* Load particle data into shared memory*/ - /*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) { - parts_xy[z] = parts_pos_xy[k]; - parts_z[z] = parts_pos_z[k]; - parts_am[z] = parts_a_m[k]; - } - __syncthreads();*/ for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x) { xi[0] = parts_cuda[i].loc[0]; @@ -482,9 +422,7 @@ __device__ void iact_self_direct(int cellID) { for(k = 0; k < 3; k++) { ai[k] = 0.0; } - mi = parts_cuda[i].m; - //for(j = i+1; j!= i; j = (j+1)%count) for(j = parts; j < parts+count; j++) { if(i != j){ @@ -500,7 +438,6 @@ __device__ void iact_self_direct(int cellID) { /* Apply the gravitational acceleration. */ - //ir = 1.0f / sqrtf(r2); ir = rsqrtf(r2); w = const_G * ir * ir * ir; mj = parts_cuda[j].m; @@ -526,7 +463,7 @@ __device__ void iact_self_direct(int cellID) { */ static inline int are_neighbours_host(struct cell *ci, struct cell *cj) { - int k; +// int k; float dx[3]; #ifdef SANITY_CHECKS @@ -555,7 +492,6 @@ static inline int are_neighbours_host(struct cell *ci, struct cell *cj) { struct cell *cell_get() { - struct cell *res; if(num_cells == 0) { @@ -616,8 +552,6 @@ struct cell *cell_get() cell_pool[used_cells-1].sibling = -1; cell_pool[used_cells-1].firstchild = -1; cell_pool[used_cells-1].res = qsched_res_none; - //cell_pool[used_cells-1].resz = qsched_res_none; - //cell_pool[used_cells-1].resm = qsched_res_none; return &cell_pool[used_cells-1]; } @@ -686,19 +620,12 @@ void cell_split(int c, struct qsched *s) { int i, j, k, kk, count = cell_pool[c].count; int parts = cell_pool[c].parts; struct part temp_part; - double2 tempxy; - double tempxy1; - float4 tempxy2; struct cell *cp, *cell; int left[8], right[8]; double pivot[3]; static int root = -1; -// struct cell *progenitors[8]; int progenitors[8]; - int c1 = c; struct part *temp_xy; - double *temp_z; - float4 *temp_a_m; /* Set the root cell. 
*/ if (root < 0) { @@ -710,25 +637,9 @@ void cell_split(int c, struct qsched *s) { { if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[c].parts], 0) != cudaSuccess ) error("Failed to get host device pointer."); -// printf("tempxy = %p\n", temp_xy); cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy, sizeof(struct part) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts); } - /*if(cell_pool[c].resz == qsched_res_none) - { - if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess ) - error("Failed to get host device pointer."); - cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z, - sizeof(double) * cell_pool[c].count, parts_pos_z_temp + cell_pool[c].parts); - } - if(cell_pool[c].resm == qsched_res_none) - { - if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess ) - error("Failed to get host device pointer."); - cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m, - sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts); - }*/ - // error("Cell has no resource");*///TODO if(count > cell_maxparts ) { @@ -818,14 +729,6 @@ void cell_split(int c, struct qsched *s) { error("Failed to get host device pointer."); cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy, sizeof(struct part) * cell_pool[progenitors[k]].count, parts_temp + cell_pool[progenitors[k]].parts); - /*if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) - error("Failed to get host device pointer."); - cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z, - sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z_temp + cell_pool[progenitors[k]].parts); - if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) - error("Failed to get host device pointer."); - cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m, - sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m_temp + cell_pool[progenitors[k]].parts);*/ } /* Find the first non-empty progenitor */ @@ -870,12 +773,7 @@ void cell_split(int c, struct qsched *s) { int data[2] = {root, c}; int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data, 2 * sizeof(int), 3000); - /*qsched_adduse(s, tid, cell_pool[root].res); - qsched_adduse(s, tid, cell_pool[root].resz); - qsched_adduse(s, tid, cell_pool[root].resm);*/ qsched_addlock(s, tid, cell_pool[c].res); - // qsched_addlock(s, tid, cell_pool[c].resz); - // qsched_addlock(s, tid, cell_pool[c].resm); } #ifndef COM_AS_TASK @@ -894,8 +792,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ qsched_task_t tid; int data[2]; - struct cell /**data[2],*/ *cp, *cps; - int cpi; + struct cell *cp, *cps; if(cj == NULL) @@ -917,8 +814,6 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ data[1] = -1; tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2); qsched_addlock(s, tid, ci->res); - //qsched_addlock(s, tid, ci->resz); - //qsched_addlock(s, tid, ci->resm); } } /* Else its a pair!*/ @@ -946,11 +841,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ /* Add the resources. 
*/ qsched_addlock(s, tid, ci->res); - // qsched_addlock(s, tid, ci->resz); - // qsched_addlock(s, tid, ci->resm); qsched_addlock(s, tid, cj->res); - // qsched_addlock(s, tid, cj->resz); - // qsched_addlock(s, tid, cj->resm); } } @@ -961,7 +852,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){ } -__device__ void runner( int type , void *data ) { +__device__ __forceinline__ void runner( int type , void *data ) { int *idata = (int *)data; int i = idata[0]; @@ -977,6 +868,7 @@ __device__ void runner( int type , void *data ) { iact_self_pc( &cells[i], &cells[j] ); break; default: + printf("Got to default?\n"); asm("trap;"); } __syncthreads(); @@ -997,14 +889,12 @@ qsched_funtype func; void test_bh(int N, int runs, char *fileName) { int i, k; struct cell *root; - struct part *parts; FILE *file; struct qsched s; - ticks tic, toc_run, tot_setup = 0, tot_run = 0; - int countMultipoles = 0, countPairs = 0, countCoMs = 0; struct cell *gpu_ptr_cells; cudaFree(0); + cudaThreadSetCacheConfig(cudaFuncCachePreferL1); if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess) error("Failed to copy function pointer from device"); @@ -1068,10 +958,6 @@ void test_bh(int N, int runs, char *fileName) { number++; if(cell_pool[c].res == qsched_res_none) message("cell %i has no res", c); - //if(cell_pool[c].resz == qsched_res_none) - // message("cell %i has no resz", c); - //if(cell_pool[c].resm == qsched_res_none) - // message("cell %i has no resm", c); } if(!cell_pool[c].split) { nr_leaves++; @@ -1091,10 +977,6 @@ void test_bh(int N, int runs, char *fileName) { message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves)); message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts); - /* for(k = 0; k < num_cells; k++) - if(cell_pool[k].split > 1 ) - printf("Split > 1\n");*/ - create_tasks(&s, root, NULL); int self = 0, pair = 0, pc = 0; @@ -1160,36 +1042,19 @@ float *comm_temp; if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess) error("Failed to copy com pointer to the GPU"); - /* for(i = 0; i < s.count; i++){ - int *idata = (int*)&s.data[s.tasks[i].data]; - if(s.tasks[i].type == task_type_self) - printf("Self task with data[0] = %i\n", idata[0]); - if(s.tasks[i].type == task_type_pair) { - printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]); - } - if(s.tasks[i].type == task_type_self_pc) { - printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]); - } - }*/ - - // printf("com_mass_host[152] = %f\n", com_mass_host[152]); - - - //Run code. 
-// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data); qsched_run_CUDA( &s , func ); qsched_print_cuda_timers(&s); k = 0; printf("%e, %e, %e, %e, %e, %e, %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2], parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]); -struct task* tasks = qsched_get_timers( &s , s.count ); - for(i = 0; i < s.count; i++) - { - printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); +//struct task* tasks = qsched_get_timers( &s , s.count ); +// for(i = 0; i < s.count; i++) +// { +// printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); // printf("\n"); - } +// } } /* Dump the particles to a file */ @@ -1224,7 +1089,6 @@ int main(int argc, char *argv[]) { case 't': if (sscanf(optarg, "%d", &nr_threads) != 1) error("Error parsing number of threads."); -// omp_set_num_threads(nr_threads); break; case 'f': if (sscanf(optarg, "%s", &fileName[0]) != 1) @@ -1247,7 +1111,7 @@ int main(int argc, char *argv[]) { printf("Size of cell: %zu bytes.\n", sizeof(struct cell)); /* Part information */ -// printf("Size of part: %zu bytes.\n", sizeof(struct part)); + printf("Size of part: %zu bytes.\n", sizeof(struct part)); /* Dump arguments. */ if (fileName[0] == 0) { diff --git a/examples/test_gpu_simple.cu b/examples/test_gpu_simple.cu index 066b53b..0924717 100644 --- a/examples/test_gpu_simple.cu +++ b/examples/test_gpu_simple.cu @@ -67,7 +67,6 @@ __device__ void quarter(float *data) __global__ void Manual(float *src) { - int i; int datas = blockIdx.x; cuda_data[datas*1000+threadIdx.x] = src[datas*1000+threadIdx.x]; @@ -109,13 +108,13 @@ __global__ void Setup() int main ( int argc , char *argv[] ) { - float *array, *cuda_array, *cuda_array2, *device_array; + float *array, *cuda_array, *device_array; int i, k=0; qsched_funtype func; struct qsched s; qsched_task_t *tid; qsched_res_t *rid; - ticks tic, toc_run, tot_setup, tot_run = 0; + ticks tic, toc_run, tot_run = 0; qsched_init( &s , 1 , qsched_flag_none ); cudaDeviceReset(); cudaSetDevice(0); @@ -134,11 +133,6 @@ int main ( int argc , char *argv[] ) { array[i] = (float)i; } - // if(cudaHostGetDevicePointer(&cuda_array , &array[k*1000] , 0) != cudaSuccess) - //error("Failed to get device pointer for data: %s", cudaGetErrorString(cudaPeekAtLastError())); - -// if ( cudaMallocHost(&array, M*sizeof(float)) != cudaSuccess ) - // error("Failed to allocate array"); /* Initialize the scheduler. */ @@ -193,16 +187,9 @@ printf("Starting second run\n"); printf("Second run complete\n"); struct task *completed_tasks = qsched_get_timers( &s, s.count ); -/* tic = getticks(); - qsched_run_CUDA( &s , func ); - toc_run = getticks(); - message( "qsched_run_CUDA took %lli ticks..." 
, toc_run - tic ); - tot_run += toc_run - tic;*/ -// if( cudaFreeHost( array) != cudaSuccess) - // error("Failed to free array"); cudaDeviceReset(); @@ -215,8 +202,6 @@ cudaDeviceReset(); error("Failed to get device pointer for data: %s",cudaGetErrorString(cudaPeekAtLastError())); if( cudaMalloc(&device_array , M*sizeof(float) ) != cudaSuccess ) error("Failed to allocate device array: %s", cudaGetErrorString(cudaPeekAtLastError())); - /*if( cudaMemcpy( &device_array , array , M*sizeof(float), cudaMemcpyHostToDevice ) != cudaSuccess ) - error("Failed to copy device array: %s", cudaGetErrorString(cudaPeekAtLastError()));*/ if( cudaMemcpyToSymbol( cuda_data , &device_array,sizeof(float*), 0 , cudaMemcpyHostToDevice) != cudaSuccess ) error("Failed to copy array pointer to device: %s", cudaGetErrorString(cudaPeekAtLastError())); tic = getticks(); @@ -229,6 +214,4 @@ cudaDeviceReset(); if(array[i] != ((float)i)*((float)i)*0.5f) printf("%i wrong, %.3f != %.3f\n", i, array[i], ((float)i)*((float)i)*0.5f ); - //printf("%.3f\n", array[2]); - } diff --git a/examples/test_qr.cu b/examples/test_qr.cu index b93b68d..f195f3a 100644 --- a/examples/test_qr.cu +++ b/examples/test_qr.cu @@ -22,11 +22,8 @@ extern "C"{ int g_size; enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT} ; -//#define TID threadIdx.x #define numthreads 128 -//#define tilesize 32 #define cuda_maxtasks 1000000 -//#define CO(x,y,ldm) (((y)*ldm) + (x)) #define WARPS 4 __device__ float *GPU_matrix; @@ -145,7 +142,7 @@ float* generateColumnMatrix(int size, unsigned long int m_z) { float* matrix; - cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size/**size*/); + cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size); if(code != cudaSuccess) printf("%s size = %i g_size = %i\n", cudaGetErrorString(code),size, g_size); else @@ -211,8 +208,6 @@ __device__ inline void reduceSumMultiWarp( float* value ) } __threadfence(); *value = stuff[group * 32]; - //printf("Not supported.\n"); - //asm("trap;"); #endif } @@ -236,7 +231,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float* int i, j; float norm=0.0, sign, u1, tau, z; int TID = threadIdx.x % 32; - //int set = threadIdx.x / 32; float w; /*Find the householder vector for each row. 
*/ for(i = 0; i < tilesize; i++) @@ -253,7 +247,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float* else sign = 1; - // sign = __shfl(sign, i, 32); norm = sqrt(norm); if(TID >= i ) w = cornerTile[ i*tilesize + TID ]; @@ -262,13 +255,8 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float* -// if(TID==i) u1 = cornerTile[i*tilesize + i] - sign*norm; - // __syncthreads(); -// u1 = __shfl(u1, i, 32); - // if(i==1) -// printf("%.3f\n", u1); if(u1 != 0.0) { if(TID > i) @@ -285,8 +273,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float* else tau = 0.0; - //if(TID == 0) - // printf("%.3f\n", sign); /*Store the below diagonal vector */ if(TID > i) @@ -358,17 +344,9 @@ void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int /* Below Diagonal!*/ /*Compute w'*A_j*/ z = w * rowTile[j*tilesize+TID]; - //for(n = i; n < tileSize; n++) - //{ - // z = z + w[n] * rowTile[j*tileSize+n]; - //} reduceSumMultiWarp(&z); if(TID >= i) rowTile[j*tilesize + TID] = rowTile[j*tilesize + TID] - tauMatrix[(kk*tilesize+i)*tauNum + kk] * w * z; - //for(n = i; n < tileSize; n++) - //{ - // rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z; - //} } @@ -393,16 +371,14 @@ void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int */ __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, int tilesize, int ii, int kk, volatile float* tauMatrix, int tauNum ) { - int i, j, n; + int i, j; float norm=0.0, sign, u1, tau, z; float w, wupper; int TID = threadIdx.x % 32; - //int set = threadIdx.x / 32; /* For each column compute the householder vector. */ for(i = 0; i < tilesize; i++) { -// norm = 0.0; wupper = cornerTile[i*tilesize + i]; @@ -446,33 +422,6 @@ __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, in columnTile[i*tilesize+TID] = w; if(threadIdx.x == 0) tauMatrix[(kk*tilesize+i)*tauNum+ii] = tau; - /* Apply to each row to the right.*/ -// for(j = i; j < tilesize; j++) - // { - /* Find w'*A_j, w is 0s except for first value with upper tile.*/ - // z = 1.0 * cornerTile[ j*tilesize+i ]; - // for(n = 0; n < tilesize; n++) - // { - // z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ]; -// } - /* Apply to upper tile.*/ - // cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z; - // for(n = i+1; n < tilesize; n++) - // { - // cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z; - // } - /* Apply to lower tile.*/ -// for(n = 0; n < tilesize; n++) - // { - // columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z; - // } - - // } - /* Store w*/ -// for(j = 0; j < tilesize; j++){ - // columnTile[ i*tilesize+j ] = w[tilesize+j]; - // } - // tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau; } } @@ -495,7 +444,7 @@ __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, in */ __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,volatile float* rowTile, int tilesize, int ii, int jj, int kk,volatile float* tauMatrix, int tauNum ) { - int i, j, n; + int i, j; float z; float w; int TID = threadIdx.x % 32; @@ -520,30 +469,6 @@ __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,vo cornerTile[j*tilesize+TID] = cornerTile[j*tilesize+TID] - tau*w*z; } __syncthreads(); - /*for(j = 0; j < i; j++) - w[j] = 0.0; - w[i] = 1.0; - for(j = i+1; j < tilesize; j++) - w[j] = 0.0; - for(j = 0; j < tilesize; j++) - 
w[j+tilesize] = columnTile[i*tilesize +j];*/ - - /* Apply householder vector (w) to the tiles.*/ -/* for(j = 0; j < tilesize; j++) - { - z = 0.0;*/ - /* Compute w' * A_j */ -/* for(n = 0; n < tilesize; n++) - { - z += w[n] * rowTile[j*tilesize+n]; - z += w[n + tilesize] * cornerTile[j*tilesize+n]; - } - for(n = 0; n < tilesize; n++) - { - rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z; - cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z; - } - }*/ } } @@ -551,37 +476,25 @@ __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,vo __device__ void runner ( int type , void *data ) { __shared__ volatile float blockCache[(32*32) + 128]; - volatile float *workVector; - workVector = blockCache + (32*32); /* Decode the task data. */ int *idata = (int *)data; int i = idata[0]; int j = idata[1]; int k = idata[2]; - // int z; -// double buff[ 2*K*K ]; /* Decode and execute the task. */ switch ( type ) { case task_SGEQRF: - //if(threadIdx.x == 0) - // printf("SGEQRF %i %i\n", k, (k*cuda_m+k)*32*32); if(threadIdx.x < 32) SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m); break; case task_SLARFT: - // if(threadIdx.x == 0) - // printf("SLARFT %i %i %i\n", k,j, (j*cuda_m+k)*32*32); SLARFT( &GPU_matrix[(k*cuda_m +k)*32*32], &GPU_matrix[(j*cuda_m+k)*32*32], 32, j, k, GPU_tau, cuda_m); break; case task_STSQRF: - // if(threadIdx.x == 0) - // printf("STSQRF %i %i %i\n", k,i, (k*cuda_m+i)*32*32); if(threadIdx.x < 32) STSQRF( &GPU_matrix[(k*cuda_m+k)*32*32], &GPU_matrix[(k*cuda_m + i)*32*32], 32, i, k, GPU_tau, cuda_m); break; case task_SSSRFT: - // if(threadIdx.x == 0) - // printf("SSSRFT %i %i %i %i\n", k,j, i, (j*cuda_m+i)*32*32); SSSRFT( &GPU_matrix[(j*cuda_m+i)*32*32],&GPU_matrix[(k*cuda_m + i)*32*32],&GPU_matrix[(j*cuda_m+k)*32*32], 32, i, j, k, GPU_tau, cuda_m ); break; default: @@ -630,8 +543,6 @@ float* columnToTile( float* columnMatrix, int size , int m , int n ) cudaMallocHost(&TileMatrix, sizeof(float) * size ); if(TileMatrix == NULL) error("failed to allocate TileMatrix"); - int rows = m*32; - int columns = n*32; int i,j,k,l; for( i = 0; i < n ; i++ ) @@ -691,8 +602,6 @@ float* createIdentity(int m, int n) cudaMallocHost(&Matrix, sizeof(float) * m*n*32*32); if(Matrix == NULL) error("Failed to allocate Matrix"); - int rows = m*32; - int columns = n*32; int i, j; memset ( Matrix, 0, sizeof(float)*m*n*32*32 ); @@ -729,24 +638,6 @@ void printMatrix(float* Matrix, int m, int n) } -/*void printTileMatrix(float* Matrix, int m, int n) -{ - int i,j; - float *tempMatrix, *tempMatrix2; - for(i = 0; i < m*32; i++) - { - int tiled = i/32; - tempMatrix = &Matrix[tiled*32*32]; - for(j = 0; j < n*32; j++) - { - int tile = j/32; - tempMatrix2 = &tempMatrix[tile*32*32*m + i%32]; - printf(" %.1f ", tempMatrix2[j*32]); - //printf(" %.1f,%i ", tempMatrix2[j*32], &tempMatrix2[j*32]-Matrix); - } - printf("\n"); - } -}*/ qsched_funtype func; @@ -759,7 +650,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) qsched_task_t *tid, tid_new; qsched_res_t *rid; int data[3]; - ticks tic, toc_run, tot_setup, tot_run = 0; +// ticks tic, tot_run = 0; #ifdef NO_LOADS ticks tic_loads; #endif @@ -776,22 +667,15 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) /* Allocate and fill the original matrix. 
*/ if(cudaMallocHost(&A, sizeof(float) * m * n * K * K ) != cudaSuccess || - cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess /*|| - cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K ) != cudaSuccess*/ ) + cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess ) error("Failed to allocate matrices."); g_size = g_size + sizeof(float) * m * n * K * K + sizeof(float) * m * n * K; - //cudaFreeHost(A_orig); -// for ( k = 0 ; k < m * n * K * K ; k++ ) - // A_orig[k] = 2.0f*((float)rand()) / RAND_MAX - 1.0f; A_orig = generateColumnMatrix(m*n*K*K, 35532); - // printMatrix(A_orig, m, n); float *temp = columnToTile(A_orig, m * n * K *K, m , n); cudaFreeHost(A_orig); A_orig = temp; temp = tileToColumn(A_orig, m*n*K*K, m , n, K); -// printMatrix(temp, m, n); -// printTileMatrix(A_orig, m, n); memcpy( A , A_orig , sizeof(float) * m * n * K * K ); bzero( tau , sizeof(float) * m * n * K ); @@ -814,7 +698,6 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) error("Failed to copy matrix pointer to the device"); /* Allocate and init the task ID and resource ID matrix. */ - tic = getticks(); /* Allocate and init the task ID and resource ID matrix. */ if( cudaMallocHost(&tid , sizeof(qsched_task_t) * m*n ) != cudaSuccess ) error("Failed to allocate tid"); @@ -860,7 +743,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) data[0] = i; data[1] = k; data[2] = k; tid_new = qsched_addtask( &s , task_STSQRF , task_flag_none , data , sizeof(int)*3 , 3 ); qsched_addlock(&s, tid_new, rid[k * m + i]); - qsched_adduse(&s, tid_new, rid[k * m + k]); + qsched_addlock(&s, tid_new, rid[k * m + k]); qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new); if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new); tid[k * m + i] = tid_new; @@ -871,7 +754,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) tid_new = qsched_addtask( &s , task_SSSRFT , task_flag_none , data , sizeof(int)*3 , 5 ); qsched_addlock(&s, tid_new, rid[j * m + i]); qsched_adduse(&s, tid_new, rid[k * m + i]); - qsched_adduse(&s, tid_new, rid[j * m + k]); + qsched_addlock(&s, tid_new, rid[j * m + k]); qsched_addunlock(&s, tid[k * m + i], tid_new); qsched_addunlock(&s, tid[j * m + i - 1], tid_new); if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new); @@ -905,15 +788,12 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) - // printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n); -/* float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); + float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); float *Q = computeQ(tempMatrix, m*K, K, tau, m); float *R = getR(tempMatrix, m*K); cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, m*K, R, m*K, 0.0, tempMatrix, m*K); free(Q); - // printMatrix(tempMatrix, m, n); - // printf("\n\n\n\n"); Q = tileToColumn(A_orig, m*n*K*K, m, n, K); for(i = 0; i < m * n * K * K; i++) { @@ -922,15 +802,16 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], tempMatrix[i]); } + printf("Checked for correctness\n"); free(tempMatrix); free(Q); - free(R);*/ + free(R); // cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyHostToDevice); // A = tileToColumn(A,m * n * K * K, m, n, K); // printMatrix(A, m, n); // printTileMatrix(A, m , n); - struct task* tasks = qsched_get_timers( &s , s.count ); + /* struct task* tasks = qsched_get_timers( &s , s.count ); for(i = 0; i < s.count; i++) { printf("%i 
%lli %lli %i ", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); @@ -947,7 +828,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) printf("0 0"); printf("\n"); } - free(tasks); + free(tasks);*/ cudaDeviceReset(); } @@ -956,7 +837,6 @@ void test() { int m = 2, n = 2; float* Matrix = createIdentity(m,n); - //printMatrix(Matrix, m, n); float* MatrixTile = columnToTile(Matrix, m*n*32*32, m, n); printMatrix(&MatrixTile[0], 1, 1); printf("\n \n \n"); @@ -966,7 +846,6 @@ void test() printf("\n \n \n"); printMatrix(&MatrixTile[3*32*32], 1, 1); printf("\n \n \n"); - //printTileMatrix(MatrixTile, m, n); free(Matrix); Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n, 32); printMatrix(Matrix, m, n); @@ -988,11 +867,9 @@ __global__ void SGEQRF_test(float* cornerTile) for(i = threadIdx.x; i < 32*32; i+=blockDim.x) { - // printf("Copying value %i\n", i); tile[i] = cornerTile[i]; } - // printf("blockDim.x = %i\n", blockDim.x); __syncthreads(); if(threadIdx.x < 32) SGEQRF(tile, 32, tau, 0, 1); @@ -1029,7 +906,6 @@ __global__ void SLARFT_test(float* cornerTile, float* rowTile) SGEQRF(tile, 32, tau, 0, 2); __syncthreads(); -// if(threadIdx.x < 32) SLARFT(tile, tile2, 32, 1, 0, tau, 2); __syncthreads(); for(i = threadIdx.x; i < 32*32; i+=blockDim.x) @@ -1165,12 +1041,6 @@ int main ( int argc , char *argv[] ) { int c, nr_threads=128; int M = 4, N = 4, runs = 1, K = 32; - /* Get the number of threads. */ - //#pragma omp parallel shared(nr_threads) -// { -// if ( omp_get_thread_num() == 0 ) -// nr_threads = omp_get_num_threads(); -// } /* Parse the options */ while ( ( c = getopt( argc , argv , "m:n:k:r:t:D" ) ) != -1 ) @@ -1194,7 +1064,6 @@ int main ( int argc , char *argv[] ) { case 't': if ( sscanf( optarg , "%d" , &nr_threads ) != 1 ) error( "Error parsing number of threads." ); - //omp_set_num_threads( nr_threads ); break; case 'D': runTests(); @@ -1206,9 +1075,6 @@ int main ( int argc , char *argv[] ) { exit( EXIT_FAILURE ); } - /* Dump arguments. */ - // message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." 
, - // 32*M , 32*N , nr_threads , runs ); test_qr( M , N , K , nr_threads , runs ); diff --git a/src/CUDACompile.sh b/src/CUDACompile.sh index 0a3a0bf..085086c 100755 --- a/src/CUDACompile.sh +++ b/src/CUDACompile.sh @@ -1,7 +1,7 @@ #!/bin/bash FLAGS2="-Xcompiler=-fsanitize=address -Xcompiler=-fno-omit-frame-pointer" DEBUG_FLAGS="-G -DDEBUG_GPU" -FLAGS="-O3 -g -G -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8" +FLAGS="-O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8" # -DGPU_locks -Xptxas -dlcm=cg -Xptxas="-v"" # -DNO_LOADS @@ -36,3 +36,7 @@ cd ../examples /home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_2.cu -lprofiler /home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_2.o ../src/.libs/libquicksched_cuda.a -o test_bh_2 -lprofiler + +/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_3.cu -lprofiler + +/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_3.o ../src/.libs/libquicksched_cuda.a -o test_bh_3 -lprofiler diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu index 853ed2b..d42e785 100644 --- a/src/cuda_queue.cu +++ b/src/cuda_queue.cu @@ -333,7 +333,7 @@ __device__ int cuda_locktask ( int tid ) { __device__ void cuda_done ( struct task *t ) { int k; - struct task *t2; +// struct task *t2; /* Release this task's locks. */ for ( k = 0 ; k < t->nr_locks ; k++ ) @@ -437,8 +437,6 @@ __device__ int get_best_task(struct queue_cuda *q ) int ind1, ind2, tid1 = -1, tid2 = -1; /* Don't even try... */ - // if ( q->rec_count == q->count ) - // return -1; /* Get the index of the next task. */ ind1 = atomicAdd( &q->first , 1 ); @@ -483,7 +481,6 @@ __device__ int get_best_task(struct queue_cuda *q ) q->data[ind2] = -1; atomicAdd((int*) &tot_num_tasks, -1); cuda_queue_puttask( q, tid2); -// atomicAdd((int*)&q->nr_avail_tasks, 1); return tid1; }else @@ -492,7 +489,6 @@ __device__ int get_best_task(struct queue_cuda *q ) q->data[ind2] = -1; atomicAdd((int*) &tot_num_tasks, -1); cuda_queue_puttask( q, tid1); - // atomicAdd((int*)&q->nr_avail_tasks, 1); return tid2; } @@ -507,7 +503,6 @@ __device__ int runner_cuda_gettask_priority ( struct queue_cuda *q ) { atomicAdd((int*)&q->nr_avail_tasks, 1); return -1; } -// atomicAdd((int*)&q->nr_avail_tasks, -1); tid = get_best_task(q); /* Put this task into the recycling queue, if needed. */ @@ -534,13 +529,6 @@ __global__ void qsched_device_kernel ( ) /* Pull a task from the queues*/ while ( 1 ) { __syncthreads(); - /*if(threadIdx.x == 0 && blockIdx.x == 0) - { - printf("%i %i %i \n", cuda_queues[0].nr_avail_tasks, cuda_queues[0].data[0], cuda_queues[0].data[1]); - break; - } - else - break;*/ if ( threadIdx.x == 0 ) { tid = -1; /* Highest priority queue, holds the unload tasks. */ @@ -553,6 +541,16 @@ __global__ void qsched_device_kernel ( ) } #endif +#ifndef NO_LOADS + /* Low priority queue, contains the load tasks. 
*/ + if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 && blockIdx.x < 12 ) + { + TIMER_TIC + tid = runner_cuda_gettask ( &cuda_queues[1]); + TIMER_TOC(timers_load_q) + } +#endif + /* Middle priority queue, contains user-specifed tasks. */ #ifdef PRIQ if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 ) @@ -568,15 +566,6 @@ __global__ void qsched_device_kernel ( ) tid = runner_cuda_gettask ( &cuda_queues[0]); TIMER_TOC(timers_queue) } -#endif -#ifndef NO_LOADS - /* Low priority queue, contains the load tasks. */ - if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 /*&& blockIdx.x < 12*/ ) - { - TIMER_TIC - tid = runner_cuda_gettask ( &cuda_queues[1]); - TIMER_TOC(timers_load_q) - } #endif } @@ -667,21 +656,9 @@ __global__ void qsched_device_kernel ( ) /* Reset values.*/ - /*printf("cuda_queues[0].first = %i, nr_avail_tasks = %i, tot_num_tasks = %i, data[first] = %i\n", cuda_queues[0].first, cuda_queues[0].nr_avail_tasks ,tot_num_tasks, cuda_queues[0].data[cuda_queues[0].first] ); - printf("cuda_queues[0].last = %i, data[last-1] = %i\n", cuda_queues[0].last ,cuda_queues[0].data[cuda_queues[0].last] ); - for(tid = cuda_queues[0].first; tid < cuda_queues[0].first + cuda_queues[0].nr_avail_tasks; tid++) - { - printf("task =%i, type = %i\n", cuda_queues[0].data[tid%cuda_queue_size], tasks_cuda[cuda_queues[0].data[tid%cuda_queue_size]].type ); - } */ cuda_queues[0].first = 0; cuda_queues[0].last = 0; cuda_queues[0].rec_count = 0; - /*cuda_queues[1].first = 0; - cuda_queues[1].last = cuda_queues[1].count; - cuda_queues[1].rec_count = 0; - cuda_queues[2].first = 0; - cuda_queues[2].last = 0; - cuda_queues[2].rec_count = 0;*/ } //TODO @@ -736,12 +713,6 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in utask = qsched_addtask( s , type_ghost, task_flag_none, NULL, 0 , 0 ); qsched_adduse(s, task, ID); s->res[ID].utask = utask; - // if(parent >= 0) -// { - /* Create dependecy to parent. 
*/ - // qsched_addunlock(s, task, s->res[parent].task ); - /// qsched_addunlock(s, s->res[parent].utask, utask ); - // } for(i = sorted[ID]; i < sorted[ID+1]; i++) { qsched_create_loads(s, res[i], s->res[res[i]].size, sorted[res[i]+1]-sorted[res[i]], ID, res, sorted); @@ -756,12 +727,6 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in { s->res[res[j]].task = task; } - /* If it has a parent then set the parents ghost task to be dependent on this.*/ - // if(parent >= 0) - // { - // qsched_addunlock(s, task, s->res[parent].task ); - // qsched_addunlock(s, s->res[parent].utask, utask); - // } } } @@ -781,7 +746,6 @@ int transitive_use_unlocks(struct qsched *s, struct task *t, int res, int depth) } if(depth >= MAX_DEPTH) { -// printf("Max Depth reached\n"); return 0; } for(i = 0; i < t->nr_unlocks; i++) @@ -809,7 +773,6 @@ int transitive_use_locks(struct qsched *s, int tid, int res, int depth) } if(depth >= MAX_DEPTH) { -// printf("Max Depth reached\n"); return 0; } for(i = tid-1; i >= 0; i--) @@ -830,20 +793,18 @@ int transitive_use_locks(struct qsched *s, int tid, int res, int depth) void qsched_prepare_loads ( struct qsched *s ) { -int i, task, unload, j , k , unlocked = 0; +int i, j , k; struct task *t; int *sorted, lastindex; int *res, *res_data; -//printf("tasks[63].wait_init = %i\n", s->tasks[63].wait_init); if(s->res[0].task != -1) { printf("Tasks already initialised, not redoing load/unload tasks"); return; } -double itpms = 1000.0 / CPU_TPS; -ticks tic, toc_run, toc2 ; +ticks tic, toc2 ; tic = getticks(); /* Expand the deps array so we can add new dependencies in place. */ @@ -947,15 +908,12 @@ for(i = 0; i < s->count_res; i++) res_data[i] = ((char *)s->res[res[i]].data) - (char*)s->res[0].data; if(res_data[i] < mini) mini = res_data[i]; -// printf("%i ", res_data[i]); } for(i = 0; i < s->count_res; i++) { res_data[i] -= mini; -//printf("%i ", res_data[i]); } -//printf("\n"); /* Sort the children of each parent by memory address. */ qsched_sort(res, res_data, sorted[0], minVal(res_data, sorted[0]), maxVal(res_data, sorted[0])); @@ -970,7 +928,6 @@ for(i = 1; i < s->count_res; i++) qsched_sort(&res[sorted[s->count_res-1]], &res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1], minVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1]), maxVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1])); /*res now contains an array of indices, first sorted by parent, then memory address of data. */ -int size=0; if(sorted[0] != 0) { /* Check no overlapping resources.*/ @@ -1008,9 +965,10 @@ for(i = sorted[s->count_res]; i >= 0; i-- ) sorted[i] = sorted[i-1]; } -toc_run = getticks(); tic = getticks(); + + /* If nothing overlaps create tasks.*/ for( i = sorted[s->count_res]; i < s->count_res; i++ ) { @@ -1024,9 +982,7 @@ for( i = sorted[s->count_res]; i < s->count_res; i++ ) } -toc_run = getticks(); tic = getticks(); -// message( "Creating load tasks took %.3f ms" , ((double)(toc_run - tic)) * itpms ); /* Check all resources have load tasks - if not give parents (recursively)*/ @@ -1078,10 +1034,6 @@ for(i = 0; i < s->count; i++) for(j = 0; j < t->nr_uses; j++) { - t->unlocks[t->nr_unlocks] = s->res[t->uses[j]].utask; - s->tasks[s->res[t->uses[j]].utask].wait_init +=1 ; - deps_new_key[(t->unlocks - deps_new) + t->nr_unlocks] = i; - t->nr_unlocks++; if(num_uses[t->uses[j]] == size_uses[t->uses[j]]) { /* Stretch. 
*/ @@ -1131,7 +1083,7 @@ int *tasks_assigned = (int*)calloc(sizeof(int), s->count); for(i = 0; i < s->count_res; i++ ) { int ID = res[i]; - int size = s->res[ID].size; + // int size = s->res[ID].size; int numChildren = sorted[ID+1] - sorted[ID]; int parent = s->res[ID].parent; struct res *resource = &s->res[ res[i] ]; @@ -1226,8 +1178,6 @@ for(i = 0; i < s->count_res; i++ ) { s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ sorted[ ID ] +j ] ][k]; s->tasks[ usage_list[ res[ sorted[ ID ] +j ] ][k] ].wait_init += 1; -// if(usage_list[res[sorted[ID]+j]][k] == 63) -// printf("Increased wait_init of task 63, wait_init now %i\n", s->tasks[63].wait_init); deps_new_key[s->count_deps] = resource->task; s->tasks[resource->task].nr_unlocks += 1; s->count_deps += 1; @@ -1263,11 +1213,7 @@ for(i = 0; i < s->count_res; i++ ) s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = s->res[parent].task; deps_new_key[s->count_deps] = resource->task; s->tasks[s->res[parent].task].wait_init += 1; - //if(s->res[parent].task == 63) -// printf("task %i should unlock task 63, %i position.\n", resource->task, s->tasks[resource->task].nr_unlocks); s->tasks[resource->task].nr_unlocks += 1; -// if(s->res[parent].task == 63) -// printf("s->tasks[resource->task].nr_unlocks = %i\n", s->tasks[resource->task].nr_unlocks); s->count_deps += 1; } @@ -1276,62 +1222,19 @@ for(i = 0; i < s->count_res; i++ ) s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ i ] ][k]; deps_new_key[s->count_deps] = resource->task; s->tasks[usage_list[ res[ i ] ][k]].wait_init += 1; -// if(usage_list[res[i]][k] == 63) -// printf("task %i should unlock task 63\n", resource->task); s->tasks[resource->task].nr_unlocks += 1; -// s->count_deps += 1; } if(!tasks_assigned[resource->task]) { s->count_deps += num_deps[resource->task]; tasks_assigned[resource->task] = 1; } - //r(k = 0; k < s->tasks[66].nr_unlocks; k++) -// printf("%i ", s->tasks[66].unlocks[k]); -// printf("\n"); -// printf("s->tasks[66].unlocks - deps_new = %i\n", s->tasks[66].unlocks - deps_new); } -//printf("s->deps = %i, deps_new = %i\n", s->deps, deps_new); free(s->deps); free(s->deps_key); s->deps = deps_new; s->deps_key = deps_new_key; s->flags &= ~qsched_flag_dirty; toc2 += getticks() - tic; - // printf("s->tasks[50].unlocks - s->deps = %i\n", s->tasks[50].unlocks - s->deps); - -/*printf("Dependencies leading from task 50's\n"); -for(i = 0; i < s->tasks[50].nr_unlocks + 5; i++) - printf("%i ", s->tasks[50].unlocks[i]); -printf("\n"); - -printf("s->tasks[50].nr_unlocks = %i, num_deps[50] = %i\n", s->tasks[50].nr_unlocks, num_deps[50]); - -free(num_deps); -printf("Printing dependencies here:\n"); -for(i = 0; i < s->count; i++) -{*/ - // printf("Task ID: %i, type = %i: ", i, s->tasks[i].type); - /*if(s->tasks[i].type < 100) - { - for(k = 0; k < s->count_res; k++) - { - if(s->res[k].task == i || s->res[k].utask == i ) - { - printf(" resource ID = %i", k); - if(s->res[k].parent != -1) - printf(" parent resource ID = %i", s->res[k].parent); - printf(": "); - } - } - }*/ - /* for(k = 0; k < s->tasks[i].nr_unlocks; k++) - printf("%i ", s->tasks[i].unlocks[k]); - printf("\n"); -} - printf("----------------\n");*/ -toc_run = getticks(); - //message( "Setting up dependencies took %.3f ms" , toc2 * itpms ); -//error("Got to here"); } @@ -1349,7 +1252,6 @@ for(i = 0; i < s->count_res; i++) continue; if(s->res[i].task >= 0) continue; -// cudaMalloc( &s->res[ i ].gpu_data, s->res[i].size ); task = 
qsched_addtask( s , type_load , task_flag_none , &i , sizeof(int) , 0 ); s->res[i].task = task; unload = qsched_addtask( s , type_unload, task_flag_none , &i, sizeof(int), 0 ); @@ -1383,8 +1285,6 @@ for(i = 0; i < s->count_res; i++) } } - // qsched_adduse(s, task , i); - // qsched_adduse(s, unload , i); } @@ -1410,7 +1310,6 @@ int *data; char *sdata; /* Lock the sched. */ - //lock_lock( &s->lock ); /* Get a pointer to the tasks, set the count. */ tasks = s->tasks; @@ -1448,15 +1347,10 @@ char *sdata; } /* All cleaned-up now! */ - //s->flags &= ~qsched_flag_dirty; } - double itpms = 1000.0 / CPU_TPS; -ticks tic, toc_run ; - tic = getticks(); -//qsched_ensure(s, s->count + 2*s->count_res, s->count_res, s->count_deps + 2*s->count_res, s->count_locks, s->count_uses + 2*s->count_res, s->size_data ); for ( k = 0 ; k < count ; k++ ) { t = &tasks[k]; @@ -1470,8 +1364,6 @@ ticks tic, toc_run ; #ifndef NO_LOADS qsched_prepare_loads(s); #endif -toc_run = getticks(); -// message( "prepare_loads took %.3f ms" , ((double)(toc_run - tic)) * itpms ); @@ -1480,41 +1372,18 @@ toc_run = getticks(); tasks = s->tasks; count = s->count; - /* If the sched is dirty... */ - if ( 1 /*s->flags & qsched_flag_dirty*/ ) { - - /* Do the sorts in parallel, if possible. */ - // #pragma omp parallel - //{ - - /* Sort the unlocks. */ - //#pragma omp single nowait - // qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 ); + qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 ); - /* Sort the locks. */ - // #pragma omp single nowait - qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 ); - - /* Sort the uses. */ - // #pragma omp single nowait - // qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 ); - - // } /* Run throught the tasks and link the locks and unlocks. */ -// tasks[0].unlocks = s->deps; tasks[0].locks = s->locks; - // tasks[0].uses = s->uses; for ( k = 1 ; k < count ; k++ ) { - // tasks[k].unlocks = &tasks[k-1].unlocks[ tasks[k-1].nr_unlocks ]; tasks[k].locks = &tasks[k-1].locks[ tasks[k-1].nr_locks ]; - // tasks[k].uses = &tasks[k-1].uses[ tasks[k-1].nr_uses ]; } /* All cleaned-up now! */ s->flags &= ~qsched_flag_dirty; - } /* Init the queues. */ for ( k = 0 ; k < s->nr_queues ; k++ ) queue_init( &s->queues[k] , count ); @@ -1525,24 +1394,11 @@ toc_run = getticks(); tasks[k].wait = 0; } - /* Run through the tasks and set the waits... */ -/* for ( k = 0 ; k < count ; k++ ) { - t = &tasks[k]; - if ( !( t->flags & task_flag_skip ) ) - for ( j = 0 ; j < t->nr_unlocks ; j++ ) - { - tasks[ t->unlocks[j] ].wait += 1; - } - }*/ for( k = 0; k < count; k++ ) { t = &tasks[k]; t->wait = t->wait_init; -// if(t->wait != t->wait_init) - // { - // printf("Task ID %i has wait %i and wait_init %i and type %i\n", k, t->wait, t->wait_init, t->type); - // } } /* Sort the tasks topologically. 
*/ @@ -1566,23 +1422,11 @@ toc_run = getticks(); } - /* Print all dependencies */ - // for(i = 0; i < count; i++ ) -// { -// printf("Task ID: %i, type=%i, wait= %i, ", tid[i], tasks[tid[i]].type, tasks[tid[i]].wait); - // printf("Task ID: %i, type=%i, wait= %i, ", i, tasks[i].type, tasks[i].wait); -// for(j = 0; j < tasks[tid[i]].nr_unlocks; j++) -// { -// printf("%i ", tasks[tid[i]].unlocks[j]); -// } -// printf("\n"); -// } if ( k < count ) { for(i = 0; i < count; i++) if(tasks[tid[i]].wait != 0) printf("Task %i has nonzero wait, wait is %i, wait_init is %i\n", tid[i], tasks[tid[i]].wait, tasks[tid[i]].wait_init); - //printf("tasks[63].wait = %i, tasks[63].wait_init = %i\n", tasks[63].wait, tasks[63].wait_init); printf("k = %i, count = %i\n", k, count); printf("tid[k-1] = %i, tid[k-1].wait = %i, initial wait = %i\n", tid[k], tasks[tid[k]].wait, tasks[tid[k]].wait_init); @@ -1613,8 +1457,6 @@ toc_run = getticks(); else t->weight = t->cost + maxweight - 1000; -// if(t->weight < -100) - // printf("%i\n", t->weight); } #endif #ifndef PRIQ @@ -1622,10 +1464,6 @@ toc_run = getticks(); { t = &tasks[k]; t->wait = t->wait_init; -// if(t->wait != t->wait_init) - // { - // printf("Task ID %i has wait %i and wait_init %i and type %i\n", k, t->wait, t->wait_init, t->type); - // } } #endif @@ -1712,82 +1550,7 @@ int *data2; struct queue_cuda queues[ cuda_numqueues ]; -/*#ifdef PRIQ -qsize = max(2*s->count, 512); - if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) - error("Failed to copy queue size to the device."); - - if ( ( data = (int *)malloc( sizeof(int) * qsize ) ) == NULL ) - error("Failed to allocate data buffer."); - if( ( data2 = (int *) malloc( sizeof(int) * qsize ) ) == NULL ) - error("Failed to allocate data2 buffer."); - queues[0].count = 0; - - for(i = 0; i < s->count; i++) - { - if(s->tasks[i].wait == 0) - { - if(s->tasks[i].type != type_load) - { - printf("i = %i\n", i); - for(k = 0; k < s->count; k++) - { - for(j = 0; j < s->tasks[k].nr_unlocks; j++) - { - if(s->tasks[k].unlocks[j] == i) - printf("Should be unlocked by %i\n", k); - } - for(j = 0; j < s->tasks[k].nr_uses; j++) - { - if(s->tasks[k].uses[j] == 256) - printf("Task %i uses resource 256\n", k); - } - for(j = 0; j < s->tasks[k].nr_locks; j++) - { - if(s->tasks[k].locks[j] == 256) - printf("Task %i locks resource 256\n", k); - } - } - for(k = 0; k < s->count_res; k++) - { - if(s->res[k].utask == i) - printf("resource = %i\n", k); - } - printf("%i\n", i); - } - data[queues[0].count++] = i; - data2[queues[0].count-1] = -temp[i].weight; - } - } - qsched_sort(data, data2, queues[0].count, minVal(data2,queues[0].count), maxVal(data2, queues[0].count)); - free(data2); - for ( i = queues[0].count ; i < qsize ; i++ ) - data[i] = -1; - - if ( cudaMalloc( &queues[0].data , sizeof(int) * qsize ) != cudaSuccess ) - error("Failed to allocate queue data on the device."); - if ( cudaMemcpy( (void *)queues[0].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) - error("Failed to copy queue data pointer to the device"); - - for ( k = 0; k < qsize; k++ ) - data[k] = -1; - - if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * qsize ) != cudaSuccess ) - error("Failed to allocate queue data on the device."); - if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) - error("Failed to copy queue data pointer to the device"); - - - queues[0].first = 0; - queues[0].last = queues[0].count; - 
queues[0].nr_avail_tasks = queues[0].last; - queues[0].rec_count = 0; - queues[0].count = s->count; - - if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) - error("Failed to copy the queues to the device"); -#else*/ - qsize = max(2*s->count / nr_queues, 256); + qsize = max(s->count, 256); if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy queue size to the device."); @@ -1826,17 +1589,17 @@ qsize = max(2*s->count, 512); data[k] = -1; /* Allocate and copy the recyling data. */ - if ( cudaMalloc( &queues[1].rec_data , sizeof(int) * qsize ) != cudaSuccess ) + if ( cudaMalloc( &queues[1].rec_data , sizeof(int) * s->count*1.2 ) != cudaSuccess ) error("Failed to allocate queue data on the device."); if ( cudaMemcpy( (void *)queues[1].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy queue data pointer to the device"); /* Allocate and copy the recyling data. */ - if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * qsize ) != cudaSuccess ) + if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * s->count*1.2 ) != cudaSuccess ) error("Failed to allocate queue data on the device."); if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy queue data pointer to the device"); /* Allocate and copy the recyling data. */ - if ( cudaMalloc( &queues[2].rec_data , sizeof(int) * qsize ) != cudaSuccess ) + if ( cudaMalloc( &queues[2].rec_data , sizeof(int) * s->count*1.2 ) != cudaSuccess ) error("Failed to allocate queue data on the device."); if ( cudaMemcpy( (void *)queues[2].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy queue data pointer to the device"); diff --git a/src/qsched.h b/src/qsched.h index dfa9cde..97f58fe 100644 --- a/src/qsched.h +++ b/src/qsched.h @@ -163,8 +163,9 @@ struct qsched { /* Function prototypes. */ /* Internal functions. */ +void qsched_quicksort ( int *data , int *ind , int N , int min , int max ); void qsched_sort ( int *data , int *ind , int N , int min , int max ); -void qsched_sort_rec ( int *data , int *ind , int N , int min , int max ); +//void qsched_sort_rec ( int *data , int *ind , int N , int min , int max ); struct task *qsched_gettask ( struct qsched *s , int qid ); void qsched_done ( struct qsched *s , struct task *t ); void *qsched_getdata( struct qsched *s , struct task *t ); -- GitLab
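
For reference, the wait_init bookkeeping and the topological ordering that qsched_prepare performs in the hunks above follow a Kahn-style pattern: every task starts with wait = wait_init (the number of tasks that unlock it), zero-wait tasks seed the order, and each placed task decrements the wait of everything it unlocks; if fewer than count tasks end up placed, a dependency cycle remains, which is what the nonzero-wait diagnostic reports. A minimal stand-alone sketch, with toy_task/toy_topsort as hypothetical stand-ins for the real struct task and scheduler code:

#include <stdio.h>

struct toy_task {
    int nr_unlocks;   /* how many tasks this task unlocks */
    int *unlocks;     /* indices of the tasks it unlocks  */
    int wait_init;    /* number of incoming dependencies  */
    int wait;
};

/* Returns how many tasks were placed in tid; a value < count means a cycle. */
static int toy_topsort ( struct toy_task *tasks , int count , int *tid ) {
    int j, k, last = 0;
    for ( k = 0 ; k < count ; k++ )
        tasks[k].wait = tasks[k].wait_init;
    for ( k = 0 ; k < count ; k++ )          /* seed with ready tasks */
        if ( tasks[k].wait == 0 )
            tid[ last++ ] = k;
    for ( k = 0 ; k < last ; k++ )           /* release what each placed task unlocks */
        for ( j = 0 ; j < tasks[ tid[k] ].nr_unlocks ; j++ )
            if ( --tasks[ tasks[ tid[k] ].unlocks[j] ].wait == 0 )
                tid[ last++ ] = tasks[ tid[k] ].unlocks[j];
    return last;
}

int main ( void ) {
    int u0[2] = { 1 , 2 }, u1[1] = { 2 };
    struct toy_task t[3] = {
        { 2 , u0 , 0 , 0 },   /* task 0 unlocks tasks 1 and 2 */
        { 1 , u1 , 1 , 0 },   /* task 1 unlocks task 2        */
        { 0 , NULL , 2 , 0 }  /* task 2 unlocks nothing       */
        };
    int tid[3], k, n = toy_topsort( t , 3 , tid );
    for ( k = 0 ; k < n ; k++ )
        printf( "%i " , tid[k] );
    printf( "\n" );           /* prints "0 1 2" */
    return 0;
}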
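
The device-queue setup above fills a host buffer with the indices of the ready tasks, pads the remaining slots of the qsize-sized buffer with -1, and then uploads it with cudaMalloc/cudaMemcpy before setting first and last. Below is a minimal sketch of that upload pattern only, assuming a simplified toy_queue in place of struct queue_cuda (the real code also uploads the recycling buffer and copies the queue structs to a device symbol):

#include <stdio.h>
#include <cuda_runtime.h>

struct toy_queue { int first , last , count ; int *data ; };

int main ( void ) {
    const int qsize = 256;          /* the patch uses max( s->count , 256 ) */
    int host[ 256 ];
    struct toy_queue q = { 0 , 0 , 0 , NULL };
    int i;

    for ( i = 0 ; i < 10 ; i++ )    /* pretend tasks 0..9 are ready to run */
        host[ q.count++ ] = i;
    for ( i = q.count ; i < qsize ; i++ )
        host[ i ] = -1;             /* unused slots are flagged with -1 */
    q.first = 0;
    q.last = q.count;

    if ( cudaMalloc( (void **)&q.data , sizeof(int) * qsize ) != cudaSuccess ) {
        fprintf( stderr , "Failed to allocate queue data on the device.\n" );
        return 1;
    }
    if ( cudaMemcpy( q.data , host , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) {
        fprintf( stderr , "Failed to copy queue data to the device.\n" );
        return 1;
    }

    cudaFree( q.data );
    return 0;
}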
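
The qsched.h hunk adds a prototype for qsched_quicksort next to qsched_sort; both take the array to reorder, a parallel array of integer keys, and the minimum and maximum key values (see the minVal/maxVal call sites above). The partitioning scheme below is an assumption made for illustration only — a midpoint-pivot quicksort over the key range that swaps keys and data together — and is not code taken from this patch:

/* Sort data[0..N-1] by the keys in ind[0..N-1]; min and max bound the keys. */
void toy_quicksort ( int *data , int *ind , int N , int min , int max ) {
    int pivot = ( min + max ) / 2, i = 0, j = N - 1, t;
    if ( N < 2 || min >= max )
        return;
    while ( i <= j ) {                        /* partition around the pivot key */
        while ( i <= j && ind[i] <= pivot ) i++;
        while ( i <= j && ind[j] > pivot ) j--;
        if ( i < j ) {
            t = ind[i];  ind[i] = ind[j];  ind[j] = t;
            t = data[i]; data[i] = data[j]; data[j] = t;
        }
    }
    toy_quicksort( data , ind , i , min , pivot );                  /* keys <= pivot */
    toy_quicksort( &data[i] , &ind[i] , N - i , pivot + 1 , max );  /* keys > pivot  */
}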