diff --git a/examples/test_bh_2.cu b/examples/test_bh_2.cu index 259a31f7d1a5fd8aa732f1d274f86aca523bec45..7981f6b9877e8445dd4f7c192672d00dd4b55a96 100644 --- a/examples/test_bh_2.cu +++ b/examples/test_bh_2.cu @@ -90,7 +90,9 @@ float4 *parts_a_m_host; double2 *com_xy_host; double *com_z_host; float *com_mass_host; - +double2 *parts_pos_xy_temp; +double *parts_pos_z_temp; +float4 *parts_a_m_temp; @@ -113,9 +115,9 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c /* Loop over cell i.*/ for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) { - xi[0] = parts_xy[i].x; - xi[1] = parts_xy[i].y; - xi[2] = parts_z[i]; + xi[0] = parts_pos_xy[i].x; + xi[1] = parts_pos_xy[i].y; + xi[2] = parts_pos_z[i]; for(k = 0; k < 3; k++) { ai[k] = 0.0f; } @@ -154,9 +156,9 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c } /*Loop over cell j. */ for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) { - xi[0] = parts_xy[i].x; - xi[1] = parts_xy[i].y; - xi[2] = parts_z[i]; + xi[0] = parts_pos_xy[i].x; + xi[1] = parts_pos_xy[i].y; + xi[2] = parts_pos_z[i]; for(k = 0; k < 3; k++) { ai[k] = 0.0f; } @@ -290,23 +292,40 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf struct cell *cp ,*cps; + if(leaf->split) + { + printf("Leaf split = 1, oh dear."); + asm("trap;"); + } +if(ci->split > 1) + { + printf("Cell %i had split > 1\n", ci - cells); + asm("trap;"); + } + if(cj->split > 1) + { + printf("cell %i had split > 1\n", cj - cells); + asm("trap;"); + } + for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) { if(is_inside(leaf, cp)) break; } if(are_neighbours_different_size(cp, cj)) { - for(cps = &cells[cj->firstchild]; cps != &cells[ci->sibling]; cps = &cells[cps->sibling]) { + for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) { if(are_neighbours(cp, cps)) { if(cp->split && cps->split) { iact_pair_pc(cp, cps, leaf); } } else { make_interact_pc(leaf, cps); + __syncthreads(); } } } - + __syncthreads(); } /** @@ -320,6 +339,17 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) { struct cell *cp, *cps; + if(leaf->split) + { + printf("Leaf split = 1, oh dear."); + asm("trap;"); + } + if(c->split > 1) + { + printf("Cell had split > 1\n"); + asm("trap;"); + } + /* Find the subcell of c the leaf is in.*/ for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) { if(is_inside(leaf, cp)) break; @@ -606,23 +636,23 @@ void cell_split(int c, struct qsched *s) { { if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[c].parts], 0) != cudaSuccess ) error("Failed to get host device pointer."); - printf("tempxy = %p\n", temp_xy); +// printf("tempxy = %p\n", temp_xy); cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy, - sizeof(double2) * cell_pool[c].count, parts_pos_xy + cell_pool[c].parts); + sizeof(double2) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts); } if(cell_pool[c].resz == qsched_res_none) { if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess ) error("Failed to get host device pointer."); cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z, - sizeof(double) * cell_pool[c].count, parts_pos_z + cell_pool[c].parts); + sizeof(double) * cell_pool[c].count, parts_pos_z_temp + cell_pool[c].parts); } if(cell_pool[c].resm == qsched_res_none) { if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess ) error("Failed to get host device pointer."); cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m, - sizeof(float4) * cell_pool[c].count, parts_a_m + cell_pool[c].parts); + sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts); } // error("Cell has no resource");*///TODO @@ -731,15 +761,15 @@ void cell_split(int c, struct qsched *s) { if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) error("Failed to get host device pointer."); cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy, - sizeof(double2) * cell_pool[progenitors[k]].count, parts_pos_xy + cell_pool[progenitors[k]].parts); + sizeof(double2) * cell_pool[progenitors[k]].count, parts_pos_xy_temp + cell_pool[progenitors[k]].parts); if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) error("Failed to get host device pointer."); cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z, - sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z + cell_pool[progenitors[k]].parts); + sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z_temp + cell_pool[progenitors[k]].parts); if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess ) error("Failed to get host device pointer."); cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m, - sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m + cell_pool[progenitors[k]].parts); + sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m_temp + cell_pool[progenitors[k]].parts); } /* Find the first non-empty progenitor */ @@ -784,6 +814,9 @@ void cell_split(int c, struct qsched *s) { int data[2] = {root, c}; int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data, 2 * sizeof(int), 1); + qsched_addlock(s, tid, cell_pool[root].res); + qsched_addlock(s, tid, cell_pool[root].resz); + qsched_addlock(s, tid, cell_pool[root].resm); qsched_addlock(s, tid, cell_pool[c].res); qsched_addlock(s, tid, cell_pool[c].resz); qsched_addlock(s, tid, cell_pool[c].resm); @@ -913,9 +946,7 @@ void test_bh(int N, int runs, char *fileName) { struct qsched s; ticks tic, toc_run, tot_setup = 0, tot_run = 0; int countMultipoles = 0, countPairs = 0, countCoMs = 0; - double2 *parts_pos_xy_temp; - double *parts_pos_z_temp; - float4 *parts_a_m_temp; + struct cell *gpu_ptr_cells; cudaFree(0); if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess) @@ -934,10 +965,10 @@ void test_bh(int N, int runs, char *fileName) { if( cudaMalloc(&parts_pos_xy_temp, sizeof(double2) * N) != cudaSuccess) error("Failed to allocate device parts array"); - printf("parts_pos_xy_temp = %p\n", parts_pos_xy_temp); + // printf("parts_pos_xy_temp = %p\n", parts_pos_xy_temp); if( cudaMemcpyToSymbol(parts_pos_xy, &parts_pos_xy_temp, sizeof(double2*), 0, cudaMemcpyHostToDevice) != cudaSuccess) error("Failed to set device symbol for parts array"); - printf("parts_pos_xy = %p\n", parts_pos_xy); +// printf("parts_pos_xy = %p\n", parts_pos_xy); if( cudaMalloc(&parts_pos_z_temp, sizeof(double) * N) != cudaSuccess) error("Failed to allocate device parts array"); if( cudaMemcpyToSymbol(parts_pos_z, &parts_pos_z_temp, sizeof(double*), 0, cudaMemcpyHostToDevice) != cudaSuccess) @@ -1018,6 +1049,10 @@ void test_bh(int N, int runs, char *fileName) { message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves)); message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts); + for(k = 0; k < num_cells; k++) + if(cell_pool[k].split > 1 ) + printf("Split > 1\n"); + create_tasks(&s, root, NULL); message("total number of tasks: %i.", s.count); @@ -1033,6 +1068,56 @@ void test_bh(int N, int runs, char *fileName) { parts_a_m_host[i].z = 0.0; } + + /* Copy the cells to the device. */ + if( cudaMalloc( &gpu_ptr_cells , sizeof(struct cell) * used_cells) != cudaSuccess) + error("Failed to allocate cells on the GPU"); + if( cudaMemcpy( gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy cells to the GPU"); + if( cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell*), 0, cudaMemcpyHostToDevice) != cudaSuccess ) + error("Failed to copy cell pointer to the GPU"); + +double2 *com_temp; +double *comz_temp; +float *comm_temp; + + if(cudaMalloc( &com_temp, sizeof(double2) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); + + + if(cudaMalloc( &comz_temp, sizeof(double) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); + + if(cudaMalloc( &comm_temp, sizeof(float) * used_cells) != cudaSuccess) + error("Failed to allocate com on the GPU"); + if( cudaMemcpy( comm_temp, com_z_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess ) + error("failed to copy com to the GPU"); + if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy com pointer to the GPU"); + + /* for(i = 0; i < s.count; i++){ + int *idata = (int*)&s.data[s.tasks[i].data]; + if(s.tasks[i].type == task_type_self) + printf("Self task with data[0] = %i\n", idata[0]); + if(s.tasks[i].type == task_type_pair) { + printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]); + } + if(s.tasks[i].type == task_type_self_pc) { + printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]); + } + }*/ + + + + //Run code. printf("gpu_data = %p\n", (int*)s.res[0].gpu_data); qsched_run_CUDA( &s , func ); diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu index 9beba72c9f2fe2189f12a7ccece603762d278888..853ed2b8afa1f24f2c7909bd27c916be6e380965 100644 --- a/src/cuda_queue.cu +++ b/src/cuda_queue.cu @@ -735,16 +735,16 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in s->res[ID].task = task; utask = qsched_addtask( s , type_ghost, task_flag_none, NULL, 0 , 0 ); qsched_adduse(s, task, ID); - s->res[ID].task = utask; - if(parent >= 0) - { + s->res[ID].utask = utask; + // if(parent >= 0) +// { /* Create dependecy to parent. */ // qsched_addunlock(s, task, s->res[parent].task ); /// qsched_addunlock(s, s->res[parent].utask, utask ); - } + // } for(i = sorted[ID]; i < sorted[ID+1]; i++) { - qsched_create_loads(s, i, s->res[res[i]].size, sorted[i+1]-sorted[i], ID, res, sorted); + qsched_create_loads(s, res[i], s->res[res[i]].size, sorted[res[i]+1]-sorted[res[i]], ID, res, sorted); } }else{ task = qsched_addtask( s , type_load , task_flag_none, &ID, sizeof(int), 0 ); @@ -757,11 +757,11 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in s->res[res[j]].task = task; } /* If it has a parent then set the parents ghost task to be dependent on this.*/ - if(parent >= 0) - { + // if(parent >= 0) + // { // qsched_addunlock(s, task, s->res[parent].task ); // qsched_addunlock(s, s->res[parent].utask, utask); - } + // } } } @@ -828,156 +828,15 @@ int transitive_use_locks(struct qsched *s, int tid, int res, int depth) return 0; } -/* -* Not used. -*/ -void qsched_prepare_deps( struct qsched *s ) -{ - int **is_loaded; - int **parents; - int i, k, j, use, usek, usem; - int *num_parents; - is_loaded = (int**)malloc(sizeof(int*) * s->count); - parents = (int**)malloc(sizeof(int*) * s->count); - num_parents = (int*)malloc(sizeof(int) * s->count); - bzero(num_parents, sizeof(int)*s->count); - k = (sizeof(int)*s->count_res)/32 +1; - for(i = 0; i < s->count; i++) - { - is_loaded[i] = (int*)malloc(k); - bzero(is_loaded[i], k); - } - /* Is loaded[i][k] gives the set of k*32 resources for task i*/ - - /* Reset the waits to 0... */ - for( k = 0; k < s->count; k++ ) - { - s->tasks[k].wait = 0; - } - - /* Run through the tasks and set the waits... */ - for ( k = 0 ; k < s->count ; k++ ) { - struct task *t = &s->tasks[k]; - if ( !( t->flags & task_flag_skip ) ) - for ( j = 0 ; j < t->nr_unlocks ; j++ ) - s->tasks[ t->unlocks[j] ].wait += 1; - } - - /* Sort the tasks topologically. */ - int *tid = (int *)malloc( sizeof(int) * s->count ); - for ( j = 0 , k = 0 ; k < s->count ; k++ ) - if ( s->tasks[k].wait == 0 ) { - tid[j] = k; - j += 1; - } - for ( k = 0 ; k < j ; k++ ) { - struct task *t = &s->tasks[ tid[k] ]; - for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ ) - if ( ( s->tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) { - tid[j] = t->unlocks[kk]; - j += 1; - } - } - if ( k < s->count ) - { - error( "Circular dependencies detected." ); - } - int max_parents = 0; - for(i = s->count-1; i >= 0; i--) - { - for(j = 0; j < s->tasks[i].nr_unlocks; j++) - { - num_parents[s->tasks[i].unlocks[j]]++; - } - } - for(i = 0; i < s->count; i++) - { - if(num_parents[i] > 0) - parents[i] = (int*)calloc(num_parents[i],sizeof(int)); - else - parents[i] = NULL; - - if(num_parents[i] > max_parents) - { - max_parents = num_parents[i]; - } - - num_parents[i] = 0; - } - - for(i = 0; i < s->count; i++) - { - if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload) - continue; - for(k = 0; k < s->tasks[i].nr_uses; k++) - { - use = s->tasks[i].uses[k]; - usek = use >> 5; // use / 32; - usem = use & 31; // use % 32. - - if((is_loaded[i][usek] & (1 << (31-usem))) == 0 ) - { - qsched_addunlock(s, s->res[use].task , i ) ; - is_loaded[i][usek] |= (1 <<(31-usem)); - } - } - for(k = 0; k < s->tasks[i].nr_unlocks; k++) - { - if(s->tasks[s->tasks[i].unlocks[k]].type == type_load || - s->tasks[s->tasks[i].unlocks[k]].type == type_unload ) - continue; - for(j = 0; j < s->count_res/32 +1; j++) - { - is_loaded[s->tasks[i].unlocks[k]][j] |= is_loaded[i][j]; - } - parents[s->tasks[i].unlocks[k]][num_parents[s->tasks[i].unlocks[k]]] = i; - num_parents[s->tasks[i].unlocks[k]] = num_parents[s->tasks[i].unlocks[k]] + 1; - } - } - max_parents = 0; - for(i = 0; i < s->count; i++) - { - if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload) - continue; - if(num_parents[i] > max_parents) - { - max_parents = num_parents[i]; - } - bzero(is_loaded[i], k); - } - for(i = s->count-1; i >= 0; i--) - { - if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload) - continue; - for(k = 0; k < s->tasks[i].nr_uses; k++) - { - use = s->tasks[i].uses[k]; - usek = use >> 5; // use / 32; - usem = use & 31; // use % 32. - if((is_loaded[i][usek] & (1 << (31-usem))) == 0 ) - { - qsched_addunlock(s, i, s->res[use].utask ); - is_loaded[i][usek] |= (1 << (31-usem)); - } - } - for(k = 0; k < num_parents[i]; k++) - { - for(j = 0; j < s->count_res/32 +1; j++) - { - is_loaded[parents[i][k]][j] |= is_loaded[i][j]; - } - } - } - -} - void qsched_prepare_loads ( struct qsched *s ) { -int i, task, unload, j , k , x, unlocked = 0; +int i, task, unload, j , k , unlocked = 0; struct task *t; int *sorted, lastindex; int *res, *res_data; +//printf("tasks[63].wait_init = %i\n", s->tasks[63].wait_init); + if(s->res[0].task != -1) { printf("Tasks already initialised, not redoing load/unload tasks"); @@ -991,19 +850,14 @@ ticks tic, toc_run, toc2 ; qsched_task_t *deps_new, *deps_new_key; s->size_deps *= qsched_stretch; - - j = 0; -//printf("%i\n", s->tasks[94].unlocks); -// for(k = 0; k < s->tasks[94].nr_unlocks; k++) -// printf("%i ", s->tasks[94].unlocks[k]); -// printf("\n"); /* Allocate a new dependency list. */ if ( ( deps_new = (int*) malloc(s->size_deps * sizeof(int) ) ) == NULL || ( deps_new_key = (int*) malloc( s->size_deps * sizeof(int) ) ) == NULL ) error( "Failed to allocate new deps lists." ); tic = getticks(); + /* Copy the dependency list to the new list, leaving a space between each task to fit unload dependencies in.*/ for(i = 0; i < s->count; i++) { @@ -1011,7 +865,6 @@ for(i = 0; i < s->count; i++) /* Its possible we might not fit in weird cases so we need to make sure to expand if needed.*/ if(j + t->nr_unlocks + t->nr_uses + t->nr_locks > s->size_deps) { - // printf("\n"); qsched_task_t *temp1, *temp2; s->size_deps *= qsched_stretch; /* Allocate a new task list. */ @@ -1033,34 +886,18 @@ for(i = 0; i < s->count; i++) free(deps_new_key); deps_new = temp1; deps_new_key = temp2; - // printf("Stretch at line 828. m = %i\n", m); - t = &s->tasks[i]; + t = &s->tasks[i]; } int start_j = j; - /*if(i <= 94) -{ - printf("%i\n", s->tasks[94].unlocks); - for(k = 0; k < s->tasks[94].nr_unlocks; k++) - printf("%i ", s->tasks[94].unlocks[k]); - printf(" i = %i\n", i); - printf("%i\n", t->unlocks); -}*/ + /* Sort the unlocks into position, and add enough room for any unlocks to load and unload tasks. */ for(k = 0; k < t->nr_unlocks; k++) { deps_new[j] = t->unlocks[k]; - //if(i == 48) - // printf("%i %i %i %i\n", t->unlocks +k,s->deps_key, s->deps, s->count_deps); deps_new_key[j] = s->deps_key[&t->unlocks[k] - s->deps]; j++; } t->unlocks = &deps_new[start_j]; - /*if(i == 94) -{ - for(k = 0; k < t->nr_unlocks; k++) - printf("%i ", t->unlocks[k]); - printf("\n"); -}*/ j+=t->nr_uses + t->nr_locks; } @@ -1132,25 +969,6 @@ for(i = 1; i < s->count_res; i++) /* Sort super resources by memory address.*/ qsched_sort(&res[sorted[s->count_res-1]], &res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1], minVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1]), maxVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1])); - -/* -for(i = 0; i < s->count_res; i++) -{ - printf("%i ", res_data[i]); -} -printf("\n"); - -for(i = 0; i < s->count_res; i++) -{ -printf("%i ", res[i]); -} -printf("\n"); - -for(i = 0; i < s->count_res; i++) -{ - printf("%i ", s->res[res[i]].parent); -} -printf("\n");*/ /*res now contains an array of indices, first sorted by parent, then memory address of data. */ int size=0; if(sorted[0] != 0) @@ -1180,7 +998,6 @@ for( i = sorted[s->count_res-1]; i < s->count_res-1; i++ ) { if(res_data[i] + (s->res[res[i]].size/sizeof(int)) > res_data[i+1]) { - /*printf("i=%i, s->count_res=%i, res_data[i] = %i, size = %i, res_data[i]+size = %i, res_data[i+1] = %i\n",i, s->count_res, res_data[i], s->res[res[i]].size, res_data[i] + (s->res[res[i]].size/sizeof(int)), res_data[i+1]);*/ error("Overlapping resources are not allowed."); } } @@ -1192,7 +1009,6 @@ for(i = sorted[s->count_res]; i >= 0; i-- ) } toc_run = getticks(); -// message( "Sorting took %.3f ms" , ((double)(toc_run - tic)) * itpms ); tic = getticks(); /* If nothing overlaps create tasks.*/ @@ -1252,6 +1068,7 @@ for(i = 0; i < s->count_res; i++ ) size_uses[i] = s->count_uses / s->count_res + 1; } + /* Add deps from tasks to unload tasks. */ for(i = 0; i < s->count; i++) { @@ -1295,9 +1112,22 @@ for(i = 0; i < s->count; i++) } } -/* Loop through resources. */ +/* Loop through resources, count number dependencies each task needs to have added. */ +int *num_deps = (int*)calloc(sizeof(int) , s->count); +for(i = 0; i < s->count_res; i++ ) +{ + int ID = res[i]; + struct res *resource = &s->res[res[i]]; + num_deps[s->res[ID].utask] += sorted[ID+1] - sorted[ID]; + if(resource->parent >= 0) + num_deps[resource->task]++; + num_deps[resource->task] += num_uses[ res[ i ] ]; + +} + +int *tasks_assigned = (int*)calloc(sizeof(int), s->count); -// TODO Make sure to stretch if needed. +/* Loop through resources. */ for(i = 0; i < s->count_res; i++ ) { int ID = res[i]; @@ -1306,16 +1136,16 @@ for(i = 0; i < s->count_res; i++ ) int parent = s->res[ID].parent; struct res *resource = &s->res[ res[i] ]; - printf("ID = %i, size = %i, numChild = %i, parent = %i\n", ID, size, numChildren, parent); - printf("task = %i, utask = %i\n", s->res[ID].task, s->res[ID].utask); if(s->res[ID].task == s->res[parent].task) continue; /* Loop through children if there are any. */ if(numChildren > 0) { /* Do unload task stuff first. */ - s->tasks[resource->utask].unlocks = &deps_new[s->count_deps]; - s->tasks[resource->utask].nr_unlocks = 0; + if(!tasks_assigned[resource->utask]) { + s->tasks[resource->utask].unlocks = &deps_new[s->count_deps]; + s->tasks[resource->utask].nr_unlocks = 0; + } if(s->count_deps + numChildren > s->size_deps) { qsched_task_t *temp1, *temp2; @@ -1338,7 +1168,6 @@ for(i = 0; i < s->count_res; i++ ) free(deps_new_key); deps_new = temp1; deps_new_key = temp2; - // printf("Stretch at line 1102, m = %i.\n", m); } for(j = 0; j < numChildren; j++) { @@ -1350,15 +1179,19 @@ for(i = 0; i < s->count_res; i++ ) s->tasks[child->utask].wait_init += 1; deps_new_key[s->count_deps] = resource->utask; s->tasks[resource->utask].nr_unlocks += 1; - s->count_deps += 1; } } + if(!tasks_assigned[resource->utask]) { + s->count_deps += num_deps[resource->utask]; + tasks_assigned[resource->utask] = 1; + } } /* Do load task stuff. */ - - s->tasks[resource->task].unlocks = &deps_new[s->count_deps]; - s->tasks[resource->task].nr_unlocks = 0; + if(!tasks_assigned[resource->task]) { + s->tasks[resource->task].unlocks = &deps_new[s->count_deps]; + s->tasks[resource->task].nr_unlocks = 0; + } if(numChildren > 0) { for(j = 0; j < numChildren; j++) @@ -1388,12 +1221,13 @@ for(i = 0; i < s->count_res; i++ ) free(deps_new_key); deps_new = temp1; deps_new_key = temp2; - // printf("Stretch at line 1151.\n"); } for(k = 0; k < num_uses[ res[ sorted[ ID ] + j ] ]; k++) { s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ sorted[ ID ] +j ] ][k]; s->tasks[ usage_list[ res[ sorted[ ID ] +j ] ][k] ].wait_init += 1; +// if(usage_list[res[sorted[ID]+j]][k] == 63) +// printf("Increased wait_init of task 63, wait_init now %i\n", s->tasks[63].wait_init); deps_new_key[s->count_deps] = resource->task; s->tasks[resource->task].nr_unlocks += 1; s->count_deps += 1; @@ -1423,14 +1257,17 @@ for(i = 0; i < s->count_res; i++ ) free(deps_new_key); deps_new = temp1; deps_new_key = temp2; - // printf("Stretch at line 1185.\n"); } if( parent > 0 ) { s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = s->res[parent].task; deps_new_key[s->count_deps] = resource->task; s->tasks[s->res[parent].task].wait_init += 1; + //if(s->res[parent].task == 63) +// printf("task %i should unlock task 63, %i position.\n", resource->task, s->tasks[resource->task].nr_unlocks); s->tasks[resource->task].nr_unlocks += 1; +// if(s->res[parent].task == 63) +// printf("s->tasks[resource->task].nr_unlocks = %i\n", s->tasks[resource->task].nr_unlocks); s->count_deps += 1; } @@ -1439,10 +1276,19 @@ for(i = 0; i < s->count_res; i++ ) s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ i ] ][k]; deps_new_key[s->count_deps] = resource->task; s->tasks[usage_list[ res[ i ] ][k]].wait_init += 1; +// if(usage_list[res[i]][k] == 63) +// printf("task %i should unlock task 63\n", resource->task); s->tasks[resource->task].nr_unlocks += 1; - s->count_deps += 1; +// s->count_deps += 1; } - + if(!tasks_assigned[resource->task]) { + s->count_deps += num_deps[resource->task]; + tasks_assigned[resource->task] = 1; + } + //r(k = 0; k < s->tasks[66].nr_unlocks; k++) +// printf("%i ", s->tasks[66].unlocks[k]); +// printf("\n"); +// printf("s->tasks[66].unlocks - deps_new = %i\n", s->tasks[66].unlocks - deps_new); } //printf("s->deps = %i, deps_new = %i\n", s->deps, deps_new); free(s->deps); @@ -1451,164 +1297,38 @@ s->deps = deps_new; s->deps_key = deps_new_key; s->flags &= ~qsched_flag_dirty; toc2 += getticks() - tic; + // printf("s->tasks[50].unlocks - s->deps = %i\n", s->tasks[50].unlocks - s->deps); -//printf("Number tasks = %i\n", s->count); -//printf("Number dependencies = %i\n", s->count_deps); -/* Set up dependencies with the rest of the system.*/ - - - -/*New version*/ - /* Reset the waits to 0... */ -/* for( k = 0; k < s->count; k++ ) - { - s->tasks[k].wait = 0; - } */ - - /* Run through the tasks and set the waits... */ -/* for ( k = 0 ; k < s->count ; k++ ) { - struct task *t = &s->tasks[k]; - if ( !( t->flags & task_flag_skip ) ) - for ( j = 0 ; j < t->nr_unlocks ; j++ ) - s->tasks[ t->unlocks[j] ].wait += 1; - } - */ - /* Sort the tasks topologically. */ -/* int *tid = (int *)malloc( sizeof(int) * s->count ); - for ( j = 0 , k = 0 ; k < s->count ; k++ ) - if ( s->tasks[k].wait == 0 ) { - tid[j] = k; - j += 1; - } - for ( k = 0 ; k < j ; k++ ) { - struct task *t = &s->tasks[ tid[k] ]; - for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ ) - if ( ( s->tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) { - tid[j] = t->unlocks[kk]; - j += 1; - } - } - if ( k < s->count ) - { - //printf("k = %i, count = %i\n", k, count); - error( "Circular dependencies detected." ); - } - -*/ -/*Do unlocks */ -/*for(i = 0; i < s->count; i++) -{ - struct task *t = &s->tasks[i]; - int result = 0; - if(t->type == type_ghost || t->type == type_unload || t->type == type_load) - continue; - for(j = 0; j < t->nr_uses; j++) - { - result = 0; - for(k = 0; k < t->nr_unlocks && result == 0; k++) - { - result = transitive_use_unlocks(s, &s->tasks[t->unlocks[k]], t->uses[j],0); - - - } - if( result == 0) - qsched_addunlock(s, i, s->res[t->uses[j]].utask); - } - for(j = 0; j < t->nr_locks; j++) - { - result = 0; - for(k = 0; k < t->nr_unlocks && result == 0; k++) - { - result =transitive_use_unlocks(s, &s->tasks[t->unlocks[k]], t->locks[j],0); +/*printf("Dependencies leading from task 50's\n"); +for(i = 0; i < s->tasks[50].nr_unlocks + 5; i++) + printf("%i ", s->tasks[50].unlocks[i]); +printf("\n"); - } - if(result == 0) - qsched_addunlock(s, i, s->res[t->locks[j]].utask); - } -}*/ +printf("s->tasks[50].nr_unlocks = %i, num_deps[50] = %i\n", s->tasks[50].nr_unlocks, num_deps[50]); -/*Do locks */ -/*for(i = s->count-1; i >= 0; i--) -{ - struct task *t = &s->tasks[i]; - struct task *new_t; - int result = 0; - if(t->type == type_ghost || t->type == type_unload || t->type == type_load) - continue; - for(j = 0; j < t->nr_uses; j++) - { - result = 0; - for(k = i-1; k >= 0; k--) - { - new_t = &s->tasks[k]; - for(x = 0; x < new_t->nr_unlocks && result == 0; x++) - { - if(new_t->unlocks[x] == i) - { - result = transitive_use_locks(s, k, t->uses[j],0); - } - } - } - if(result == 0) - { - qsched_addunlock(s, s->res[t->uses[j]].task, i); - } - } - for(j = 0; j < t->nr_locks; j++) +free(num_deps); +printf("Printing dependencies here:\n"); +for(i = 0; i < s->count; i++) +{*/ + // printf("Task ID: %i, type = %i: ", i, s->tasks[i].type); + /*if(s->tasks[i].type < 100) { - result = 0; - for(k = i-1; k >= 0; k--) + for(k = 0; k < s->count_res; k++) { - new_t = &s->tasks[k]; - for(x = 0; x < new_t->nr_unlocks && result == 0; x++) + if(s->res[k].task == i || s->res[k].utask == i ) { - if(new_t->unlocks[x] == i) - { - result = transitive_use_locks(s, k, t->locks[j],0); - } + printf(" resource ID = %i", k); + if(s->res[k].parent != -1) + printf(" parent resource ID = %i", s->res[k].parent); + printf(": "); } } - if(result == 0) - { - qsched_addunlock(s, s->res[t->locks[j]].task, i); - } - } - -}*/ - -/* Old version*/ -/*for(i = 0; i < s->count; i++) -{ - struct task *t = &s->tasks[i]; - if(t->type == type_load || t->type == type_unload || t-> type == type_ghost) - continue; - - for(k = 0; k < t->nr_uses; k++) - { - qsched_addunlock(s, s->res[t->uses[k]].task, i); - qsched_addunlock(s, i, s->res[t->uses[k]].utask); - } - - for(k = 0; k < t->nr_locks; k++) - { - qsched_addunlock(s, s->res[t->locks[k]].task, i); - qsched_addunlock(s, i, s->res[t->locks[k]].utask); - } -}*/ -//qsched_prepare_deps( s ); -//printf("Number dependencies = %i\n", s->count_deps); -/*#ifdef PRIQ -int PCI_res; -PCI_res = qsched_addres(s , qsched_owner_none , qsched_res_none , NULL, 0 , NULL); -s->res[PCI_res].lock = PCIEX; -for(i = 0; i < s->count; i++) -{ - if(s->tasks[i].type == type_load) - { - qsched_addlock(s, i, PCI_res); - } + }*/ + /* for(k = 0; k < s->tasks[i].nr_unlocks; k++) + printf("%i ", s->tasks[i].unlocks[k]); + printf("\n"); } -#endif*/ + printf("----------------\n");*/ toc_run = getticks(); //message( "Setting up dependencies took %.3f ms" , toc2 * itpms ); //error("Got to here"); @@ -1756,7 +1476,6 @@ toc_run = getticks(); - /* Get a pointer to the tasks, set the count. */ tasks = s->tasks; count = s->count; @@ -1783,13 +1502,13 @@ toc_run = getticks(); // } /* Run throught the tasks and link the locks and unlocks. */ - tasks[0].unlocks = s->deps; +// tasks[0].unlocks = s->deps; tasks[0].locks = s->locks; - tasks[0].uses = s->uses; + // tasks[0].uses = s->uses; for ( k = 1 ; k < count ; k++ ) { - tasks[k].unlocks = &tasks[k-1].unlocks[ tasks[k-1].nr_unlocks ]; + // tasks[k].unlocks = &tasks[k-1].unlocks[ tasks[k-1].nr_unlocks ]; tasks[k].locks = &tasks[k-1].locks[ tasks[k-1].nr_locks ]; - tasks[k].uses = &tasks[k-1].uses[ tasks[k-1].nr_uses ]; + // tasks[k].uses = &tasks[k-1].uses[ tasks[k-1].nr_uses ]; } /* All cleaned-up now! */ @@ -1825,7 +1544,7 @@ toc_run = getticks(); // printf("Task ID %i has wait %i and wait_init %i and type %i\n", k, t->wait, t->wait_init, t->type); // } } - + /* Sort the tasks topologically. */ int *tid = (int *)malloc( sizeof(int) * count ); for ( j = 0 , k = 0 ; k < count ; k++ ) @@ -1846,26 +1565,34 @@ toc_run = getticks(); } } + /* Print all dependencies */ - for(i = 0; i < count; i++ ) - { - printf("Task ID: %i, type=%i, ", i, tasks[i].type); - for(j = 0; j < tasks[i].nr_unlocks; j++) - { - printf("%i ", tasks[i].unlocks[j]); - } - printf("\n"); - } + // for(i = 0; i < count; i++ ) +// { +// printf("Task ID: %i, type=%i, wait= %i, ", tid[i], tasks[tid[i]].type, tasks[tid[i]].wait); + // printf("Task ID: %i, type=%i, wait= %i, ", i, tasks[i].type, tasks[i].wait); +// for(j = 0; j < tasks[tid[i]].nr_unlocks; j++) +// { +// printf("%i ", tasks[tid[i]].unlocks[j]); +// } +// printf("\n"); +// } if ( k < count ) { - printf("k = %i, wait = %i\n", tid[k-1], tasks[tid[k-1]].wait); + for(i = 0; i < count; i++) + if(tasks[tid[i]].wait != 0) + printf("Task %i has nonzero wait, wait is %i, wait_init is %i\n", tid[i], tasks[tid[i]].wait, tasks[tid[i]].wait_init); + //printf("tasks[63].wait = %i, tasks[63].wait_init = %i\n", tasks[63].wait, tasks[63].wait_init); + printf("k = %i, count = %i\n", k, count); + printf("tid[k-1] = %i, tid[k-1].wait = %i, initial wait = %i\n", tid[k], tasks[tid[k]].wait, tasks[tid[k]].wait_init); + for(i = 0; i < count; i++) { - t = &tasks[i]; + t = &tasks[tid[i]]; for(j = 0; j < t->nr_unlocks; j++) { - if(t->unlocks[j] == tid[k-1]) - printf("Task %i is unlocking task %i\n",j, tid[k-1]); + if(t->unlocks[j] == tid[k]) + printf("Task %i is unlocking task %i\n",tid[i], tid[k]); } } error( "Circular dependencies detected." ); @@ -2078,13 +1805,14 @@ qsize = max(2*s->count, 512); data[queues[1].count++] = i; data2[queues[1].count-1] = -temp[i].weight; } - if(temp[i].type != type_load && temp[i].wait == 0) + if(temp[i].type != type_load && temp[i].type != type_ghost && temp[i].wait == 0) { - printf("%i %i\n", temp[i].type, i); + printf("Task of type %i with no wait, ID = %i\n", temp[i].type, i); } } - qsched_sort(data, data2, queues[1].count, minVal(data2,queues[1].count), maxVal(data2, queues[1].count)); + /* Opt to use quicksort as difference between max and min weights can be too large.*/ + qsched_quicksort(data, data2, queues[1].count, minVal(data2,queues[1].count), maxVal(data2, queues[1].count)); free(data2); for ( k = queues[1].count ; k < qsize ; k++ ) data[k] = -1; diff --git a/src/cuda_queue.cu.outdated b/src/cuda_queue.cu.outdated new file mode 100644 index 0000000000000000000000000000000000000000..9ef4acf8c426e8fe01d7cd8c1f52908096242491 --- /dev/null +++ b/src/cuda_queue.cu.outdated @@ -0,0 +1,150 @@ +/* +* Not used. +*/ +void qsched_prepare_deps( struct qsched *s ) +{ + int **is_loaded; + int **parents; + int i, k, j, use, usek, usem; + int *num_parents; + is_loaded = (int**)malloc(sizeof(int*) * s->count); + parents = (int**)malloc(sizeof(int*) * s->count); + num_parents = (int*)malloc(sizeof(int) * s->count); + bzero(num_parents, sizeof(int)*s->count); + k = (sizeof(int)*s->count_res)/32 +1; + for(i = 0; i < s->count; i++) + { + is_loaded[i] = (int*)malloc(k); + bzero(is_loaded[i], k); + } + /* Is loaded[i][k] gives the set of k*32 resources for task i*/ + + /* Reset the waits to 0... */ + for( k = 0; k < s->count; k++ ) + { + s->tasks[k].wait = 0; + } + + /* Run through the tasks and set the waits... */ + for ( k = 0 ; k < s->count ; k++ ) { + struct task *t = &s->tasks[k]; + if ( !( t->flags & task_flag_skip ) ) + for ( j = 0 ; j < t->nr_unlocks ; j++ ) + s->tasks[ t->unlocks[j] ].wait += 1; + } + + /* Sort the tasks topologically. */ + int *tid = (int *)malloc( sizeof(int) * s->count ); + for ( j = 0 , k = 0 ; k < s->count ; k++ ) + if ( s->tasks[k].wait == 0 ) { + tid[j] = k; + j += 1; + } + for ( k = 0 ; k < j ; k++ ) { + struct task *t = &s->tasks[ tid[k] ]; + for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ ) + if ( ( s->tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) { + tid[j] = t->unlocks[kk]; + j += 1; + } + } + if ( k < s->count ) + { + error( "Circular dependencies detected." ); + } + + /* Store the maximum number of parents of a task.*/ + int max_parents = 0; + + /* Compute the number of parents for each task */ + for(i = s->count-1; i >= 0; i--) + { + for(j = 0; j < s->tasks[i].nr_unlocks; j++) + { + num_parents[s->tasks[i].unlocks[j]]++; + } + } + /* Allocate memory to store parents in.*/ + for(i = 0; i < s->count; i++) + { + if(num_parents[i] > 0) + parents[i] = (int*)calloc(num_parents[i],sizeof(int)); + else + parents[i] = NULL; + + if(num_parents[i] > max_parents) + { + max_parents = num_parents[i]; + } + + num_parents[i] = 0; + } + + /* This seems to be outdated and unused?*/ + for(i = 0; i < s->count; i++) + { + if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload) + continue; + + for(k = 0; k < s->tasks[i].nr_uses; k++) + { + use = s->tasks[i].uses[k]; + usek = use >> 5; // use / 32; + usem = use & 31; // use % 32. + + if((is_loaded[i][usek] & (1 << (31-usem))) == 0 ) + { + qsched_addunlock(s, s->res[use].task , i ) ; + printf("We actually did this?!"); + is_loaded[i][usek] |= (1 <<(31-usem)); + } + } + for(k = 0; k < s->tasks[i].nr_unlocks; k++) + { + if(s->tasks[s->tasks[i].unlocks[k]].type == type_load || + s->tasks[s->tasks[i].unlocks[k]].type == type_unload ) + continue; + for(j = 0; j < s->count_res/32 +1; j++) + { + is_loaded[s->tasks[i].unlocks[k]][j] |= is_loaded[i][j]; + } + parents[s->tasks[i].unlocks[k]][num_parents[s->tasks[i].unlocks[k]]] = i; + num_parents[s->tasks[i].unlocks[k]] = num_parents[s->tasks[i].unlocks[k]] + 1; + } + } + max_parents = 0; + for(i = 0; i < s->count; i++) + { + if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload) + continue; + if(num_parents[i] > max_parents) + { + max_parents = num_parents[i]; + } + bzero(is_loaded[i], k); + } + for(i = s->count-1; i >= 0; i--) + { + if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload) + continue; + for(k = 0; k < s->tasks[i].nr_uses; k++) + { + use = s->tasks[i].uses[k]; + usek = use >> 5; // use / 32; + usem = use & 31; // use % 32. + if((is_loaded[i][usek] & (1 << (31-usem))) == 0 ) + { + qsched_addunlock(s, i, s->res[use].utask ); + is_loaded[i][usek] |= (1 << (31-usem)); + } + } + for(k = 0; k < num_parents[i]; k++) + { + for(j = 0; j < s->count_res/32 +1; j++) + { + is_loaded[parents[i][k]][j] |= is_loaded[i][j]; + } + } + } + +}