diff --git a/examples/test_qr.cu b/examples/test_qr.cu index 1e8be2456b665d6f53395e0a07ed1604c2d60687..6f075d4b58b3b2c03c6d2912747102c543f3d333 100644 --- a/examples/test_qr.cu +++ b/examples/test_qr.cu @@ -14,6 +14,7 @@ /* Local includes. */ extern "C"{ #include "quicksched.h" +#include "res.h" #include <cblas.h> } #include "cuda_queue.h" @@ -565,7 +566,7 @@ __device__ void runner ( int type , void *data ) { //if(threadIdx.x == 0) // printf("SGEQRF %i %i\n", k, (k*cuda_m+k)*32*32); if(threadIdx.x < 32) - SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m); + SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m); break; case task_SLARFT: // if(threadIdx.x == 0) @@ -759,6 +760,9 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) qsched_res_t *rid; int data[3]; ticks tic, toc_run, tot_setup, tot_run = 0; + #ifdef NO_LOADS + ticks tic_loads; + #endif /* Initialize the scheduler. */ qsched_init( &s , 1 , qsched_flag_none ); cudaDeviceReset(); @@ -800,6 +804,12 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) if( cudaMalloc( &device_array , sizeof(float) * m * n * K * K ) != cudaSuccess ) error("Failed to allocate the matrix on the device"); + #ifdef NO_LOADS + tic = getticks(); + if( cudaMemcpy( A, device_array, sizeof(float) * m * n * K *K, cudaMemcpyHostToDevice ) != cudaSuccess) + error("Failed to copy matrix to device"); + tic_loads = getticks() - tic; + #endif if( cudaMemcpyToSymbol( GPU_matrix , &device_array , sizeof(float *) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy matrix pointer to the device"); @@ -820,6 +830,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none , &A[k*32*32], sizeof(float) * 32 * 32, device_array + k * 32 *32); } + /* Build the tasks. */ for ( k = 0 ; k < m && k < n ; k++ ) { @@ -882,9 +893,18 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) qsched_run_CUDA( &s , func ); + #ifdef NO_LOADS + tic = getticks(); + if( cudaMemcpy( device_array, A, sizeof(float) * m * n * K *K, cudaMemcpyDeviceToHost ) != cudaSuccess) + error("Failed to copy matrix from device"); + tic_loads += getticks() - tic;double itpms = 1000.0 / CPU_TPS; + printf("%.3f\n", ((double)(tic_loads)) * itpms ); + #endif if(cudaMemcpy( tau , tau_device , sizeof(float) * m * n * K , cudaMemcpyDeviceToHost ) != cudaSuccess ) error("Failed to copy the tau data from the device."); + + // printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n); /*float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); float *Q = computeQ(tempMatrix, m*K, K, tau, m); @@ -911,11 +931,24 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) // printMatrix(A, m, n); // printTileMatrix(A, m , n); struct task* tasks = qsched_get_timers( &s , s.count ); - /*for(i = 0; i < s.count; i++) +/* for(i = 0; i < s.count; i++) { - printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); + printf("%i %lli %lli %i ", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); + if(tasks[i].type < 0) + { + int *tempdata = (int*)s.data; + int *idata = &tempdata[tasks[i].data/4]; + int ii = idata[0]; + int row = ii/m; + int col = ii%m; + printf("%i %i", col, row); + + }else + printf("0 0"); + printf("\n"); }*/ - + free(tasks); +cudaDeviceReset(); } @@ -1174,8 +1207,8 @@ int main ( int argc , char *argv[] ) { } /* Dump arguments. */ - message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." , - 32*M , 32*N , nr_threads , runs ); + // message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." , + // 32*M , 32*N , nr_threads , runs ); test_qr( M , N , K , nr_threads , runs ); diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu index 45add67c108e3d9a0c2ddcfdb07d76c4eb58a1b8..5a684959016050b6e3d39bb77ce76c051fb0023d 100644 --- a/src/cuda_queue.cu +++ b/src/cuda_queue.cu @@ -63,7 +63,24 @@ __device__ qsched_funtype fun; -#ifdef GPU_locks +__device__ __inline__ int cuda_trymultilock ( volatile int *l ) { + + int res = atomicAdd( (int *)l, 1); + printf("res = %i\n", res); + res = res <= 0; + if( res == 0) + { + atomicAdd( (int *)l, -1); + return 0; + }else + return 1; + +} + +__device__ __inline__ void cuda_multiunlock ( volatile int *l ) { + atomicAdd( (int *)l, -1 ); +} + __device__ __inline__ int cuda_trylock ( volatile int *l ) { int res = atomicCAS( (int *)l, 0 , 1 ); return res; @@ -78,7 +95,7 @@ __device__ __inline__ int cuda_unlock ( volatile int *l ) { int res = atomicCAS( (int *)l , 1 , 0) != 1 ; return res; } -#endif + /** * @brief Copy bulk memory in a strided way. @@ -92,12 +109,23 @@ __device__ __inline__ void cuda_memcpy_tasks ( void *dest , void *source , int c int k; int *idest = (int *)dest, *isource = (int *)source; - + int val1, val2, val3, val4; /* Copy the data in chunks of sizeof(int). */ - for ( k = threadIdx.x ; k < count/sizeof(int) ; k += blockDim.x ){ - idest[k] = isource[k]; + for ( k = threadIdx.x ; k + 4*blockDim.x < count/sizeof(int) ; k += 4*blockDim.x ){ + val1 = isource[k]; + val2 = isource[k+blockDim.x]; + val3 = isource[k+2*blockDim.x]; + val4 = isource[k+3*blockDim.x]; + idest[k] = val1; + idest[k+blockDim.x] = val2; + idest[k+2*blockDim.x] = val3; + idest[k+3*blockDim.x] = val4; } + for( ; k < count/sizeof(int); k+=blockDim.x) + { + idest[k] = isource[k]; + } } @@ -344,6 +372,7 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) { * This routine blocks until a valid task is picked up, or the * specified queue is empty. */ +#ifndef PRIQ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) { int tid = -1; @@ -368,6 +397,147 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) { return tid; } +#endif +#ifdef PRIQ + +__device__ int get_best_task(struct queue_cuda *q ) +{ + + int ind1, ind2, tid1 = -1, tid2 = -1; + + /* Don't even try... */ + // if ( q->rec_count == q->count ) + // return -1; + + /* Get the index of the next task. */ + ind1 = atomicAdd( &q->first , 1 ); + + if( atomicAdd((int*)&q->nr_avail_tasks, -1) <=0 ) + { + atomicAdd((int*)&q->nr_avail_tasks, 1); + ind1 %= cuda_queue_size; + /* Loop until there is a valid task at that index. */ + while ( tot_num_tasks > 0 && ( tid1 = q->data[ind1] ) < 0); + + if(tasks_cuda[tid1].type == type_load) + cuda_trymultilock(&res_cuda[tasks_cuda[tid1].locks[0]].lock); + /* Scratch the task from the queue */ + if ( tid1 >= 0 ) + { + q->data[ind1] = -1; + atomicAdd((int*) &tot_num_tasks, -1); + } + /* Return the acquired task ID. */ + return tid1; + } + ind2 = atomicAdd( &q->first, 1); + /* Wrap the index. */ + ind1 %= cuda_queue_size; + ind2 %= cuda_queue_size; + /* Loop until there is a valid task at that index. */ + while ( tot_num_tasks > 0 && ( tid1 = q->data[ind1] ) < 0); + // atomicAdd((int*) &tot_num_tasks, -1); + while ( tot_num_tasks > 0 && ( tid2 = q->data[ind2] ) < 0); + + if(tid1 < 0 && tid2 < 0) + return -1; + + int pri1, pri2; + pri1 = tasks_cuda[tid1].weight; + pri2 = tasks_cuda[tid2].weight; + + + int res1, res2; + if( (tid1 >= 0 && tasks_cuda[tid1].type == type_load) ) + { + res1 = cuda_trymultilock(&res_cuda[tasks_cuda[tid1].locks[0]].lock); + //printf("%i %i\n", res1, res_cuda[tasks_cuda[tid1].locks[0]].lock); + } + else + res1 = 1; + + if( tid2 >= 0 && tasks_cuda[tid2].type == type_load && (tid1 >= 0 && tasks_cuda[tid1].type != type_load )) + res2 = cuda_trymultilock(&res_cuda[tasks_cuda[tid2].locks[0]].lock); + else + res2 = 1; + + if(res1 == 0 && res2 == 0) + { + cuda_multiunlock(&res_cuda[tasks_cuda[tid1].locks[0]].lock); + q->data[ind1] = -1; + q->data[ind2] = -1; + cuda_queue_puttask( q, tid1); + cuda_queue_puttask( q, tid2); + return -1; + } + + if(res1 == 0) + { + cuda_multiunlock(&res_cuda[tasks_cuda[tid1].locks[0]].lock); + q->data[ind1] = -1; + q->data[ind2] = -1; + atomicAdd((int*) &tot_num_tasks, -1); + cuda_queue_puttask( q, tid1); + return tid2; + } + + if(res2 == 0) + { + cuda_multiunlock(&res_cuda[tasks_cuda[tid2].locks[0]].lock); + q->data[ind1] = -1; + q->data[ind2] = -1; + atomicAdd((int*) &tot_num_tasks, -1); + cuda_queue_puttask( q, tid2); + return tid1; + } + + + + + + if(pri1 >= pri2) + { + + q->data[ind1] = -1; + q->data[ind2] = -1; + atomicAdd((int*) &tot_num_tasks, -1); + cuda_queue_puttask( q, tid2); +// atomicAdd((int*)&q->nr_avail_tasks, 1); + + return tid1; + }else + { + q->data[ind1] = -1; + q->data[ind2] = -1; + atomicAdd((int*) &tot_num_tasks, -1); + cuda_queue_puttask( q, tid1); + // atomicAdd((int*)&q->nr_avail_tasks, 1); + return tid2; + } + +} + + +__device__ int runner_cuda_gettask ( struct queue_cuda *q ) { + + int tid = -1; + if( atomicAdd((int*)&q->nr_avail_tasks, -1) <= 0) + { + atomicAdd((int*)&q->nr_avail_tasks, 1); + return -1; + } +// atomicAdd((int*)&q->nr_avail_tasks, -1); + tid = get_best_task(q); + + /* Put this task into the recycling queue, if needed. */ + if ( tid >= 0 ) { + q->rec_data[ atomicAdd( (int *)&q->rec_count , 1 ) ] = tid; + } + /* Return whatever we got. */ + return tid; + + } +#endif #endif @@ -383,19 +553,39 @@ __global__ void qsched_device_kernel ( ) /* Pull a task from the queues*/ while ( 1 ) { __syncthreads(); + /*if(threadIdx.x == 0 && blockIdx.x == 0) + { + printf("%i %i %i \n", cuda_queues[0].nr_avail_tasks, cuda_queues[0].data[0], cuda_queues[0].data[1]); + break; + } + else + break;*/ if ( threadIdx.x == 0 ) { tid = -1; /* Highest priority queue, holds the unload tasks. */ +#ifndef PRIQ +#ifndef NO_LOADS if(cuda_queues[2].nr_avail_tasks > 0 ) tid = runner_cuda_gettask( &cuda_queues[2] ); +#endif + /* Middle priority queue, contains user-specifed tasks. */ if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 ) tid = runner_cuda_gettask ( &cuda_queues[0]); - +#ifndef NO_LOADS /* Low priority queue, contains the load tasks. */ - if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0) + if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 /*&& blockIdx.x < 1*/ ) +{ tid = runner_cuda_gettask ( &cuda_queues[1]); + +} +#endif + +#else + if(tot_num_tasks > 0 && cuda_queues[0].nr_avail_tasks > 0) + tid = runner_cuda_gettask ( &cuda_queues[0] ); +#endif } /*Everyone wait for us to get a task id*/ @@ -409,6 +599,7 @@ __global__ void qsched_device_kernel ( ) if(tid < 0 && cuda_queues[0].nr_avail_tasks == 0 && cuda_queues[1].nr_avail_tasks == 0 && cuda_queues[2].nr_avail_tasks == 0) break; #endif + /* If we couldn't find a task but some are not completed, try again. */ if(tid < 0) continue; @@ -427,6 +618,11 @@ __global__ void qsched_device_kernel ( ) src = (int*)res_cuda[d[0]].data; dest = (int*)res_cuda[d[0]].gpu_data; cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, tid); + #ifdef PRIQ + __syncthreads(); + if(threadIdx.x == 0) + cuda_multiunlock(&res_cuda[tasks_cuda[tid].locks[0]].lock); + #endif }else if( tasks_cuda[tid].type == type_unload ) { int *d = (int*)&data_cuda[tasks_cuda[tid].data]; @@ -452,12 +648,16 @@ __global__ void qsched_device_kernel ( ) if( atomicSub( &tasks_cuda[tasks_cuda[tid].unlocks[i]].wait , 1 ) == 1 && !( tasks_cuda[tasks_cuda[tid].unlocks[i]].flags & task_flag_skip )) { /* Place unloads into highest priority queue, any other task goes to normal priority queue. Load tasks are never unlocked.*/ + #ifdef PRIQ + cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] ); + #else if(tasks_cuda[tasks_cuda[tid].unlocks[i]].type != type_unload) { cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] ); } else cuda_queue_puttask( &cuda_queues[2] , tasks_cuda[tid].unlocks[i] ); + #endif } } } @@ -477,15 +677,21 @@ __global__ void qsched_device_kernel ( ) /* Reset values.*/ + /*printf("cuda_queues[0].first = %i, nr_avail_tasks = %i, tot_num_tasks = %i, data[first] = %i\n", cuda_queues[0].first, cuda_queues[0].nr_avail_tasks ,tot_num_tasks, cuda_queues[0].data[cuda_queues[0].first] ); + printf("cuda_queues[0].last = %i, data[last-1] = %i\n", cuda_queues[0].last ,cuda_queues[0].data[cuda_queues[0].last] ); + for(tid = cuda_queues[0].first; tid < cuda_queues[0].first + cuda_queues[0].nr_avail_tasks; tid++) + { + printf("task =%i, type = %i\n", cuda_queues[0].data[tid%cuda_queue_size], tasks_cuda[cuda_queues[0].data[tid%cuda_queue_size]].type ); + } */ cuda_queues[0].first = 0; cuda_queues[0].last = 0; cuda_queues[0].rec_count = 0; - cuda_queues[1].first = 0; + /*cuda_queues[1].first = 0; cuda_queues[1].last = cuda_queues[1].count; cuda_queues[1].rec_count = 0; cuda_queues[2].first = 0; cuda_queues[2].last = 0; - cuda_queues[2].rec_count = 0; + cuda_queues[2].rec_count = 0;*/ } //TODO @@ -527,7 +733,7 @@ int minVal( int *array, int size ) return maxi ; } -void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, int parent, int *res, int *sorted ) +void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, int parent, int *res, int *sorted) { int i,j; int task, utask; @@ -788,7 +994,7 @@ if(s->res[0].task != -1) return; } double itpms = 1000.0 / CPU_TPS; -ticks tic, toc_run ; +ticks tic, toc_run, toc2 ; tic = getticks(); /* Expand the deps array so we can add new dependencies in place. */ @@ -807,7 +1013,7 @@ j = 0; if ( ( deps_new = (int*) malloc(s->size_deps * sizeof(int) ) ) == NULL || ( deps_new_key = (int*) malloc( s->size_deps * sizeof(int) ) ) == NULL ) error( "Failed to allocate new deps lists." ); - +tic = getticks(); /* Copy the dependency list to the new list, leaving a space between each task to fit unload dependencies in.*/ for(i = 0; i < s->count; i++) { @@ -837,7 +1043,7 @@ for(i = 0; i < s->count; i++) free(deps_new_key); deps_new = temp1; deps_new_key = temp2; -// printf("Stretch at line 828. m = %i\n", m); + // printf("Stretch at line 828. m = %i\n", m); t = &s->tasks[i]; } int start_j = j; @@ -870,6 +1076,7 @@ for(i = 0; i < s->count; i++) s->count_deps = j; +toc2 = getticks() - tic; /* Store number of children for each resource*/ sorted = (int*) malloc(sizeof(int) * (s->count_res+1)); @@ -1012,6 +1219,7 @@ for( i = sorted[s->count_res]; i < s->count_res; i++ ) } toc_run = getticks(); +tic = getticks(); // message( "Creating load tasks took %.3f ms" , ((double)(toc_run - tic)) * itpms ); @@ -1049,7 +1257,7 @@ num_uses = (int*) malloc(sizeof(int) * s->count_res ); size_uses = (int*) malloc(sizeof(int) * s->count_res); for(i = 0; i < s->count_res; i++ ) { - usage_list[i] = (int*) malloc(sizeof(int) * s->count_uses / s->count_res + 1); + usage_list[i] = (int*) malloc(sizeof(int) * (s->count_uses / s->count_res + 1)); num_uses[i] = 0; size_uses[i] = s->count_uses / s->count_res + 1; } @@ -1064,6 +1272,7 @@ for(i = 0; i < s->count; i++) for(j = 0; j < t->nr_uses; j++) { t->unlocks[t->nr_unlocks] = s->res[t->uses[j]].utask; + s->tasks[s->res[t->uses[j]].utask].wait_init +=1 ; deps_new_key[(t->unlocks - deps_new) + t->nr_unlocks] = i; t->nr_unlocks++; if(num_uses[t->uses[j]] == size_uses[t->uses[j]]) @@ -1080,6 +1289,7 @@ for(i = 0; i < s->count; i++) for(j = 0; j < t->nr_locks; j++) { t->unlocks[t->nr_unlocks] = s->res[t->locks[j]].utask; + s->tasks[s->res[t->locks[j]].utask].wait_init +=1 ; deps_new_key[(t->unlocks - deps_new) + t->nr_unlocks] = i; t->nr_unlocks++; if(num_uses[t->locks[j]] == size_uses[t->locks[j]]) @@ -1134,7 +1344,7 @@ for(i = 0; i < s->count_res; i++ ) free(deps_new_key); deps_new = temp1; deps_new_key = temp2; -// printf("Stretch at line 1102, m = %i.\n", m); + // printf("Stretch at line 1102, m = %i.\n", m); } for(j = 0; j < numChildren; j++) { @@ -1143,6 +1353,7 @@ for(i = 0; i < s->count_res; i++ ) if( child->utask != resource->utask ) { s->tasks[resource->utask].unlocks[ s->tasks[resource->utask].nr_unlocks ] = child->utask; + s->tasks[child->utask].wait_init += 1; deps_new_key[s->count_deps] = resource->utask; s->tasks[resource->utask].nr_unlocks += 1; s->count_deps += 1; @@ -1183,11 +1394,12 @@ for(i = 0; i < s->count_res; i++ ) free(deps_new_key); deps_new = temp1; deps_new_key = temp2; -// printf("Stretch at line 1151.\n"); + // printf("Stretch at line 1151.\n"); } for(k = 0; k < num_uses[ res[ sorted[ ID ] + j ] ]; k++) { s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ sorted[ ID ] +j ] ][k]; + s->tasks[ usage_list[ res[ sorted[ ID ] +j ] ][k] ].wait_init += 1; deps_new_key[s->count_deps] = resource->task; s->tasks[resource->task].nr_unlocks += 1; s->count_deps += 1; @@ -1217,12 +1429,13 @@ for(i = 0; i < s->count_res; i++ ) free(deps_new_key); deps_new = temp1; deps_new_key = temp2; -// printf("Stretch at line 1185.\n"); + // printf("Stretch at line 1185.\n"); } if( parent > 0 ) { s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = s->res[parent].task; deps_new_key[s->count_deps] = resource->task; + s->tasks[s->res[parent].task].wait_init += 1; s->tasks[resource->task].nr_unlocks += 1; s->count_deps += 1; } @@ -1231,6 +1444,7 @@ for(i = 0; i < s->count_res; i++ ) { s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ i ] ][k]; deps_new_key[s->count_deps] = resource->task; + s->tasks[usage_list[ res[ i ] ][k]].wait_init += 1; s->tasks[resource->task].nr_unlocks += 1; s->count_deps += 1; } @@ -1242,7 +1456,8 @@ free(s->deps_key); s->deps = deps_new; s->deps_key = deps_new_key; s->flags &= ~qsched_flag_dirty; -tic = getticks(); +toc2 += getticks() - tic; + //printf("Number tasks = %i\n", s->count); //printf("Number dependencies = %i\n", s->count_deps); /* Set up dependencies with the rest of the system.*/ @@ -1388,8 +1603,20 @@ tic = getticks(); }*/ //qsched_prepare_deps( s ); //printf("Number dependencies = %i\n", s->count_deps); +#ifdef PRIQ +int PCI_res; +PCI_res = qsched_addres(s , qsched_owner_none , qsched_res_none , NULL, 0 , NULL); +s->res[PCI_res].lock = PCIEX; +for(i = 0; i < s->count; i++) +{ + if(s->tasks[i].type == type_load) + { + qsched_addlock(s, i, PCI_res); + } +} +#endif toc_run = getticks(); -// message( "Setting up dependencies took %.3f ms" , ((double)(toc_run - tic)) * itpms ); + //message( "Setting up dependencies took %.3f ms" , toc2 * itpms ); //error("Got to here"); } @@ -1526,7 +1753,9 @@ ticks tic, toc_run ; } } +#ifndef NO_LOADS qsched_prepare_loads(s); +#endif toc_run = getticks(); // message( "prepare_loads took %.3f ms" , ((double)(toc_run - tic)) * itpms ); @@ -1539,7 +1768,7 @@ toc_run = getticks(); count = s->count; /* If the sched is dirty... */ - if ( s->flags & qsched_flag_dirty ) { + if ( 1 /*s->flags & qsched_flag_dirty*/ ) { /* Do the sorts in parallel, if possible. */ // #pragma omp parallel @@ -1551,7 +1780,7 @@ toc_run = getticks(); /* Sort the locks. */ // #pragma omp single nowait - // qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 ); + qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 ); /* Sort the uses. */ // #pragma omp single nowait @@ -1583,25 +1812,30 @@ toc_run = getticks(); tasks[k].wait = 0; } - int* store_waits; - if( (store_waits = (int*) malloc(sizeof(int)*s->count) ) == NULL) - error("Failed to allocate store_waits"); - /* Run through the tasks and set the waits... */ - for ( k = 0 ; k < count ; k++ ) { +/* for ( k = 0 ; k < count ; k++ ) { t = &tasks[k]; if ( !( t->flags & task_flag_skip ) ) for ( j = 0 ; j < t->nr_unlocks ; j++ ) { tasks[ t->unlocks[j] ].wait += 1; } - } + }*/ + + for( k = 0; k < count; k++ ) + { + t = &tasks[k]; + t->wait = t->wait_init; +// if(t->wait != t->wait_init) + // { + // printf("Task ID %i has wait %i and wait_init %i and type %i\n", k, t->wait, t->wait_init, t->type); + // } + } /* Sort the tasks topologically. */ int *tid = (int *)malloc( sizeof(int) * count ); for ( j = 0 , k = 0 ; k < count ; k++ ) { - store_waits[k] = tasks[k].wait; if ( tasks[k].wait == 0 ) { tid[j] = k; j += 1; @@ -1671,19 +1905,36 @@ toc_run = getticks(); } /* Run through the topologically sorted tasks backwards and set their weights, re-setting the waits while we're at it. */ +#ifdef PRIQ for ( k = count-1 ; k >= 0 ; k-- ) { int maxweight = 0; t = &tasks[ tid[k] ]; - t->wait = store_waits[tid[k]]; + t->wait = t->wait_init; for ( j = 0 ; j < t->nr_unlocks ; j++ ) { if ( tasks[ t->unlocks[j] ].weight > maxweight ) maxweight = tasks[ t->unlocks[j] ].weight; } - t->weight = t->cost + maxweight; - } - - free(store_waits); + if(t->type != type_load) + t->weight = t->cost + maxweight; + else + t->weight = t->cost + maxweight - 1000; +// if(t->weight < -100) + // printf("%i\n", t->weight); + } +#endif +#ifndef PRIQ + for( k = 0; k < count; k++ ) + { + t = &tasks[k]; + t->wait = t->wait_init; +// if(t->wait != t->wait_init) + // { + // printf("Task ID %i has wait %i and wait_init %i and type %i\n", k, t->wait, t->wait_init, t->type); + // } + } +#endif + /*Allocate temporary tasks to setup device tasks*/ temp = (struct task *) malloc(s->count * sizeof(struct task)); @@ -1762,10 +2013,95 @@ if( cudaMemcpyToSymbol ( tasks_cuda, &cuda_t , sizeof(struct task *) , 0 , cud error("Failed to copy task pointer to the device."); /* Initialize the queues. */ +#ifdef PRIQ +int nr_queues= 1, qsize; +#else int nr_queues= 3,qsize; +#endif int *data2; struct queue_cuda queues[ cuda_numqueues ]; + +#ifdef PRIQ +qsize = max(2*s->count, 512); + if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy queue size to the device."); + + /* Allocate a temporary buffer for the queue data. */ + if ( ( data = (int *)malloc( sizeof(int) * qsize ) ) == NULL ) + error("Failed to allocate data buffer."); + if( ( data2 = (int *) malloc( sizeof(int) * qsize ) ) == NULL ) + error("Failed to allocate data2 buffer."); + queues[0].count = 0; + + for(i = 0; i < s->count; i++) + { + if(s->tasks[i].wait == 0) + { + if(s->tasks[i].type != type_load) + { + printf("i = %i\n", i); + for(k = 0; k < s->count; k++) + { + for(j = 0; j < s->tasks[k].nr_unlocks; j++) + { + if(s->tasks[k].unlocks[j] == i) + printf("Should be unlocked by %i\n", k); + } + for(j = 0; j < s->tasks[k].nr_uses; j++) + { + if(s->tasks[k].uses[j] == 256) + printf("Task %i uses resource 256\n", k); + } + for(j = 0; j < s->tasks[k].nr_locks; j++) + { + if(s->tasks[k].locks[j] == 256) + printf("Task %i locks resource 256\n", k); + } + } + for(k = 0; k < s->count_res; k++) + { + if(s->res[k].utask == i) + printf("resource = %i\n", k); + } + printf("%i\n", i); + } + data[queues[0].count++] = i; + data2[queues[0].count-1] = -temp[i].weight; + } + } + qsched_sort(data, data2, queues[0].count, minVal(data2,queues[0].count), maxVal(data2, queues[0].count)); + free(data2); + for ( i = queues[0].count ; i < qsize ; i++ ) + data[i] = -1; + + /* Allocate and copy the data. */ + if ( cudaMalloc( &queues[0].data , sizeof(int) * qsize ) != cudaSuccess ) + error("Failed to allocate queue data on the device."); + if ( cudaMemcpy( (void *)queues[0].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy queue data pointer to the device"); + + for ( k = 0; k < qsize; k++ ) + data[k] = -1; + + /* Allocate and copy the recyling data. */ + if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * qsize ) != cudaSuccess ) + error("Failed to allocate queue data on the device."); + if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy queue data pointer to the device"); + + + /* Set some other values. */ + queues[0].first = 0; + queues[0].last = queues[0].count; + queues[0].nr_avail_tasks = queues[0].last; + queues[0].rec_count = 0; + queues[0].count = s->count; + + /* Copy the queue structures to the device. */ + if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy the queues to the device"); +#else qsize = max(2*s->count / nr_queues, 256); if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy queue size to the device."); @@ -1784,6 +2120,10 @@ struct queue_cuda queues[ cuda_numqueues ]; data[queues[1].count++] = i; data2[queues[1].count-1] = -temp[i].weight; } + if(temp[i].type != type_load && temp[i].wait == 0) + { + printf("%i %i\n", temp[i].type, i); + } } qsched_sort(data, data2, queues[1].count, minVal(data2,queues[1].count), maxVal(data2, queues[1].count)); @@ -1872,7 +2212,7 @@ struct queue_cuda queues[ cuda_numqueues ]; /* Copy the queue structures to the device. */ if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy the queues to the device"); - +#endif /* Clean up. */ free( tid ); @@ -1919,22 +2259,28 @@ struct task* qsched_get_timers( struct qsched *s, int numtasks ) void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) { #ifdef WITH_CUDA - ProfilerStart("/home/aidan/quicksched-code/examples/profiler.out"); + //ProfilerStart("/home/aidan/quicksched-code/examples/profiler.out"); double itpms = 1000.0 / CPU_TPS; ticks tic, toc_run ; tic = getticks(); qsched_prepare_cuda( s ); toc_run = getticks(); + printf("%.3f ", ((double)(toc_run - tic)) * itpms ); // message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms ); cudaMemcpyToSymbol( fun , &func , sizeof(qsched_funtype)); tic = getticks(); - ProfilerStop(); + // ProfilerStop(); cudaMemcpyToSymbol( tot_num_tasks, &s->count, sizeof(int) ); qsched_device_kernel<<<128, 128 >>> ( ); if( cudaDeviceSynchronize() != cudaSuccess ) error("Failed to execute kernel:%s", cudaGetErrorString(cudaPeekAtLastError())); toc_run = getticks(); - message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms ); + #ifdef NO_LOADS + printf("%.3f ", ((double)(toc_run - tic)) * itpms ); + #else + printf("%.3f\n", ((double)(toc_run - tic)) * itpms ); + #endif + // message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms ); #else error("QuickSched was not compiled with CUDA support."); diff --git a/src/lock.h b/src/lock.h index 5c14abfe76e7bac02f6e15512e7276b01ccd5fe3..d739a64c6c256ba0e84ba6ec25a3d12096329ee2 100644 --- a/src/lock.h +++ b/src/lock.h @@ -28,6 +28,7 @@ #ifdef WITH_CUDA #define static +#define PCIEX -16 #endif #ifdef PTHREAD_LOCK diff --git a/src/qsched.c b/src/qsched.c index eebbfdfc1be395b66e6d1602fe6bdb3ebeaf2ff9..4e813c4ac107068066996720539a818d402dabe9 100644 --- a/src/qsched.c +++ b/src/qsched.c @@ -1396,6 +1396,7 @@ void qsched_addunlock ( struct qsched *s , int ta , int tb ) { s->deps[ s->count_deps ] = tb; s->deps_key[ s->count_deps ] = ta; s->tasks[ta].nr_unlocks += 1; + s->tasks[tb].wait_init += 1; /* Increase the deps counter. */ s->count_deps += 1; @@ -1486,6 +1487,7 @@ int qsched_addtask ( struct qsched *s , int type , unsigned int flags , void *da t->nr_unlocks = 0; t->nr_locks = 0; t->nr_uses = 0; + t->wait_init = 0; /* Add a relative pointer to the data. */ memcpy( &s->data[ s->count_data ] , data , data_size ); diff --git a/src/task.h b/src/task.h index 99cd36dcb957f473c799036d2223f7c557dd441e..3878cddc61eb47890d83dabe0576590c9bdb05b0 100644 --- a/src/task.h +++ b/src/task.h @@ -35,7 +35,7 @@ struct task { int data; /* Task wait counter. */ - int wait; + int wait, wait_init; /* Number of potential conflicts. */ int nr_conflicts;