From 17563ce8d279902d069d9b7322cac73d22c87d4d Mon Sep 17 00:00:00 2001 From: aidan <aidan@gtx690.dur.ac.uk> Date: Thu, 26 Mar 2015 11:42:28 +0000 Subject: [PATCH] Latest update --- examples/test_bh_4.cu | 25 ++++++++++---------- src/CUDACompile.sh | 40 ++++++++++++++++++-------------- src/cuda_queue.cu | 54 ++++++++++++++++++++++++++++++++++++------- 3 files changed, 81 insertions(+), 38 deletions(-) diff --git a/examples/test_bh_4.cu b/examples/test_bh_4.cu index a1bd8da..2b8862b 100644 --- a/examples/test_bh_4.cu +++ b/examples/test_bh_4.cu @@ -811,8 +811,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { /* Create the task. */ tid = qsched_addtask(s, task_type_self, task_flag_none, data, - sizeof(int) * 2, ci->count * ci->count / 2); - + sizeof(int) * 2, ci->count * ci->count * 0.05); /* Add the resource (i.e. the cell) to the new task. */ qsched_addlock(s, tid, ci->res); @@ -840,7 +839,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { } } - if( 0 && ci->count > 64*cell_maxparts) + if( ci->count > 64*cell_maxparts) { /* Let's also build a particle-monopole task */ for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]){ @@ -848,8 +847,8 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { data[0] = cp-cell_pool; data[1] = cj-cell_pool; tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data, - sizeof(int) * 2, ci->count + cj->count); - + sizeof(int) * 2, cp->count * cj->count); + /* Add the resource and dependance */ qsched_addlock(s, tid, cp->res); // qsched_addlock(s, tid, cj->res); @@ -859,7 +858,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { data[0] = cp-cell_pool; data[1] = ci-cell_pool; tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data, - sizeof(int) * 2, ci->count + cj->count); + sizeof(int) * 2, cp->count * cj->count); /* Add the resource and dependance */ qsched_addlock(s, tid, cp->res); @@ -869,12 +868,12 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { { data[0] = ci -cell_pool; data[1] = cj - cell_pool; - tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count + cj->count); + tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count * cj->count); /* Add the resource and dependance */ qsched_addlock(s, tid, ci->res); data[0] = cj - cell_pool; data[1] = ci - cell_pool; - tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count + cj->count); + tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count * cj->count); qsched_addlock(s, tid, cj->res); } @@ -893,7 +892,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { /* Create the task. */ tid = qsched_addtask(s, task_type_pair, task_flag_none, data, - sizeof(int) * 2, ci->count * cj->count); + sizeof(int) * 2, ci->count * cj->count * 0.1); /* Add the resources. */ qsched_addlock(s, tid, ci->res); @@ -1226,13 +1225,13 @@ float *comm_temp; if( cudaMemcpyFromSymbol( &pcs, pc_calcs, sizeof(int), 0, cudaMemcpyDeviceToHost) != cudaSuccess) error("Failed"); printf("pc calcs = %i\n", pcs); -/*struct task* tasks = qsched_get_timers( &s , s.count ); +struct task* tasks = qsched_get_timers( &s , s.count ); for(i = 0; i < s.count; i++) { printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); // printf("\n"); - }*/ + } } #ifdef EXACT @@ -1313,8 +1312,8 @@ ticks tic, toc_run ; int main(int argc, char *argv[]) { int c, nr_threads; int N = 1000, runs = 1; - char fileName[100] = {0}; - + char fileName[100]; + fileName[0] = 0; /* Parse the options */ while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) { case 'n': diff --git a/src/CUDACompile.sh b/src/CUDACompile.sh index 085086c..6f4804a 100755 --- a/src/CUDACompile.sh +++ b/src/CUDACompile.sh @@ -1,42 +1,48 @@ #!/bin/bash FLAGS2="-Xcompiler=-fsanitize=address -Xcompiler=-fno-omit-frame-pointer" DEBUG_FLAGS="-G -DDEBUG_GPU" -FLAGS="-O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8" +FLAGS="-O3 -DSIM -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8" # -DGPU_locks -Xptxas -dlcm=cg -Xptxas="-v"" # -DNO_LOADS +#-DSIM +#-lprofiler CFLAGS="-fsanitize=address -fno-omit-frame-pointer" -gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT qsched.lo -MD -MP -MF .deps/qsched.Tpo -c qsched.c -o qsched.o -lprofiler -gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT qsched.lo -MD -MP -MF .deps/qsched.Tpo -c queue.c -o queue.o -lprofiler +gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT qsched.lo -MD -MP -MF .deps/qsched.Tpo -c qsched.c -o qsched.o -lprofiler -DSIM #-fsanitize=address -fno-omit-frame-pointer +gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT qsched.lo -MD -MP -MF .deps/qsched.Tpo -c queue.c -o queue.o -lprofiler -DSIM #-fsanitize=address -fno-omit-frame-pointer -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc cuda_queue.cu qsched.o queue.o -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -dc cuda_queue.cu qsched.o queue.o -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda.a -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda.a -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -DGPU_locks -dc cuda_queue.cu qsched.o queue.o -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -DGPU_locks -dc cuda_queue.cu qsched.o queue.o -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda_locks.a -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda_locks.a -lprofiler cd ../examples -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_gpu_simple.cu -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcuda test_gpu_simple.cu -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_gpu_simple.o ../src/.libs/libquicksched_cuda.a -o simple_cuda_test -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -Xnvlink -v test_gpu_simple.o ../src/.libs/libquicksched_cuda.a -o simple_cuda_test -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -lf77blas -lcblas -latlas -lm -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a test_qr.cu -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -lf77blas -lcblas -latlas -lm -I$CULA_INC_PATH -L$CULA_LIB_PATH_64 -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcula_lapack -lcuda /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a test_qr.cu -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -lf77blas -lcblas -latlas -lm -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_qr.o ../src/.libs/libquicksched_cuda.a /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a -o qr_cuda_test -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -m64 -I../src -lf77blas -lcblas -latlas -lm -I$CULA_INC_PATH -L$CULA_LIB_PATH_64 -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcula_lapack -Xnvlink -v test_qr.o ../src/.libs/libquicksched_cuda.a /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a -o qr_cuda_test -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_hierarchy.cu -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcuda test_hierarchy.cu -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_hierarchy.o ../src/.libs/libquicksched_cuda.a -o test_heirarchy -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -Xnvlink -v test_hierarchy.o ../src/.libs/libquicksched_cuda.a -o test_heirarchy -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_2.cu -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcuda test_bh_2.cu -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_2.o ../src/.libs/libquicksched_cuda.a -o test_bh_2 -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -Xnvlink -v test_bh_2.o ../src/.libs/libquicksched_cuda.a -o test_bh_2 -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_3.cu -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcuda test_bh_3.cu -lprofiler -/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_3.o ../src/.libs/libquicksched_cuda.a -o test_bh_3 -lprofiler +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -Xnvlink -v test_bh_3.o ../src/.libs/libquicksched_cuda.a -o test_bh_3 -lprofiler + +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcuda test_bh_4.cu -lprofiler + +/home/aidan/cuda_7.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -Xnvlink -v test_bh_4.o ../src/.libs/libquicksched_cuda.a -o test_bh_4 -lprofiler diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu index d42e785..1e87003 100644 --- a/src/cuda_queue.cu +++ b/src/cuda_queue.cu @@ -80,6 +80,10 @@ __device__ int cuda_barrier = 0; __device__ volatile int tot_num_tasks; __device__ qsched_funtype fun; +#ifdef SIM +__device__ int num_loads; +__device__ int num_tasks; +#endif /** * @brief Get a task ID from the given queue. * @@ -533,7 +537,11 @@ __global__ void qsched_device_kernel ( ) tid = -1; /* Highest priority queue, holds the unload tasks. */ #ifndef NO_LOADS + #ifdef SIM + if(cuda_queues[2].nr_avail_tasks > 0 && num_tasks <= 0) + #else if(cuda_queues[2].nr_avail_tasks > 0 ) + #endif { TIMER_TIC tid = runner_cuda_gettask( &cuda_queues[2] ); @@ -543,7 +551,11 @@ __global__ void qsched_device_kernel ( ) #ifndef NO_LOADS /* Low priority queue, contains the load tasks. */ +#ifdef SIM + if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 ) +#else if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 && blockIdx.x < 12 ) +#endif { TIMER_TIC tid = runner_cuda_gettask ( &cuda_queues[1]); @@ -552,21 +564,16 @@ __global__ void qsched_device_kernel ( ) #endif /* Middle priority queue, contains user-specifed tasks. */ -#ifdef PRIQ - if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 ) - { - TIMER_TIC - tid = runner_cuda_gettask_priority ( &cuda_queues[0]); - TIMER_TOC(timers_queue) - } +#ifdef SIM + if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 && num_loads <= 0 ) #else if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 ) +#endif { TIMER_TIC tid = runner_cuda_gettask ( &cuda_queues[0]); TIMER_TOC(timers_queue) } -#endif } /*Everyone wait for us to get a task id*/ @@ -601,6 +608,10 @@ __global__ void qsched_device_kernel ( ) dest = (int*)res_cuda[d[0]].gpu_data; cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, tid); TIMER_TOC(timers_doload) + #ifdef SIM + if(threadIdx.x == 0) + atomicAdd(&num_loads, -1); + #endif }else if( tasks_cuda[tid].type == type_unload ) { TIMER_TIC @@ -614,7 +625,18 @@ __global__ void qsched_device_kernel ( ) fun(tasks_cuda[tid].type , &data_cuda[tasks_cuda[tid].data]); __syncthreads(); TIMER_TOC(timers_doother) + #ifdef SIM + if(threadIdx.x == 0) + atomicAdd(&num_tasks, -1); + + }else if (tasks_cuda[tid].type == type_ghost) + { + if(threadIdx.x == 0) + atomicAdd(&num_tasks, -1); + } + #else } + #endif __syncthreads(); /*Stop the task clock*/ if( threadIdx.x == 0 ) @@ -626,6 +648,7 @@ __global__ void qsched_device_kernel ( ) cuda_done( &tasks_cuda[tid] ); __syncthreads(); #endif +#ifndef SIM for(i = threadIdx.x; i < tasks_cuda[tid].nr_unlocks; i += blockDim.x ) { if( atomicSub( &tasks_cuda[tasks_cuda[tid].unlocks[i]].wait , 1 ) == 1 && !( tasks_cuda[tasks_cuda[tid].unlocks[i]].flags & task_flag_skip )) @@ -639,6 +662,7 @@ __global__ void qsched_device_kernel ( ) cuda_queue_puttask( &cuda_queues[2] , tasks_cuda[tid].unlocks[i] ); } } +#endif } @@ -1588,6 +1612,10 @@ struct queue_cuda queues[ cuda_numqueues ]; for ( k = 0; k < qsize; k++ ) data[k] = -1; +#ifdef SIM + if( cudaMemcpyToSymbol(num_loads, (int*)&queues[1].count, sizeof(int),0, cudaMemcpyHostToDevice ) != cudaSuccess) + error("Failed to copy num_loads"); +#endif /* Allocate and copy the recyling data. */ if ( cudaMalloc( &queues[1].rec_data , sizeof(int) * s->count*1.2 ) != cudaSuccess ) error("Failed to allocate queue data on the device."); @@ -1618,7 +1646,9 @@ struct queue_cuda queues[ cuda_numqueues ]; if(temp[k].type == type_unload) { num_unload++; +#ifndef SIM if(temp[k].wait == 0) +#endif data[queues[2].count++] = k; } } @@ -1640,11 +1670,19 @@ struct queue_cuda queues[ cuda_numqueues ]; queues[0].count = 0; for ( k = 0; k < s->count ; k++ ) { +#ifdef SIM + if(temp[k].type != type_load && temp[k].type != type_unload){ +#else if(temp[k].type != type_load && temp[k].type != type_unload && temp[k].wait == 0){ +#endif data[queues[0].count++] = k; } } + #ifdef SIM + if(cudaMemcpyToSymbol(num_tasks, (int*)&queues[0].count, sizeof(int), 0, cudaMemcpyHostToDevice) != cudaSuccess) + error("Failed to copy num_tasks"); + #endif queues[0].first = 0; queues[0].last = queues[0].count; queues[0].nr_avail_tasks = queues[0].last; -- GitLab