From 17563ce8d279902d069d9b7322cac73d22c87d4d Mon Sep 17 00:00:00 2001
From: aidan <aidan@gtx690.dur.ac.uk>
Date: Thu, 26 Mar 2015 11:42:28 +0000
Subject: [PATCH] Add SIM mode, move build to CUDA 7.0, retune test_bh_4 costs

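Add a SIM build mode to the CUDA scheduler: track num_loads and
num_tasks on the device, start user tasks only once all load tasks have
completed and unload tasks only once all user tasks have completed,
skip the per-task dependency unlocking, and enqueue tasks regardless of
their wait counts. The PRIQ priority-queue branch is removed.

CUDACompile.sh now targets the CUDA 7.0 toolchain, defines -DSIM, adds
the CULA include/library paths to the QR test build, and builds
test_bh_4.

test_bh_4.cu enables the particle-monopole split tasks, rescales the
task cost estimates passed to qsched_addtask, and re-enables the
qsched_get_timers output loop.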
---
 examples/test_bh_4.cu | 25 ++++++++++----------
 src/CUDACompile.sh    | 40 ++++++++++++++++++--------------
 src/cuda_queue.cu     | 54 ++++++++++++++++++++++++++++++++++++-------
 3 files changed, 81 insertions(+), 38 deletions(-)

diff --git a/examples/test_bh_4.cu b/examples/test_bh_4.cu
index a1bd8da..2b8862b 100644
--- a/examples/test_bh_4.cu
+++ b/examples/test_bh_4.cu
@@ -811,8 +811,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
 
       /* Create the task. */
       tid =  qsched_addtask(s, task_type_self, task_flag_none, data,
-			    sizeof(int) * 2, ci->count * ci->count / 2);
-
+			    sizeof(int) * 2, ci->count * ci->count * 0.05);
       /* Add the resource (i.e. the cell) to the new task. */
       qsched_addlock(s, tid, ci->res);
 
@@ -840,7 +839,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
 	}
       }
 
-    if( 0 && ci->count > 64*cell_maxparts)
+    if( ci->count > 64*cell_maxparts)
     {
       /* Let's also build a particle-monopole task */
 	for(cp = &cell_pool[ci->firstchild]; cp != &cell_pool[ci->sibling]; cp = &cell_pool[cp->sibling]){
@@ -848,8 +847,8 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
       data[0] = cp-cell_pool;
       data[1] = cj-cell_pool;
       tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data,
-			   sizeof(int) * 2, ci->count + cj->count);
-      
+			   sizeof(int) * 2, cp->count * cj->count);
+
       /* Add the resource and dependance */
       qsched_addlock(s, tid, cp->res);
 //      qsched_addlock(s, tid, cj->res);
@@ -859,7 +858,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
       data[0] = cp-cell_pool;
       data[1] = ci-cell_pool;
       tid = qsched_addtask(s, task_type_pair_pc_split, task_flag_none, data,
-			   sizeof(int) * 2, ci->count + cj->count);
+			   sizeof(int) * 2, cp->count * cj->count);
       
       /* Add the resource and dependance */
       qsched_addlock(s, tid, cp->res);
@@ -869,12 +868,12 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
     {
         data[0] = ci -cell_pool;
         data[1] = cj - cell_pool;
-        tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count + cj->count);
+        tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count * cj->count);
     /* Add the resource and dependance */
       qsched_addlock(s, tid, ci->res);
         data[0] = cj - cell_pool;
         data[1] = ci - cell_pool;
-        tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count + cj->count);
+        tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(int) * 2, ci->count * cj->count);
       qsched_addlock(s, tid, cj->res);
     }
 
@@ -893,7 +892,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
       
       /* Create the task. */
       tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
-			   sizeof(int) * 2, ci->count * cj->count);
+			   sizeof(int) * 2, ci->count * cj->count * 0.1);
       
       /* Add the resources. */
       qsched_addlock(s, tid, ci->res);
@@ -1226,13 +1225,13 @@ float *comm_temp;
     if( cudaMemcpyFromSymbol( &pcs, pc_calcs, sizeof(int), 0, cudaMemcpyDeviceToHost) != cudaSuccess)
         error("Failed");
     printf("pc calcs = %i\n", pcs);
-/*struct task* tasks =  qsched_get_timers( &s , s.count );
+struct task* tasks =  qsched_get_timers( &s , s.count );
     for(i = 0; i < s.count; i++)
     {
       printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
        // printf("\n");
     
-  }*/
+  }
 
 }   
  #ifdef EXACT
@@ -1313,8 +1312,8 @@ ticks tic, toc_run ;
 int main(int argc, char *argv[]) {
     int c, nr_threads;
     int N = 1000, runs = 1;
-    char fileName[100] = {0};
-
+    char fileName[100];
+    fileName[0] = 0;
     /* Parse the options */
   while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) {
       case 'n':
diff --git a/src/CUDACompile.sh b/src/CUDACompile.sh
index 085086c..6f4804a 100755
--- a/src/CUDACompile.sh
+++ b/src/CUDACompile.sh
@@ -1,42 +1,48 @@
 #!/bin/bash
 FLAGS2="-Xcompiler=-fsanitize=address -Xcompiler=-fno-omit-frame-pointer"
 DEBUG_FLAGS="-G -DDEBUG_GPU"
-FLAGS="-O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
+FLAGS="-O3 -DSIM -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
 # -DGPU_locks -Xptxas -dlcm=cg  -Xptxas="-v"" 
 # -DNO_LOADS
+#-DSIM
+#-lprofiler
 
 CFLAGS="-fsanitize=address -fno-omit-frame-pointer"
 
-gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT qsched.lo -MD -MP -MF .deps/qsched.Tpo -c qsched.c -o qsched.o -lprofiler
-gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT qsched.lo -MD -MP -MF .deps/qsched.Tpo -c queue.c -o queue.o -lprofiler
+gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT qsched.lo -MD -MP -MF .deps/qsched.Tpo -c qsched.c -o qsched.o -lprofiler -DSIM #-fsanitize=address -fno-omit-frame-pointer
+gcc-4.8 -std=gnu99 -DHAVE_CONFIG_H -I. -I.. -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize -funroll-loops -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -fopenmp -DTIMERS -DWITH_CUDA -g -O2 -march=native -pthread -MT qsched.lo -MD -MP -MF .deps/qsched.Tpo -c queue.c -o queue.o -lprofiler -DSIM #-fsanitize=address -fno-omit-frame-pointer
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc cuda_queue.cu qsched.o queue.o -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS -dc cuda_queue.cu qsched.o queue.o -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda.a -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda.a -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -DGPU_locks -dc cuda_queue.cu qsched.o queue.o -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -DGPU_locks -dc cuda_queue.cu qsched.o queue.o -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda_locks.a -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -lib cuda_queue.o qsched.o queue.o -o .libs/libquicksched_cuda_locks.a -lprofiler
 
 cd ../examples
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda  test_gpu_simple.cu -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcuda  test_gpu_simple.cu -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_gpu_simple.o ../src/.libs/libquicksched_cuda.a -o simple_cuda_test -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -Xnvlink -v test_gpu_simple.o ../src/.libs/libquicksched_cuda.a -o simple_cuda_test -lprofiler
 
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -lf77blas -lcblas -latlas -lm  -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a test_qr.cu -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -lf77blas -lcblas -latlas -lm -I$CULA_INC_PATH -L$CULA_LIB_PATH_64 -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcula_lapack -lcuda /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a test_qr.cu -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -m64 -I../src -lf77blas -lcblas -latlas -lm  -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_qr.o ../src/.libs/libquicksched_cuda.a /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a -o qr_cuda_test -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -m64 -I../src -lf77blas -lcblas -latlas -lm -I$CULA_INC_PATH -L$CULA_LIB_PATH_64 -L/home/aidan/ATLAS/ATLAS_linux/lib/ -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcula_lapack -Xnvlink -v test_qr.o ../src/.libs/libquicksched_cuda.a /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a -o qr_cuda_test -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda  test_hierarchy.cu -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcuda  test_hierarchy.cu -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_hierarchy.o ../src/.libs/libquicksched_cuda.a -o test_heirarchy -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -Xnvlink -v test_hierarchy.o ../src/.libs/libquicksched_cuda.a -o test_heirarchy -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda  test_bh_2.cu -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcuda  test_bh_2.cu -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_2.o ../src/.libs/libquicksched_cuda.a -o test_bh_2 -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -Xnvlink -v test_bh_2.o ../src/.libs/libquicksched_cuda.a -o test_bh_2 -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda  test_bh_3.cu -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcuda  test_bh_3.cu -lprofiler
 
-/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_3.o ../src/.libs/libquicksched_cuda.a -o test_bh_3 -lprofiler
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -Xnvlink -v test_bh_3.o ../src/.libs/libquicksched_cuda.a -o test_bh_3 -lprofiler
+
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -dc -m64 -I../src -dc -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -lcudart -lcuda  test_bh_4.cu -lprofiler
+
+/home/aidan/cuda_7.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_7.0/lib -L/home/aidan/cuda_7.0/lib64 -Xnvlink -v test_bh_4.o ../src/.libs/libquicksched_cuda.a -o test_bh_4 -lprofiler
diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu
index d42e785..1e87003 100644
--- a/src/cuda_queue.cu
+++ b/src/cuda_queue.cu
@@ -80,6 +80,10 @@ __device__ int cuda_barrier = 0;
 __device__ volatile int tot_num_tasks;
 
 __device__ qsched_funtype fun;
+#ifdef SIM
+__device__ int num_loads;
+__device__ int num_tasks;
+#endif
 /**
  * @brief Get a task ID from the given queue.
  *
@@ -533,7 +537,11 @@ __global__ void qsched_device_kernel ( )
              tid = -1;
                 /* Highest priority queue, holds the unload tasks. */
 #ifndef NO_LOADS
+#ifdef SIM
+                if(cuda_queues[2].nr_avail_tasks > 0 && num_tasks <= 0)
+#else
                 if(cuda_queues[2].nr_avail_tasks > 0 )
+#endif
                 {
                     TIMER_TIC
                     tid = runner_cuda_gettask( &cuda_queues[2] );
@@ -543,7 +551,11 @@ __global__ void qsched_device_kernel ( )
 
 #ifndef NO_LOADS
              /* Low priority queue, contains the load tasks. */
+#ifdef SIM
+            if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 )
+#else
              if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 && blockIdx.x < 12 )
+#endif
              {
                 TIMER_TIC
                 tid = runner_cuda_gettask ( &cuda_queues[1]);
@@ -552,21 +564,16 @@ __global__ void qsched_device_kernel ( )
 #endif
 
              /* Middle priority queue, contains user-specifed tasks. */
-#ifdef PRIQ
-            if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 )
-            {
-                TIMER_TIC
-                tid = runner_cuda_gettask_priority ( &cuda_queues[0]);
-                TIMER_TOC(timers_queue)
-            }   
+#ifdef SIM
+            if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 && num_loads <= 0 )
 #else
              if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 )
+#endif
             {
                 TIMER_TIC
                 tid = runner_cuda_gettask ( &cuda_queues[0]);
                 TIMER_TOC(timers_queue)
             }
-#endif
             }
             
         /*Everyone wait for us to get a task id*/
@@ -601,6 +608,10 @@ __global__ void qsched_device_kernel ( )
             dest = (int*)res_cuda[d[0]].gpu_data;
             cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, tid);
             TIMER_TOC(timers_doload)
+        #ifdef SIM
+            if(threadIdx.x == 0)
+                atomicAdd(&num_loads, -1);
+        #endif
         }else if( tasks_cuda[tid].type == type_unload )
         {
             TIMER_TIC
@@ -614,7 +625,18 @@ __global__ void qsched_device_kernel ( )
            fun(tasks_cuda[tid].type , &data_cuda[tasks_cuda[tid].data]);
             __syncthreads();
             TIMER_TOC(timers_doother)
+        #ifdef SIM
+            if(threadIdx.x == 0)
+                atomicAdd(&num_tasks, -1);
+
+        }else if (tasks_cuda[tid].type == type_ghost)
+        {
+            if(threadIdx.x == 0)
+                atomicAdd(&num_tasks, -1);
+        }
+        #else
         }
+        #endif
         __syncthreads();
         /*Stop the task clock*/
         if( threadIdx.x == 0 ) 
@@ -626,6 +648,7 @@ __global__ void qsched_device_kernel ( )
             cuda_done( &tasks_cuda[tid] );
         __syncthreads();
     #endif
+#ifndef SIM
         for(i = threadIdx.x; i < tasks_cuda[tid].nr_unlocks; i += blockDim.x )
         {
             if( atomicSub( &tasks_cuda[tasks_cuda[tid].unlocks[i]].wait , 1 ) == 1 && !( tasks_cuda[tasks_cuda[tid].unlocks[i]].flags & task_flag_skip ))
@@ -639,6 +662,7 @@ __global__ void qsched_device_kernel ( )
                     cuda_queue_puttask( &cuda_queues[2] , tasks_cuda[tid].unlocks[i] );
             }
         }
+#endif
     }
 
 
@@ -1588,6 +1612,10 @@ struct queue_cuda queues[ cuda_numqueues ];
     for ( k = 0; k < qsize; k++ )
             data[k] = -1;
             
+#ifdef SIM
+    if( cudaMemcpyToSymbol(num_loads, (int*)&queues[1].count, sizeof(int),0, cudaMemcpyHostToDevice ) != cudaSuccess)
+        error("Failed to copy num_loads");
+#endif
     /* Allocate and copy the recyling data. */
     if ( cudaMalloc( &queues[1].rec_data , sizeof(int) * s->count*1.2 ) != cudaSuccess )
         error("Failed to allocate queue data on the device.");
@@ -1618,7 +1646,9 @@ struct queue_cuda queues[ cuda_numqueues ];
         if(temp[k].type == type_unload)
         {
             num_unload++;
+#ifndef SIM
             if(temp[k].wait == 0)
+#endif
                 data[queues[2].count++] = k;
         }
     }
@@ -1640,11 +1670,19 @@ struct queue_cuda queues[ cuda_numqueues ];
     queues[0].count = 0;
     for ( k = 0; k < s->count ; k++ )
     {
+#ifdef SIM
+        if(temp[k].type != type_load && temp[k].type != type_unload){
+#else
         if(temp[k].type != type_load && temp[k].type != type_unload && temp[k].wait == 0){
+#endif
             data[queues[0].count++] = k;
     }
 
     }
+#ifdef SIM
+    if(cudaMemcpyToSymbol(num_tasks, (int*)&queues[0].count, sizeof(int), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+        error("Failed to copy num_tasks");
+#endif
     queues[0].first = 0;
     queues[0].last = queues[0].count;
     queues[0].nr_avail_tasks = queues[0].last;
-- 
GitLab