From f0ab955c4be92d874ea9e13ccb33fa3645ff32bb Mon Sep 17 00:00:00 2001
From: aidan <aidan@gtx690.dur.ac.uk>
Date: Mon, 2 Mar 2015 14:02:05 +0000
Subject: [PATCH] Code cleanup

---
 examples/test_bh_2.cu       | 145 ++----------------
 examples/test_bh_3.cu       | 284 ++++++++++--------------------------
 examples/test_gpu_simple.cu |  21 +--
 examples/test_qr.cu         | 158 ++------------------
 src/CUDACompile.sh          |   6 +-
 src/cuda_queue.cu           | 279 +++--------------------------------
 src/qsched.h                |   3 +-
 7 files changed, 126 insertions(+), 770 deletions(-)

diff --git a/examples/test_bh_2.cu b/examples/test_bh_2.cu
index 0cdae97..66ecbf5 100644
--- a/examples/test_bh_2.cu
+++ b/examples/test_bh_2.cu
@@ -40,6 +40,7 @@ extern "C"{
 }
 #include "cuda_queue.h"
 
+
 /** Task types. */
 enum task_type {
   task_type_self = 0,
@@ -59,12 +60,12 @@ unsigned short int split, sorted;
 int parts, firstchild, sibling;
 int res, resz, resm, com_tid;
 
-};//__attribute__((aligned(64)));
+};
 
 
 #define const_G 1
 /* Requred variables to obtain cells. */
-#define cell_maxparts 128
+#define cell_maxparts 256
 #define CELL_STRETCH 2
 #define INITIAL_CELLS 256
 struct cell *cell_pool = NULL;
@@ -101,20 +102,7 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
     int count_i = ci->count, count_j = cj->count;
     int parts_i = ci->parts, parts_j = cj->parts;
     double xi[3];
-    float dx[3], ai[3], mi, mj, r2, w, ir;
-    __shared__ double2 parts_xy[cell_maxparts];
-    __shared__ double parts_z[cell_maxparts];
-    __shared__ float4 parts_am[cell_maxparts];
-    /*if(threadIdx.x == 0)
-    printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split, 
-            cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
-
-    /* Load particles of cell j into shared memory */
-    /*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
-        parts_xy[j] = parts_pos_xy[k];
-        parts_z[j] = parts_pos_z[k];
-        parts_am[j] = parts_a_m[k];
-    }*/
+    float dx[3], ai[3], mj, r2, w, ir;
 
     /* Loop over cell i.*/
     for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
@@ -124,7 +112,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
         for(k = 0; k < 3; k++) {
             ai[k] = 0.0f;
         }
-        mi = parts_a_m[i].w;
         
         for(j = parts_j; j < parts_j + count_j; j++) {
             r2 = 0.0f;
@@ -136,16 +123,12 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
             r2 += dx[2] * dx[2];
 
 
-    //        ir = 1.0f / sqrtf(r2);
             ir = rsqrtf(r2);
             w = const_G * ir * ir * ir;
             mj = parts_a_m[j].w;
             for(k = 0; k < 3; k++) {
                 ai[k] -= dx[k] * mj * w;
             }
-   //         atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
-    //        atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
-     //       atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
         }            
        atomicAdd(&parts_a_m[i].x, ai[0]);           
        atomicAdd(&parts_a_m[i].y, ai[1]);
@@ -154,11 +137,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
     }
 
     /* Load particles of cell i into shared memory */
-    /*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
-        parts_xy[j] = parts_pos_xy[k];
-        parts_z[j] = parts_pos_z[k];
-        parts_am[j] = parts_a_m[k];
-    }*/
  /*Loop over cell j. */
     for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
         xi[0] = parts_pos_xy[i].x;
@@ -167,7 +145,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
         for(k = 0; k < 3; k++) {
             ai[k] = 0.0f;
         }
-        mi = parts_a_m[i].w;
         
         for(j = parts_i; j < parts_i + count_i; j++) {
             r2 = 0.0f;
@@ -196,29 +173,15 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
 
 __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
 
-    int i, k;
+    int i;
     double2 j_com_xy;
     double j_com_z;
     float j_com_mass;
     int count = leaf->count;    
     int parts = leaf->parts;
     int cell_j = cj - cells;
-    int temp;
     float r2, dx[3], ir, w;
 
-  //  if(cell_j < 0)
-//    {
-//        if(threadIdx.x == 0)
-  //          printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
-  //      __syncthreads();
-//        asm("trap;");
-//    }
-
- //   if(threadIdx.x == 0)
- //       printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
-
-
-    temp = cell_j;
 
     /* Init the com's data.*/
     j_com_xy = com_xy[cell_j];
@@ -236,18 +199,10 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
     
         ir = rsqrtf(r2);
         w = j_com_mass * const_G * ir * ir * ir;
-       /* __threadfence();
-        if(!isfinite(w * dx[0])){
-            printf("Error in make_interact_pc, j_com_mass = %f, cell_j = %i, temp = %i, i = %i, threadIdx.x=%i\n", j_com_mass, cell_j, temp, i, threadIdx.x); asm("trap;");}
-        if(!isfinite(w * dx[1])){
-            printf("Error in make_interact_pc\n"); asm("trap;");}
-        if(!isfinite(w * dx[2])){
-            printf("Error in make_interact_pc\n"); asm("trap;");}*/
         atomicAdd( &parts_a_m[i].x , w * dx[0]);
         atomicAdd( &parts_a_m[i].y , w * dx[1]);
         atomicAdd( &parts_a_m[i].z , w * dx[2]);
     }
-//__syncthreads();
 }
 
 /**
@@ -255,7 +210,6 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
  */
 __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
 
-    int k;
     float dx[3];
     double cih = ci->h, cjh = cj->h;
     
@@ -274,7 +228,7 @@ __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, st
 }
 
 __device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
-    int k;
+
     float dx[3];
     float min_dist = ci->h;
     float center_i = ci->loc_xy.x;
@@ -296,13 +250,6 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
 __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
 
     struct cell *cp ,*cps;
-    int leafnum = leaf - cells;
-//if(threadIdx.x == 0 && leafnum == 23)
-  //      printf("cj = %i\n", cj - cells);
-//    printf("%i\n", leafnum);
-
- //   if(threadIdx.x == 0)
- ///       printf("ci = %i, cj = %i, leaf = %i\n", ci - cells, cj - cells, leaf - cells);
 
     for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
         if(is_inside(leaf, cp)) break;
@@ -318,16 +265,12 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
                 }
             } else {
                 make_interact_pc(leaf, cps);
-             //   if(threadIdx.x == 0 && leafnum == 23)
-               //         printf("leafnum = %i with cps = %i here\n", leafnum, cps - cells);
                 __syncthreads();
             }
         }
     }else{
 
         for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
-         //   if(threadIdx.x == 0 && leafnum == 23)
-           //             printf("leafnum = %i with cps = %i\n", leafnum, cps - cells);
             make_interact_pc(leaf, cps);
         }
 
@@ -347,17 +290,6 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
 
     struct cell *cp, *cps;
 
-    /*if(leaf->split)
-    {
-        printf("Leaf split = 1, oh dear.");
-        asm("trap;");
-    }
-    if(c->split > 1)
-    {
-        printf("Cell had split > 1\n");
-        asm("trap;");
-    }*/
-
     /* Find the subcell of c the leaf is in.*/
 
     /*cp = c;
@@ -402,31 +334,13 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
 __device__ void iact_self_direct(int cellID) {
     struct cell *c = &cells[cellID];
     double xi[3] = {0.0, 0.0, 0.0};
-    float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
-    __shared__ double2 parts_xy[cell_maxparts];
-    __shared__ double parts_z[cell_maxparts];
-    __shared__ float4 parts_am[cell_maxparts];
+    float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
     int parts;
     int count;
     int i,j,k;
     
-    //if(threadIdx.x == 0)
-     //   printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
-    //If cell is split, interact each child with itself, and with each of its siblings.
-    /*if(c->split) {
-        //TODO
-
-    } else {*/
         parts = c->parts;
         count = c->count;
-        int z = threadIdx.x;
-        /* Load particle data into shared memory*/
-        /*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
-            parts_xy[z] = parts_pos_xy[k];
-            parts_z[z] = parts_pos_z[k];
-            parts_am[z] = parts_a_m[k];
-        }
-        __syncthreads();*/
         for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
         {
             xi[0] = parts_pos_xy[i].x;
@@ -435,9 +349,7 @@ __device__ void iact_self_direct(int cellID) {
             for(k = 0; k < 3; k++) {
                 ai[k] = 0.0;
             }
-            mi = parts_a_m[i].w;
             
-            //for(j = i+1; j!= i; j = (j+1)%count) 
             for(j = parts; j < parts+count; j++)
             {
                 if(i != j){
@@ -453,7 +365,6 @@ __device__ void iact_self_direct(int cellID) {
 
                     /* Apply the gravitational acceleration. */
 
-                    //ir = 1.0f / sqrtf(r2);
                     ir = rsqrtf(r2);
                     w = const_G * ir * ir * ir;
                     mj = parts_a_m[j].w;
@@ -479,7 +390,7 @@ __device__ void iact_self_direct(int cellID) {
  */
 static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
 
-  int k;
+//  int k;
   float dx[3];
 
 #ifdef SANITY_CHECKS
@@ -508,7 +419,7 @@ static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
 
 struct cell *cell_get()
 {
-    struct cell *res;
+//    struct cell *res;
     
     if(num_cells == 0)
     {
@@ -645,9 +556,7 @@ void cell_split(int c, struct qsched *s) {
     int left[8], right[8];
     double pivot[3];
     static int root = -1;
-//    struct cell *progenitors[8];
     int progenitors[8];
-    int c1 = c;
     double2 *temp_xy;
     double *temp_z;
     float4 *temp_a_m;
@@ -662,7 +571,6 @@ void cell_split(int c, struct qsched *s) {
     {
         if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[c].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
-//        printf("tempxy = %p\n", temp_xy);
         cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
                                         sizeof(double2) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts);
     }
@@ -680,7 +588,6 @@ void cell_split(int c, struct qsched *s) {
         cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m,
                                         sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts);
     }
-       // error("Cell has no resource");*///TODO
 
     if(count > cell_maxparts )
     {
@@ -836,13 +743,9 @@ void cell_split(int c, struct qsched *s) {
     /* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */   
     } else {
 
-//    struct cell *data[2] = {root, c};
     int data[2] = {root, c};
         int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
                                  2 * sizeof(int), 3000);
-        /*qsched_adduse(s, tid, cell_pool[root].res);
-        qsched_adduse(s, tid, cell_pool[root].resz);
-        qsched_adduse(s, tid, cell_pool[root].resm);*/
         qsched_addlock(s, tid, cell_pool[c].res);
         qsched_addlock(s, tid, cell_pool[c].resz);
         qsched_addlock(s, tid, cell_pool[c].resm);
@@ -864,8 +767,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
     
     qsched_task_t tid;
     int data[2];
-    struct cell /**data[2],*/ *cp, *cps;
-    int cpi;
+    struct cell *cp, *cps;
     
     
     if(cj == NULL)
@@ -967,11 +869,8 @@ qsched_funtype func;
 void test_bh(int N, int runs, char *fileName) {
   int i, k;
   struct cell *root;
-  struct part *parts;
   FILE *file;
   struct qsched s;
-  ticks tic, toc_run, tot_setup = 0, tot_run = 0;
-  int countMultipoles = 0, countPairs = 0, countCoMs = 0;
   struct cell *gpu_ptr_cells;
 
   cudaFree(0);
@@ -1142,23 +1041,6 @@ float *comm_temp;
     if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
         error("Failed to copy com pointer to the GPU");
 
-   /* for(i = 0; i < s.count; i++){
-        int *idata = (int*)&s.data[s.tasks[i].data];
-        if(s.tasks[i].type == task_type_self)
-            printf("Self task with data[0] = %i\n", idata[0]);
-        if(s.tasks[i].type == task_type_pair) {
-            printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
-        }
-        if(s.tasks[i].type == task_type_self_pc) {
-            printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
-        }
-    }*/
-        
-      //  printf("com_mass_host[152] = %f\n", com_mass_host[152]);
-        
-
-        //Run code.
-//        printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
         qsched_run_CUDA( &s , func );
         qsched_print_cuda_timers(&s);
 
@@ -1169,16 +1051,12 @@ struct task* tasks =  qsched_get_timers( &s , s.count );
     for(i = 0; i < s.count; i++)
     {
       printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
-       // printf("\n");
     
   }
 
 }   
   /* Dump the particles to a file */
   file = fopen("particle_dump.dat", "w");
-/*  fprintf(file,
-          "# ID m x y z a_exact.x   a_exact.y    a_exact.z    a_legacy.x    "
-          "a_legacy.y    a_legacy.z    a_new.x     a_new.y    a_new.z\n");*/
   for (k = 0; k < N; ++k)
     fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
             parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
@@ -1228,9 +1106,6 @@ int main(int argc, char *argv[]) {
   /* Tree node information */
   printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
 
-  /* Part information */
-//  printf("Size of part: %zu bytes.\n", sizeof(struct part));
-
   /* Dump arguments. */
   if (fileName[0] == 0) {
     message("Computing the N-body problem over %i random particles using %i "
diff --git a/examples/test_bh_3.cu b/examples/test_bh_3.cu
index a1adfbd..7b3df04 100644
--- a/examples/test_bh_3.cu
+++ b/examples/test_bh_3.cu
@@ -39,6 +39,9 @@ extern "C"{
 #include "res.h"
 }
 #include "cuda_queue.h"
+//#define double float
+//#define double2 float2
+
 
 /** Task types. */
 enum task_type {
@@ -57,20 +60,19 @@ double h;
 int count;
 unsigned short int split, sorted;
 int parts, firstchild, sibling;
-int res, /*resz, resm,*/ com_tid;
-
-};//__attribute__((aligned(64)));
+int res, com_tid;
 
+};
 struct part{
 double loc[3];
 float a[3];
 float m;
-};//__attribute__((aligned(64)));
+};
 
 
 #define const_G 1
 /* Requred variables to obtain cells. */
-#define cell_maxparts 256
+#define cell_maxparts 64
 #define CELL_STRETCH 2
 #define INITIAL_CELLS 256
 struct cell *cell_pool = NULL;
@@ -79,9 +81,6 @@ int num_cells = 0;
 int cell_size = INITIAL_CELLS*sizeof(struct cell);
 
 /* Device locations for the particle values. */
-//__device__ double2 *parts_pos_xy;
-//__device__ double *parts_pos_z;
-//__device__ float4 *parts_a_m;
 __device__ double2 *com_xy;
 __device__ double *com_z;
 __device__ float *com_mass;
@@ -90,9 +89,6 @@ __device__ struct cell *cells;
 
 
 /* Host locations for the particle values. */
-//double2 *parts_pos_xy_host;
-//double *parts_pos_z_host;
-//float4 *parts_a_m_host;
 double2 *com_xy_host;
 double *com_z_host;
 float *com_mass_host;
@@ -109,20 +105,7 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
     int count_i = ci->count, count_j = cj->count;
     int parts_i = ci->parts, parts_j = cj->parts;
     double xi[3];
-    float dx[3], ai[3], mi, mj, r2, w, ir;
-    __shared__ double2 parts_xy[cell_maxparts];
-    __shared__ double parts_z[cell_maxparts];
-    __shared__ float4 parts_am[cell_maxparts];
-    /*if(threadIdx.x == 0)
-    printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split, 
-            cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
-
-    /* Load particles of cell j into shared memory */
-    /*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
-        parts_xy[j] = parts_pos_xy[k];
-        parts_z[j] = parts_pos_z[k];
-        parts_am[j] = parts_a_m[k];
-    }*/
+    float dx[3], ai[3], mj, r2, w, ir;
 
     /* Loop over cell i.*/
     for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
@@ -132,7 +115,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
         for(k = 0; k < 3; k++) {
             ai[k] = 0.0f;
         }
-        mi = parts_cuda[i].m;
         
         for(j = parts_j; j < parts_j + count_j; j++) {
             r2 = 0.0f;
@@ -144,16 +126,12 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
             r2 += dx[2] * dx[2];
 
 
-    //        ir = 1.0f / sqrtf(r2);
             ir = rsqrtf(r2);
             w = const_G * ir * ir * ir;
             mj = parts_cuda[j].m;
             for(k = 0; k < 3; k++) {
                 ai[k] -= dx[k] * mj * w;
             }
-   //         atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
-    //        atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
-     //       atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
         }            
        atomicAdd(&parts_cuda[i].a[0], ai[0]);           
        atomicAdd(&parts_cuda[i].a[1], ai[1]);
@@ -162,11 +140,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
     }
 
     /* Load particles of cell i into shared memory */
-    /*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
-        parts_xy[j] = parts_pos_xy[k];
-        parts_z[j] = parts_pos_z[k];
-        parts_am[j] = parts_a_m[k];
-    }*/
  /*Loop over cell j. */
     for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
         xi[0] = parts_cuda[i].loc[0];
@@ -175,7 +148,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
         for(k = 0; k < 3; k++) {
             ai[k] = 0.0f;
         }
-        mi = parts_cuda[i].m;
         
         for(j = parts_i; j < parts_i + count_i; j++) {
             r2 = 0.0f;
@@ -202,53 +174,30 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
 
 }
 
-__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
+__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj, float3 *accels) {
 
-    int i, k;
-    double2 j_com_xy;
-    double j_com_z;
-    float j_com_mass;
+    int i;
     int count = leaf->count;    
     int parts = leaf->parts;
     int cell_j = cj - cells;
-    int temp;
     float r2, dx[3], ir, w;
 
-  //  if(cell_j < 0)
-//    {
-//        if(threadIdx.x == 0)
-  //          printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
-  //      __syncthreads();
-//        asm("trap;");
-//    }
-
- //   if(threadIdx.x == 0)
- //       printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
-
-
-    temp = cell_j;
-
-    /* Init the com's data.*/
-    j_com_xy = com_xy[cell_j];
-    j_com_z = com_z[cell_j];
-    j_com_mass = com_mass[cell_j];
 
     for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
         r2 = 0.0;
-        dx[0] = j_com_xy.x - parts_cuda[i].loc[0];
+        dx[0] = com_xy[cell_j].x - parts_cuda[i].loc[0];
         r2 += dx[0] * dx[0];
-        dx[1] = j_com_xy.y - parts_cuda[i].loc[1];
+        dx[1] = com_xy[cell_j].y - parts_cuda[i].loc[1];
         r2 += dx[1] * dx[1];
-        dx[2] = j_com_z - parts_cuda[i].loc[2];
+        dx[2] = com_z[cell_j] - parts_cuda[i].loc[2];
         r2 += dx[2] * dx[2];
     
         ir = rsqrtf(r2);
-        w = j_com_mass * const_G * ir * ir * ir;
-        atomicAdd( &parts_cuda[i].a[0] , w * dx[0]);
-        atomicAdd( &parts_cuda[i].a[1] , w * dx[1]);
-        atomicAdd( &parts_cuda[i].a[2] , w * dx[2]);
+        w = com_mass[cell_j] * const_G * ir * ir * ir;
+        (*accels).x+= w*dx[0];
+        (*accels).y+= w*dx[1];
+        (*accels).z+= w*dx[2];
     }
-//__syncthreads();
 }
 
 /**
@@ -256,7 +205,6 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
  */
 __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
 
-    int k;
     float dx[3];
     double cih = ci->h, cjh = cj->h;
     
@@ -275,7 +223,6 @@ __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, st
 }
 
 __device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
-    int k;
     float dx[3];
     float min_dist = ci->h;
     float center_i = ci->loc_xy.x;
@@ -294,7 +241,7 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
     return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count);
 }
 
-__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
+__device__ __forceinline__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf, float3 *accels) {
 
     struct cell *cp ,*cps;
 
@@ -311,8 +258,9 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
                 make interact pc                    
 
     */
-    struct cell *loopend;
+   struct cell *loopend;
     cp = ci;
+    #pragma unroll
     while(cp->split)
     {
         for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
@@ -322,26 +270,32 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
         if(are_neighbours_different_size(cp, cj)) {
             loopend = &cells[cj->sibling];
             cps = &cells[cj->firstchild];
+            #pragma unroll
             while(cps != loopend)
             {
-                if(!are_neighbours(cp, cps)){
-                    make_interact_pc(leaf, cps);
-                    //__syncthreads();
-                    cps = &cells[cps->sibling];
-                }else{
-                    if(cps->split)
-                        cps = &cells[cps->firstchild];
-                    else
+               // if(are_neighbours_different_size(cps, cj)){
+                   if(!are_neighbours(cp, cps)){
+                        make_interact_pc(leaf, cps, accels);
                         cps = &cells[cps->sibling];
-                }
+                    }else{
+                        if(cps->split)
+                            cps = &cells[cps->firstchild];
+                        else
+                            cps = &cells[cps->sibling];
+                    }
+
+//                }else{
+  //                  struct cell *temp;
+    //                for(temp = &cells[cps->firstchild]; temp != &cells[cps->sibling]; temp = &cells[temp->sibling])
+      //                  make_interact_pc(leaf, temp, accels);
+        //            cps = &cells[cps->sibling];
+          //      }
 
             }   
 
         }else{
             for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
-                make_interact_pc(leaf, cps);
-               // __syncthreads();
-                
+                make_interact_pc(leaf, cps, accels);
             }
             break;
         }
@@ -354,7 +308,7 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
     
 
 
-  /* for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
+ /*  for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
         if(is_inside(leaf, cp)) break;
     }
 
@@ -363,11 +317,11 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
         for(cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
 
             if(!are_neighbours(cp, cps)) {
-                make_interact_pc(leaf, cps);
+                make_interact_pc(leaf, cps, accels);
                 __syncthreads();
             } else {
                 if(cp->split && cps->split) {
-                    iact_pair_pc(cp, cps, leaf);
+                    iact_pair_pc(cp, cps, leaf, accels);
                 }
                 
             }
@@ -375,12 +329,11 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
     }else{
 
         for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
-            make_interact_pc(leaf, cps);
+            make_interact_pc(leaf, cps, accels);
         }
 
-    }*/
-
-//    __syncthreads();
+    }
+*/
 }   
 
 /**
@@ -390,25 +343,21 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
  * @param c The #cell containing the monopoles
  * @param leaf The #cell containing the particles
  */
-__device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
+__device__ __forceinline__ void iact_self_pc(struct cell *c, struct cell *leaf) {
 
     struct cell *cp, *cps;
+    float3 accelerations;
 
-    /*if(leaf->split)
-    {
-        printf("Leaf split = 1, oh dear.");
-        asm("trap;");
-    }
-    if(c->split > 1)
-    {
-        printf("Cell had split > 1\n");
-        asm("trap;");
-    }*/
+    int i;
+    accelerations.x = 0.0f;
+    accelerations.y = 0.0f;
+    accelerations.z = 0.0f;
 
     /* Find the subcell of c the leaf is in.*/
 
     cp = c;
     cps = c;
+    #pragma unroll
     while(c->split)
     {
         for(cp = &cells[cp->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]){
@@ -417,12 +366,21 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
         if(cp->split){
             for(cps = &cells[c->firstchild]; cps != &cells[c->sibling]; cps = &cells[cps->sibling]) {
                 
-                if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
+                if(cp != cps && cps->split) iact_pair_pc(cp, cps, leaf, &accelerations);
             }
         }
         c = cp;
     }
 
+    int parts = leaf->parts;
+    int count = leaf->count;
+
+    for(i = parts+threadIdx.x; i < parts+count; i+=blockDim.x) {
+        atomicAdd( &parts_cuda[i].a[0] , accelerations.x);
+        atomicAdd( &parts_cuda[i].a[1] , accelerations.y);
+        atomicAdd( &parts_cuda[i].a[2] , accelerations.z);
+    }
+
     /*for( cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
         if(is_inside(leaf, cp)) break;
     }
@@ -446,34 +404,16 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
  *
  * @param cellID The cell ID to compute interactions on.
  */
-__device__ void iact_self_direct(int cellID) {
+__device__ __forceinline__ void iact_self_direct(int cellID) {
     struct cell *c = &cells[cellID];
     double xi[3] = {0.0, 0.0, 0.0};
-    float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
-    __shared__ double2 parts_xy[cell_maxparts];
-    __shared__ double parts_z[cell_maxparts];
-    __shared__ float4 parts_am[cell_maxparts];
+    float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
     int parts;
     int count;
     int i,j,k;
     
-    //if(threadIdx.x == 0)
-     //   printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
-    //If cell is split, interact each child with itself, and with each of its siblings.
-    /*if(c->split) {
-        //TODO
-
-    } else {*/
         parts = c->parts;
         count = c->count;
-        int z = threadIdx.x;
-        /* Load particle data into shared memory*/
-        /*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
-            parts_xy[z] = parts_pos_xy[k];
-            parts_z[z] = parts_pos_z[k];
-            parts_am[z] = parts_a_m[k];
-        }
-        __syncthreads();*/
         for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
         {
             xi[0] = parts_cuda[i].loc[0];
@@ -482,9 +422,7 @@ __device__ void iact_self_direct(int cellID) {
             for(k = 0; k < 3; k++) {
                 ai[k] = 0.0;
             }
-            mi = parts_cuda[i].m;
             
-            //for(j = i+1; j!= i; j = (j+1)%count) 
             for(j = parts; j < parts+count; j++)
             {
                 if(i != j){
@@ -500,7 +438,6 @@ __device__ void iact_self_direct(int cellID) {
 
                     /* Apply the gravitational acceleration. */
 
-                    //ir = 1.0f / sqrtf(r2);
                     ir = rsqrtf(r2);
                     w = const_G * ir * ir * ir;
                     mj = parts_cuda[j].m;
@@ -526,7 +463,7 @@ __device__ void iact_self_direct(int cellID) {
  */
 static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
 
-  int k;
+//  int k;
   float dx[3];
 
 #ifdef SANITY_CHECKS
@@ -555,7 +492,6 @@ static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
 
 struct cell *cell_get()
 {
-    struct cell *res;
     
     if(num_cells == 0)
     {
@@ -616,8 +552,6 @@ struct cell *cell_get()
     cell_pool[used_cells-1].sibling = -1;
     cell_pool[used_cells-1].firstchild = -1;
     cell_pool[used_cells-1].res = qsched_res_none;
-    //cell_pool[used_cells-1].resz = qsched_res_none;
-    //cell_pool[used_cells-1].resm = qsched_res_none;
     return &cell_pool[used_cells-1];
 }
 
@@ -686,19 +620,12 @@ void cell_split(int c, struct qsched *s) {
     int i, j, k, kk, count = cell_pool[c].count;
     int parts = cell_pool[c].parts;
     struct part temp_part;
-    double2 tempxy;
-    double tempxy1;
-    float4 tempxy2;
     struct cell *cp, *cell;
     int left[8], right[8];
     double pivot[3];
     static int root = -1;
-//    struct cell *progenitors[8];
     int progenitors[8];
-    int c1 = c;
     struct part *temp_xy;
-    double *temp_z;
-    float4 *temp_a_m;
 
     /* Set the root cell. */
     if (root < 0) {
@@ -710,25 +637,9 @@ void cell_split(int c, struct qsched *s) {
     {
         if( cudaHostGetDevicePointer(&temp_xy, &parts_host[cell_pool[c].parts], 0) != cudaSuccess )
             error("Failed to get host device pointer.");
-//        printf("tempxy = %p\n", temp_xy);
         cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
                                         sizeof(struct part) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts);
     }
-    /*if(cell_pool[c].resz == qsched_res_none)
-    {
-        if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess )
-            error("Failed to get host device pointer.");
-        cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z,
-                                        sizeof(double) * cell_pool[c].count, parts_pos_z_temp + cell_pool[c].parts);
-    }
-    if(cell_pool[c].resm == qsched_res_none)
-    {
-        if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess )
-            error("Failed to get host device pointer.");
-        cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m,
-                                        sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts);
-    }*/
-       // error("Cell has no resource");*///TODO
 
     if(count > cell_maxparts )
     {
@@ -818,14 +729,6 @@ void cell_split(int c, struct qsched *s) {
             error("Failed to get host device pointer.");
         cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy,
                                         sizeof(struct part) * cell_pool[progenitors[k]].count, parts_temp + cell_pool[progenitors[k]].parts);
-        /*if( cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
-            error("Failed to get host device pointer.");
-        cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z,
-                                        sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z_temp + cell_pool[progenitors[k]].parts);
-        if( cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess )
-            error("Failed to get host device pointer.");
-        cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m,
-                                        sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m_temp + cell_pool[progenitors[k]].parts);*/
         }
 
         /* Find the first non-empty progenitor */
@@ -870,12 +773,7 @@ void cell_split(int c, struct qsched *s) {
     int data[2] = {root, c};
         int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
                                  2 * sizeof(int), 3000);
-        /*qsched_adduse(s, tid, cell_pool[root].res);
-        qsched_adduse(s, tid, cell_pool[root].resz);
-        qsched_adduse(s, tid, cell_pool[root].resm);*/
         qsched_addlock(s, tid, cell_pool[c].res);
-     //   qsched_addlock(s, tid, cell_pool[c].resz);
-      //  qsched_addlock(s, tid, cell_pool[c].resm);
     }
 
 #ifndef COM_AS_TASK
@@ -894,8 +792,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
     
     qsched_task_t tid;
     int data[2];
-    struct cell /**data[2],*/ *cp, *cps;
-    int cpi;
+    struct cell *cp, *cps;
     
     
     if(cj == NULL)
@@ -917,8 +814,6 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
             data[1] = -1;
             tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(int)*2, 2);
             qsched_addlock(s, tid, ci->res);
-            //qsched_addlock(s, tid, ci->resz);
-            //qsched_addlock(s, tid, ci->resm);
         }
     }
     /* Else its a pair!*/
@@ -946,11 +841,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
 
                 /* Add the resources. */
                 qsched_addlock(s, tid, ci->res);
-              //  qsched_addlock(s, tid, ci->resz);
-               // qsched_addlock(s, tid, ci->resm);
                 qsched_addlock(s, tid, cj->res);
-               // qsched_addlock(s, tid, cj->resz);
-               // qsched_addlock(s, tid, cj->resm);
             }
         }
 
@@ -961,7 +852,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
 }
 
 
-__device__ void runner( int type , void *data ) {
+__device__ __forceinline__ void runner( int type , void *data ) {
         
         int *idata = (int *)data;
         int i = idata[0];
@@ -977,6 +868,7 @@ __device__ void runner( int type , void *data ) {
                 iact_self_pc( &cells[i], &cells[j] );
                 break;
             default:
+                printf("Got to default?\n");
                 asm("trap;");
         }
     __syncthreads();
@@ -997,14 +889,12 @@ qsched_funtype func;
 void test_bh(int N, int runs, char *fileName) {
   int i, k;
   struct cell *root;
-  struct part *parts;
   FILE *file;
   struct qsched s;
-  ticks tic, toc_run, tot_setup = 0, tot_run = 0;
-  int countMultipoles = 0, countPairs = 0, countCoMs = 0;
   struct cell *gpu_ptr_cells;
 
   cudaFree(0);
+    cudaThreadSetCacheConfig(cudaFuncCachePreferL1);
     if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
         error("Failed to copy function pointer from device");
 
@@ -1068,10 +958,6 @@ void test_bh(int N, int runs, char *fileName) {
             number++;
             if(cell_pool[c].res == qsched_res_none)
                message("cell %i has no res", c);
-            //if(cell_pool[c].resz == qsched_res_none)
-             //  message("cell %i has no resz", c);
-            //if(cell_pool[c].resm == qsched_res_none)
-             //  message("cell %i has no resm", c);
         }
         if(!cell_pool[c].split) {
             nr_leaves++;
@@ -1091,10 +977,6 @@ void test_bh(int N, int runs, char *fileName) {
     message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
     message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
     
-   /* for(k = 0; k < num_cells; k++)
-        if(cell_pool[k].split > 1 ) 
-            printf("Split > 1\n");*/
-
     create_tasks(&s, root, NULL);    
 
     int self = 0, pair = 0, pc = 0;
@@ -1160,36 +1042,19 @@ float *comm_temp;
     if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
         error("Failed to copy com pointer to the GPU");
 
-   /* for(i = 0; i < s.count; i++){
-        int *idata = (int*)&s.data[s.tasks[i].data];
-        if(s.tasks[i].type == task_type_self)
-            printf("Self task with data[0] = %i\n", idata[0]);
-        if(s.tasks[i].type == task_type_pair) {
-            printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
-        }
-        if(s.tasks[i].type == task_type_self_pc) {
-            printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
-        }
-    }*/
-        
-      //  printf("com_mass_host[152] = %f\n", com_mass_host[152]);
-        
-
-        //Run code.
-//        printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
         qsched_run_CUDA( &s , func );
         qsched_print_cuda_timers(&s);
 
     k = 0;
         printf("%e, %e, %e, %e, %e, %e, %e\n", parts_host[k].m, parts_host[k].loc[0], parts_host[k].loc[1], parts_host[k].loc[2],
             parts_host[k].a[0], parts_host[k].a[1], parts_host[k].a[2]);
-struct task* tasks =  qsched_get_timers( &s , s.count );
-    for(i = 0; i < s.count; i++)
-    {
-      printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
+//struct task* tasks =  qsched_get_timers( &s , s.count );
+//    for(i = 0; i < s.count; i++)
+//    {
+//      printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
        // printf("\n");
     
-  }
+//  }
 
 }   
   /* Dump the particles to a file */
@@ -1224,7 +1089,6 @@ int main(int argc, char *argv[]) {
       case 't':
         if (sscanf(optarg, "%d", &nr_threads) != 1)
           error("Error parsing number of threads.");
-//        omp_set_num_threads(nr_threads);
         break;
       case 'f':
         if (sscanf(optarg, "%s", &fileName[0]) != 1)
@@ -1247,7 +1111,7 @@ int main(int argc, char *argv[]) {
   printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
 
   /* Part information */
-//  printf("Size of part: %zu bytes.\n", sizeof(struct part));
+  printf("Size of part: %zu bytes.\n", sizeof(struct part));
 
   /* Dump arguments. */
   if (fileName[0] == 0) {
diff --git a/examples/test_gpu_simple.cu b/examples/test_gpu_simple.cu
index 066b53b..0924717 100644
--- a/examples/test_gpu_simple.cu
+++ b/examples/test_gpu_simple.cu
@@ -67,7 +67,6 @@ __device__ void quarter(float *data)
 
 __global__ void Manual(float *src)
 {
-    int i;
     int datas = blockIdx.x;
     cuda_data[datas*1000+threadIdx.x] = src[datas*1000+threadIdx.x];
     
@@ -109,13 +108,13 @@ __global__ void Setup()
 
 
 int main ( int argc , char *argv[] ) {
-    float *array, *cuda_array, *cuda_array2, *device_array;
+    float *array, *cuda_array, *device_array;
     int i, k=0;
     qsched_funtype func;
     struct qsched s;
     qsched_task_t *tid;
     qsched_res_t *rid;
-    ticks tic, toc_run, tot_setup, tot_run = 0;
+    ticks tic, toc_run, tot_run = 0;
     qsched_init( &s , 1 , qsched_flag_none );
     cudaDeviceReset();
     cudaSetDevice(0);
@@ -134,11 +133,6 @@ int main ( int argc , char *argv[] ) {
         array[i] = (float)i;
 }
 
-       // if(cudaHostGetDevicePointer(&cuda_array , &array[k*1000] , 0) != cudaSuccess)
-        //error("Failed to get device pointer for data: %s", cudaGetErrorString(cudaPeekAtLastError()));
-
-//    if ( cudaMallocHost(&array, M*sizeof(float)) != cudaSuccess )
-  //      error("Failed to allocate array");
 
  /* Initialize the scheduler. */
 
@@ -193,16 +187,9 @@ printf("Starting second run\n");
 printf("Second run complete\n");
 
     struct task *completed_tasks = qsched_get_timers( &s, s.count );
-/*    tic = getticks();
-    qsched_run_CUDA( &s , func );
-    toc_run = getticks(); 
-	message( "qsched_run_CUDA took %lli ticks..." , toc_run - tic );
-    tot_run += toc_run - tic;*/
 
 
 
-//    if( cudaFreeHost( array) != cudaSuccess)
-  //      error("Failed to free array");
 
 cudaDeviceReset();
 
@@ -215,8 +202,6 @@ cudaDeviceReset();
         error("Failed to get device pointer for data: %s",cudaGetErrorString(cudaPeekAtLastError()));
     if( cudaMalloc(&device_array , M*sizeof(float) ) != cudaSuccess )
         error("Failed to allocate device array: %s", cudaGetErrorString(cudaPeekAtLastError()));
-    /*if( cudaMemcpy( &device_array , array , M*sizeof(float), cudaMemcpyHostToDevice ) != cudaSuccess )
-        error("Failed to copy device array: %s", cudaGetErrorString(cudaPeekAtLastError()));*/
     if( cudaMemcpyToSymbol( cuda_data , &device_array,sizeof(float*), 0 ,  cudaMemcpyHostToDevice) != cudaSuccess )
         error("Failed to copy array pointer to device: %s", cudaGetErrorString(cudaPeekAtLastError()));
     tic = getticks();
@@ -229,6 +214,4 @@ cudaDeviceReset();
         if(array[i] != ((float)i)*((float)i)*0.5f)
             printf("%i wrong, %.3f != %.3f\n", i, array[i], ((float)i)*((float)i)*0.5f );
 
-    //printf("%.3f\n", array[2]);
-
 }
diff --git a/examples/test_qr.cu b/examples/test_qr.cu
index b93b68d..f195f3a 100644
--- a/examples/test_qr.cu
+++ b/examples/test_qr.cu
@@ -22,11 +22,8 @@ extern "C"{
 int g_size;
 enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT} ;
 
-//#define TID threadIdx.x
 #define numthreads 128
-//#define tilesize 32
 #define cuda_maxtasks 1000000
-//#define CO(x,y,ldm) (((y)*ldm) + (x))
 #define WARPS 4
 
 __device__ float *GPU_matrix;
@@ -145,7 +142,7 @@ float* generateColumnMatrix(int size, unsigned long int m_z)
 {
     float* matrix;
     
-    cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size/**size*/);
+    cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size);
     if(code != cudaSuccess)
         printf("%s size = %i g_size = %i\n", cudaGetErrorString(code),size, g_size);
        else
@@ -211,8 +208,6 @@ __device__ inline void reduceSumMultiWarp( float* value )
     }
     __threadfence();
     *value = stuff[group * 32];
-        //printf("Not supported.\n");
-        //asm("trap;");
     #endif
 
 }
@@ -236,7 +231,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
     int i, j;
     float norm=0.0, sign, u1, tau, z;
     int TID = threadIdx.x % 32;
-    //int set = threadIdx.x / 32;
     float w;
     /*Find the householder vector for each row. */
     for(i = 0; i < tilesize; i++)
@@ -253,7 +247,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
         else
             sign = 1;
 
-     //   sign = __shfl(sign, i, 32);
         norm = sqrt(norm);
         if(TID >= i )
             w = cornerTile[ i*tilesize + TID ];
@@ -262,13 +255,8 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
 
 
 
-//        if(TID==i)
             u1 = cornerTile[i*tilesize + i] - sign*norm;
 
-  //      __syncthreads();
-//        u1 = __shfl(u1, i, 32);     
-  //      if(i==1)
-//            printf("%.3f\n", u1);
         if(u1 != 0.0)
         {
             if(TID > i)
@@ -285,8 +273,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
         else
             tau = 0.0;
 
-        //if(TID == 0)
-          //  printf("%.3f\n", sign);
 
         /*Store the below diagonal vector */
         if(TID > i)
@@ -358,17 +344,9 @@ void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int
             /* Below Diagonal!*/
             /*Compute w'*A_j*/
 			z = w * rowTile[j*tilesize+TID];
-            //for(n = i; n < tileSize; n++)
-            //{
-            //    z = z + w[n] * rowTile[j*tileSize+n];
-            //}
             reduceSumMultiWarp(&z);
             if(TID >= i)
                 rowTile[j*tilesize + TID] = rowTile[j*tilesize + TID] - tauMatrix[(kk*tilesize+i)*tauNum + kk] * w * z;
-            //for(n = i; n < tileSize; n++)
-            //{
-              //  rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
-            //}
         }
 
         
@@ -393,16 +371,14 @@ void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int
  */
 __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, int tilesize, int ii, int kk, volatile float* tauMatrix, int tauNum )
 {
-    int i, j, n;
+    int i, j;
     float norm=0.0, sign, u1, tau, z;
     float w, wupper;
     int TID = threadIdx.x % 32;
-    //int set = threadIdx.x / 32;
 
     /* For each column compute the householder vector. */
     for(i = 0; i < tilesize; i++)
     {
-//        norm = 0.0;
         
         wupper = cornerTile[i*tilesize + i];
 
@@ -446,33 +422,6 @@ __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, in
         columnTile[i*tilesize+TID] = w;
         if(threadIdx.x == 0)
             tauMatrix[(kk*tilesize+i)*tauNum+ii] = tau;
-        /* Apply to each row to the right.*/
-//        for(j = i; j < tilesize; j++)
-  //      {
-            /* Find w'*A_j, w is 0s except for first value with upper tile.*/
-    //        z = 1.0 * cornerTile[ j*tilesize+i ];            
-      //      for(n = 0; n < tilesize; n++)
-        //    {
-          //      z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ];
-//            }
-            /* Apply to upper tile.*/
-  //         cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z;
-    //        for(n = i+1; n < tilesize; n++)
-      //      {
-        //        cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z;
-          //  }
-            /* Apply to lower tile.*/
-//            for(n = 0; n < tilesize; n++)
-  //          {
-    //            columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z;
-      //      }
-            
-       // }
-        /* Store w*/
-//        for(j = 0; j < tilesize; j++){
-  //          columnTile[ i*tilesize+j ] = w[tilesize+j];
-    //    }
-      //  tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau;
     }
 }
 
@@ -495,7 +444,7 @@ __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, in
  */
 __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,volatile float* rowTile, int tilesize, int ii, int jj, int kk,volatile float* tauMatrix, int tauNum )
 {
-    int i, j, n;
+    int i, j;
     float z;
     float w;
     int TID = threadIdx.x % 32;
@@ -520,30 +469,6 @@ __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,vo
             cornerTile[j*tilesize+TID] = cornerTile[j*tilesize+TID] - tau*w*z;
         }
         __syncthreads();
-        /*for(j = 0; j < i; j++)
-            w[j] = 0.0;
-        w[i] = 1.0;
-        for(j = i+1; j < tilesize; j++)
-            w[j] = 0.0;
-        for(j = 0; j < tilesize; j++)
-            w[j+tilesize] = columnTile[i*tilesize +j];*/
-        
-        /* Apply householder vector (w) to the tiles.*/
-/*        for(j = 0; j < tilesize; j++)
-        {
-            z = 0.0;*/
-            /* Compute w' * A_j */
-/*            for(n = 0; n < tilesize; n++)
-            {
-                z += w[n] * rowTile[j*tilesize+n];
-                z += w[n + tilesize] * cornerTile[j*tilesize+n];
-            }
-            for(n = 0; n < tilesize; n++)
-            {
-                rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z;
-                cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z;
-            }
-        }*/
     }
 
 }
@@ -551,37 +476,25 @@ __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,vo
 __device__ void runner ( int type , void *data ) {
     
 	__shared__ volatile float blockCache[(32*32) + 128];
-	volatile float *workVector;
-	workVector = blockCache + (32*32);
         /* Decode the task data. */
         int *idata = (int *)data;
         int i = idata[0];
         int j = idata[1];
         int k = idata[2];
-    //    int z;
-//        double buff[ 2*K*K ];
         /* Decode and execute the task. */
         switch ( type ) {
             case task_SGEQRF:
-                //if(threadIdx.x == 0)
-                  //  printf("SGEQRF %i %i\n", k, (k*cuda_m+k)*32*32);
                 if(threadIdx.x < 32)
                     SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m);
                 break;
             case task_SLARFT:
-        //        if(threadIdx.x == 0)
-        //            printf("SLARFT %i %i %i\n", k,j, (j*cuda_m+k)*32*32);
                 SLARFT( &GPU_matrix[(k*cuda_m +k)*32*32], &GPU_matrix[(j*cuda_m+k)*32*32], 32, j, k, GPU_tau, cuda_m);
                 break;
             case task_STSQRF:
-          //      if(threadIdx.x == 0)
-         //           printf("STSQRF %i %i %i\n", k,i, (k*cuda_m+i)*32*32);
                 if(threadIdx.x < 32)
                     STSQRF( &GPU_matrix[(k*cuda_m+k)*32*32], &GPU_matrix[(k*cuda_m + i)*32*32], 32, i, k, GPU_tau, cuda_m);
                 break;
             case task_SSSRFT:
-         //       if(threadIdx.x == 0)
-         //           printf("SSSRFT %i %i %i %i\n", k,j, i, (j*cuda_m+i)*32*32);
                 SSSRFT( &GPU_matrix[(j*cuda_m+i)*32*32],&GPU_matrix[(k*cuda_m + i)*32*32],&GPU_matrix[(j*cuda_m+k)*32*32], 32, i, j,        k, GPU_tau, cuda_m );
                 break;
             default:
@@ -630,8 +543,6 @@ float* columnToTile( float* columnMatrix, int size , int m , int n )
     cudaMallocHost(&TileMatrix, sizeof(float) * size );
     if(TileMatrix == NULL)
         error("failed to allocate TileMatrix");
-    int rows = m*32;
-    int columns = n*32;
     int i,j,k,l;
 
     for( i = 0; i < n ; i++ )
@@ -691,8 +602,6 @@ float* createIdentity(int m, int n)
     cudaMallocHost(&Matrix, sizeof(float) * m*n*32*32);
     if(Matrix == NULL)
         error("Failed to allocate Matrix");
-    int rows = m*32;
-    int columns = n*32;
     int i, j;
     memset ( Matrix, 0, sizeof(float)*m*n*32*32 );
 
@@ -729,24 +638,6 @@ void printMatrix(float* Matrix, int m, int n)
 }
 
 
-/*void printTileMatrix(float* Matrix, int m, int n)
-{
-    int i,j;
-    float *tempMatrix, *tempMatrix2;
-    for(i = 0; i < m*32; i++)
-    {
-        int tiled = i/32;
-        tempMatrix = &Matrix[tiled*32*32];
-        for(j = 0; j < n*32; j++)
-        {
-            int tile = j/32;
-            tempMatrix2 = &tempMatrix[tile*32*32*m + i%32];
-            printf(" %.1f ", tempMatrix2[j*32]);
-            //printf(" %.1f,%i ", tempMatrix2[j*32], &tempMatrix2[j*32]-Matrix);
-        }   
-        printf("\n");
-    }
-}*/
 
 
 qsched_funtype func;
@@ -759,7 +650,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
     qsched_task_t *tid, tid_new;
     qsched_res_t *rid;
     int data[3];
-    ticks tic, toc_run, tot_setup, tot_run = 0;
+//    ticks tic, tot_run = 0;
     #ifdef NO_LOADS
     ticks tic_loads;
     #endif
@@ -776,22 +667,15 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
     
     /* Allocate and fill the original matrix. */
     if(cudaMallocHost(&A, sizeof(float) * m * n * K * K  ) != cudaSuccess ||
-        cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess /*||
-        cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K  ) != cudaSuccess*/ )
+        cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess )
         error("Failed to allocate matrices.");
     g_size = g_size + sizeof(float) * m * n * K * K  + sizeof(float) * m * n * K;
-    //cudaFreeHost(A_orig);
-//    for ( k = 0 ; k < m * n * K * K ; k++ )
-  //      A_orig[k] = 2.0f*((float)rand()) / RAND_MAX - 1.0f;
     A_orig = generateColumnMatrix(m*n*K*K, 35532);
-  //  printMatrix(A_orig, m, n);
     float *temp = columnToTile(A_orig, m * n * K *K,  m , n);
     cudaFreeHost(A_orig);
     A_orig = temp;
     temp = tileToColumn(A_orig, m*n*K*K, m , n, K);
-//    printMatrix(temp, m, n);
     
-//    printTileMatrix(A_orig, m, n);
     memcpy( A , A_orig , sizeof(float) * m * n * K * K );
     bzero( tau , sizeof(float) * m * n * K );
 
@@ -814,7 +698,6 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
         error("Failed to copy matrix pointer to the device");
 
     /* Allocate and init the task ID and resource ID matrix. */
-    tic = getticks();
    /* Allocate and init the task ID and resource ID matrix. */
     if( cudaMallocHost(&tid , sizeof(qsched_task_t) * m*n ) != cudaSuccess )
         error("Failed to allocate tid");
@@ -860,7 +743,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
             data[0] = i; data[1] = k; data[2] = k;
             tid_new = qsched_addtask( &s , task_STSQRF , task_flag_none , data , sizeof(int)*3 , 3 );
             qsched_addlock(&s, tid_new, rid[k * m + i]);
-           qsched_adduse(&s, tid_new, rid[k * m + k]);
+           qsched_addlock(&s, tid_new, rid[k * m + k]);
             qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
             if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
             tid[k * m + i] = tid_new;
@@ -871,7 +754,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
                 tid_new = qsched_addtask( &s , task_SSSRFT , task_flag_none , data , sizeof(int)*3 , 5 );
                 qsched_addlock(&s, tid_new, rid[j * m + i]);
                 qsched_adduse(&s, tid_new, rid[k * m + i]);
-                qsched_adduse(&s, tid_new, rid[j * m + k]);
+                qsched_addlock(&s, tid_new, rid[j * m + k]);
                 qsched_addunlock(&s, tid[k * m + i], tid_new);
                 qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
                 if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
@@ -905,15 +788,12 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 
 
     
-   // printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n);
-/*   float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+   float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
    float *Q = computeQ(tempMatrix, m*K, K, tau, m);
    float *R = getR(tempMatrix, m*K);
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
    m*K, R, m*K, 0.0, tempMatrix, m*K);
        free(Q);
-   // printMatrix(tempMatrix, m, n);
-   // printf("\n\n\n\n");
        Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
    for(i = 0; i < m * n * K * K; i++)
    {
@@ -922,15 +802,16 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
            printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i],
    tempMatrix[i]);
    }
+   printf("Checked for correctness\n");
    free(tempMatrix);
    free(Q);
-   free(R);*/
+   free(R);
 
 //    cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyHostToDevice);
  //   A = tileToColumn(A,m * n * K * K, m, n, K);
    // printMatrix(A, m, n);
 //    printTileMatrix(A, m , n);
-    struct task* tasks =  qsched_get_timers( &s , s.count );
+   /* struct task* tasks =  qsched_get_timers( &s , s.count );
     for(i = 0; i < s.count; i++)
     {
       printf("%i %lli %lli %i ", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
@@ -947,7 +828,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
             printf("0 0");
         printf("\n");
     }
-    free(tasks);
+    free(tasks);*/
 cudaDeviceReset();
 }
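
The correctness check re-enabled above reconstructs the input matrix from the computed factors and reports element-wise mismatches. For reference, the check reduces to the sketch below, assuming computeQ, getR and tileToColumn behave as they are used above and assuming <math.h> is included; orig_col and the 1e-4 tolerance are illustrative assumptions, not values from this patch:

    /* Sketch: verify the tiled QR factorization by checking A_orig ~= Q * R. */
    float *fact = tileToColumn( A , m * n * K * K , m , n , K );     /* factorized tiles, column-major */
    float *Q = computeQ( fact , m * K , K , tau , m );
    float *R = getR( fact , m * K );
    /* fact <- Q * R (overwrites the factorized copy). */
    cblas_sgemm( CblasColMajor , CblasNoTrans , CblasNoTrans , m * K , m * K ,
                 m * K , 1.0f , Q , m * K , R , m * K , 0.0f , fact , m * K );
    float *orig_col = tileToColumn( A_orig , m * n * K * K , m , n , K );  /* unfactorized input */
    for ( int i = 0 ; i < m * n * K * K ; i++ )
        if ( fabsf( fact[i] - orig_col[i] ) > 1e-4f )                /* tolerance is an assumption */
            printf( "mismatch at %i: %.3e vs %.3e\n" , i , fact[i] , orig_col[i] );
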
 
@@ -956,7 +837,6 @@ void test()
 {
     int m = 2, n = 2;
     float* Matrix = createIdentity(m,n);
-    //printMatrix(Matrix, m, n);
     float* MatrixTile = columnToTile(Matrix, m*n*32*32, m, n);
     printMatrix(&MatrixTile[0], 1, 1);
     printf("\n \n \n");
@@ -966,7 +846,6 @@ void test()
     printf("\n \n \n");
     printMatrix(&MatrixTile[3*32*32], 1, 1);
     printf("\n \n \n");
-    //printTileMatrix(MatrixTile, m, n);
     free(Matrix);
     Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n, 32);
     printMatrix(Matrix, m, n);
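
test() exercises the tile-layout round trip; the property being checked is that columnToTile followed by tileToColumn is the identity. A compact self-check of that property, assuming the helper signatures as used above (createIdentity, columnToTile, tileToColumn with 32x32 tiles):

    /* Round-trip check: column-major -> tiled -> column-major must be a no-op. */
    float *M1 = createIdentity( m , n );
    float *T  = columnToTile( M1 , m * n * 32 * 32 , m , n );
    float *M2 = tileToColumn( T , m * n * 32 * 32 , m , n , 32 );
    for ( int i = 0 ; i < m * n * 32 * 32 ; i++ )
        if ( M1[i] != M2[i] )
            printf( "round-trip mismatch at %i\n" , i );
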
@@ -988,11 +867,9 @@ __global__ void SGEQRF_test(float* cornerTile)
     
     for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
     {
-       // printf("Copying value %i\n", i);
         tile[i] = cornerTile[i];
     }
 
-   // printf("blockDim.x = %i\n", blockDim.x);
     __syncthreads();
     if(threadIdx.x < 32)
         SGEQRF(tile, 32, tau, 0, 1);
@@ -1029,7 +906,6 @@ __global__ void SLARFT_test(float* cornerTile, float* rowTile)
         SGEQRF(tile, 32, tau, 0, 2);
 
     __syncthreads();
-//    if(threadIdx.x < 32)
         SLARFT(tile, tile2, 32, 1, 0, tau, 2);
     __syncthreads();
     for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
@@ -1165,12 +1041,6 @@ int main ( int argc , char *argv[] ) {
     int c, nr_threads=128;
     int M = 4, N = 4, runs = 1, K = 32;
     
-    /* Get the number of threads. */
-    //#pragma omp parallel shared(nr_threads)
-//    {
-//        if ( omp_get_thread_num() == 0 )
-//            nr_threads = omp_get_num_threads();
-//    }
     
     /* Parse the options */
     while ( ( c = getopt( argc , argv  , "m:n:k:r:t:D" ) ) != -1 )
@@ -1194,7 +1064,6 @@ int main ( int argc , char *argv[] ) {
 	        case 't':
 	            if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
 	                error( "Error parsing number of threads." );
-	            //omp_set_num_threads( nr_threads );
                 break;
             case 'D':
                 runTests();
@@ -1206,9 +1075,6 @@ int main ( int argc , char *argv[] ) {
 	            exit( EXIT_FAILURE );
 	        }
             
-    /* Dump arguments. */
-  //  message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
-    //    32*M , 32*N , nr_threads , runs );
         
     test_qr( M , N , K , nr_threads , runs );
     
diff --git a/src/CUDACompile.sh b/src/CUDACompile.sh
index 0a3a0bf..085086c 100755
--- a/src/CUDACompile.sh
+++ b/src/CUDACompile.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 FLAGS2="-Xcompiler=-fsanitize=address -Xcompiler=-fno-omit-frame-pointer"
 DEBUG_FLAGS="-G -DDEBUG_GPU"
-FLAGS="-O3 -g -G -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
+FLAGS="-O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
 # -DGPU_locks -Xptxas -dlcm=cg -Xptxas="-v"
 # -DNO_LOADS
 
@@ -36,3 +36,7 @@ cd ../examples
 /home/aidan/cuda_6.0/bin/nvcc $FLAGS  -dc -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda  test_bh_2.cu -lprofiler
 
 /home/aidan/cuda_6.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_2.o ../src/.libs/libquicksched_cuda.a -o test_bh_2 -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -dc -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda  test_bh_3.cu -lprofiler
+
+/home/aidan/cuda_6.0/bin/nvcc $FLAGS  -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_3.o ../src/.libs/libquicksched_cuda.a -o test_bh_3 -lprofiler
diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu
index 853ed2b..d42e785 100644
--- a/src/cuda_queue.cu
+++ b/src/cuda_queue.cu
@@ -333,7 +333,7 @@ __device__ int cuda_locktask ( int tid ) {
 __device__ void cuda_done ( struct task *t ) {
 
     int k;
-    struct task *t2;
     
     /* Release this task's locks. */
     for ( k = 0 ; k < t->nr_locks ; k++ )
@@ -437,8 +437,6 @@ __device__ int get_best_task(struct queue_cuda *q )
     int ind1, ind2, tid1 = -1, tid2 = -1;
     
-    /* Don't even try... */
-   // if ( q->rec_count == q->count )
-     //   return -1;
 
     /* Get the index of the next task. */
     ind1 = atomicAdd( &q->first , 1 );
@@ -483,7 +481,6 @@ __device__ int get_best_task(struct queue_cuda *q )
         q->data[ind2] = -1;
         atomicAdd((int*) &tot_num_tasks, -1);
         cuda_queue_puttask( q, tid2);
-//        atomicAdd((int*)&q->nr_avail_tasks, 1);
 
         return tid1;
     }else
@@ -492,7 +489,6 @@ __device__ int get_best_task(struct queue_cuda *q )
         q->data[ind2] = -1;
         atomicAdd((int*) &tot_num_tasks, -1);
         cuda_queue_puttask( q, tid1);
-  //      atomicAdd((int*)&q->nr_avail_tasks, 1);
         return tid2;
     }
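
For context, get_best_task pops two candidate indices (ind1/ind2 above) and keeps whichever task has the better priority, recycling the other back into the queue. The decision step reduces to the sketch below; using tasks_cuda[].weight as the priority key is an assumption based on the PRIQ weight setup later in this patch:

    /* Sketch of the "grab two, keep the best" pattern: return the
     * higher-priority task id and push the other back for reuse. */
    __device__ int pick_better( struct queue_cuda *q , int tid1 , int tid2 ) {
        if ( tid2 < 0 ) return tid1;                       /* only one candidate */
        if ( tasks_cuda[tid1].weight >= tasks_cuda[tid2].weight ) {
            cuda_queue_puttask( q , tid2 );                /* recycle the loser */
            return tid1;
        }
        cuda_queue_puttask( q , tid1 );
        return tid2;
    }
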
     
@@ -507,7 +503,6 @@ __device__ int runner_cuda_gettask_priority ( struct queue_cuda *q ) {
         atomicAdd((int*)&q->nr_avail_tasks, 1);
         return -1;
     }
-//    atomicAdd((int*)&q->nr_avail_tasks, -1);
     tid = get_best_task(q);
         
     /* Put this task into the recycling queue, if needed. */
@@ -534,13 +529,6 @@ __global__ void qsched_device_kernel ( )
     /* Pull a task from the queues*/
     while ( 1 ) {
         __syncthreads();
-        /*if(threadIdx.x == 0 && blockIdx.x == 0)
-        {
-            printf("%i %i %i \n", cuda_queues[0].nr_avail_tasks, cuda_queues[0].data[0], cuda_queues[0].data[1]);
-            break;
-        }
-        else
-            break;*/
         if ( threadIdx.x == 0 ) {
              tid = -1;
                 /* Highest priority queue, holds the unload tasks. */
@@ -553,6 +541,16 @@ __global__ void qsched_device_kernel ( )
                 }
 #endif
 
+#ifndef NO_LOADS
+             /* Low priority queue, contains the load tasks; only the
+              * first 12 blocks poll it. */
+             if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 && blockIdx.x < 12 )
+             {
+                TIMER_TIC
+                tid = runner_cuda_gettask ( &cuda_queues[1]);
+                TIMER_TOC(timers_load_q)
+             }
+#endif
+
              /* Middle priority queue, contains user-specified tasks. */
 #ifdef PRIQ
             if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 )
@@ -568,15 +566,6 @@ __global__ void qsched_device_kernel ( )
                 tid = runner_cuda_gettask ( &cuda_queues[0]);
                 TIMER_TOC(timers_queue)
             }
-#endif
-#ifndef NO_LOADS
-             /* Low priority queue, contains the load tasks. */
-             if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 /*&& blockIdx.x < 12*/ )
-             {
-                TIMER_TIC
-                tid = runner_cuda_gettask ( &cuda_queues[1]);
-                TIMER_TOC(timers_load_q)
-             }
 #endif
             }
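
After this reordering, the scheduler thread of each block polls the queues as sketched below: unloads first, then loads (first twelve blocks only), then user tasks. Treating cuda_queues[2] as the unload queue is an assumption consistent with the three rec_data allocations later in this patch; the cap of 12 is the tuned, hardware-specific constant from the hunk above:

    /* Sketch of the per-block polling order after this change. */
    if ( threadIdx.x == 0 ) {
        tid = -1;
        if ( cuda_queues[2].nr_avail_tasks > 0 )                    /* unloads */
            tid = runner_cuda_gettask( &cuda_queues[2] );
        if ( tid < 0 && blockIdx.x < 12 &&
             cuda_queues[1].nr_avail_tasks > 0 )                    /* loads */
            tid = runner_cuda_gettask( &cuda_queues[1] );
        if ( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 )         /* user tasks */
            tid = runner_cuda_gettask( &cuda_queues[0] );
    }
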
             
@@ -667,21 +656,9 @@ __global__ void qsched_device_kernel ( )
             
 
             /* Reset values.*/
-            /*printf("cuda_queues[0].first = %i, nr_avail_tasks = %i, tot_num_tasks = %i, data[first] = %i\n", cuda_queues[0].first, cuda_queues[0].nr_avail_tasks ,tot_num_tasks, cuda_queues[0].data[cuda_queues[0].first] );
-            printf("cuda_queues[0].last = %i, data[last-1] = %i\n", cuda_queues[0].last ,cuda_queues[0].data[cuda_queues[0].last] );
-            for(tid = cuda_queues[0].first; tid < cuda_queues[0].first + cuda_queues[0].nr_avail_tasks; tid++)
-            {   
-                printf("task =%i, type = %i\n", cuda_queues[0].data[tid%cuda_queue_size], tasks_cuda[cuda_queues[0].data[tid%cuda_queue_size]].type ); 
-            } */
             cuda_queues[0].first = 0;
             cuda_queues[0].last = 0;
             cuda_queues[0].rec_count = 0;
-            /*cuda_queues[1].first = 0;
-            cuda_queues[1].last = cuda_queues[1].count;
-            cuda_queues[1].rec_count = 0;
-            cuda_queues[2].first = 0;
-            cuda_queues[2].last = 0;
-            cuda_queues[2].rec_count = 0;*/
             }
 
         //TODO
@@ -736,12 +713,6 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in
         utask = qsched_addtask( s , type_ghost, task_flag_none, NULL, 0 , 0 );
         qsched_adduse(s, task, ID);
         s->res[ID].utask = utask;
-      //  if(parent >= 0)
-//        {
-            /* Create dependecy to parent. */ 
-          //  qsched_addunlock(s, task, s->res[parent].task );
-          ///  qsched_addunlock(s, s->res[parent].utask, utask );
-  //      }
         for(i = sorted[ID]; i < sorted[ID+1]; i++)
         {
             qsched_create_loads(s, res[i], s->res[res[i]].size, sorted[res[i]+1]-sorted[res[i]], ID, res, sorted);
@@ -756,12 +727,6 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in
         {
             s->res[res[j]].task = task;
         }
-        /* If it has a parent then set the parents ghost task to be dependent on this.*/
-   //     if(parent >= 0)
-     //   {
-          //  qsched_addunlock(s, task, s->res[parent].task );
-          //  qsched_addunlock(s, s->res[parent].utask, utask);
-       // }
     }
 }
 
@@ -781,7 +746,6 @@ int transitive_use_unlocks(struct qsched *s, struct task *t, int res, int depth)
     }
     if(depth >= MAX_DEPTH)
     {
-//        printf("Max Depth reached\n");
         return 0;
     }
     for(i = 0; i < t->nr_unlocks; i++)
@@ -809,7 +773,6 @@ int transitive_use_locks(struct qsched *s, int tid, int res, int depth)
     }
     if(depth >= MAX_DEPTH)
     {
-//        printf("Max Depth reached\n");
         return 0;
     }
     for(i = tid-1; i >= 0; i--)
@@ -830,20 +793,18 @@ int transitive_use_locks(struct qsched *s, int tid, int res, int depth)
 
 void qsched_prepare_loads ( struct qsched *s ) {
 
-int i, task, unload, j , k , unlocked = 0;
+int i, j , k;
 struct task *t;
 int *sorted, lastindex;
 int *res, *res_data;
 
-//printf("tasks[63].wait_init = %i\n", s->tasks[63].wait_init);
 
 if(s->res[0].task != -1)
 {
     printf("Tasks already initialised, not redoing load/unload tasks");
     return;
 }
-double itpms = 1000.0 / CPU_TPS;
-ticks tic, toc_run, toc2 ;
+ticks tic, toc2 = 0;
     tic = getticks();
 
 /* Expand the deps array so we can add new dependencies in place. */
@@ -947,15 +908,12 @@ for(i = 0; i < s->count_res; i++)
     res_data[i] = ((char *)s->res[res[i]].data) - (char*)s->res[0].data;
     if(res_data[i] < mini)
         mini = res_data[i]; 
-//    printf("%i ", res_data[i]);
 }
 
 for(i = 0; i < s->count_res; i++)
 {
     res_data[i] -= mini;
-//printf("%i ", res_data[i]);
 }
-//printf("\n");
 
 /* Sort the children of each parent by memory address. */
 qsched_sort(res, res_data, sorted[0], minVal(res_data, sorted[0]), maxVal(res_data, sorted[0]));
@@ -970,7 +928,6 @@ for(i = 1; i < s->count_res; i++)
 qsched_sort(&res[sorted[s->count_res-1]], &res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1], minVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1]), maxVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1]));
 
 /* res now contains an array of resource indices, sorted first by parent and then by the memory address of their data. */
-int size=0;
 if(sorted[0] != 0)
 {
     /* Check no overlapping resources.*/
@@ -1008,9 +965,10 @@ for(i = sorted[s->count_res]; i >= 0; i-- )
     sorted[i] = sorted[i-1];
 }
 
-toc_run = getticks();
 
 tic = getticks();
+
 /* If nothing overlaps create tasks.*/
 for( i = sorted[s->count_res]; i < s->count_res; i++ )
 {
@@ -1024,9 +982,7 @@ for( i = sorted[s->count_res]; i < s->count_res; i++ )
 
 }
 
-toc_run = getticks();
 tic = getticks();
-//    message( "Creating load tasks took %.3f ms" , ((double)(toc_run - tic)) * itpms );
 
 
 /* Check all resources have load tasks - if not give parents (recursively)*/
@@ -1078,10 +1034,6 @@ for(i = 0; i < s->count; i++)
 
     for(j = 0; j < t->nr_uses; j++)
     {
-        t->unlocks[t->nr_unlocks] = s->res[t->uses[j]].utask;
-        s->tasks[s->res[t->uses[j]].utask].wait_init +=1 ;
-        deps_new_key[(t->unlocks - deps_new) + t->nr_unlocks] = i;
-        t->nr_unlocks++;
         if(num_uses[t->uses[j]] == size_uses[t->uses[j]])
         {
             /* Stretch. */
@@ -1131,7 +1083,7 @@ int *tasks_assigned = (int*)calloc(sizeof(int), s->count);
 for(i = 0; i < s->count_res; i++ )
 {
     int ID = res[i];
-    int size = s->res[ID].size;
     int numChildren = sorted[ID+1] - sorted[ID];
     int parent = s->res[ID].parent;
     struct res *resource = &s->res[ res[i] ];
@@ -1226,8 +1178,6 @@ for(i = 0; i < s->count_res; i++ )
                 {
                     s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ sorted[ ID ] +j ] ][k];
                     s->tasks[ usage_list[ res[ sorted[ ID ] +j ] ][k] ].wait_init += 1;
-//                    if(usage_list[res[sorted[ID]+j]][k] == 63)
-//                        printf("Increased wait_init of task 63, wait_init now %i\n", s->tasks[63].wait_init);
                     deps_new_key[s->count_deps] = resource->task;
                     s->tasks[resource->task].nr_unlocks += 1;
                     s->count_deps += 1;
@@ -1263,11 +1213,7 @@ for(i = 0; i < s->count_res; i++ )
         s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = s->res[parent].task;
         deps_new_key[s->count_deps] = resource->task;
         s->tasks[s->res[parent].task].wait_init += 1;
-        //if(s->res[parent].task == 63)
-//            printf("task %i should unlock task 63, %i position.\n", resource->task, s->tasks[resource->task].nr_unlocks);
         s->tasks[resource->task].nr_unlocks += 1;
-//        if(s->res[parent].task == 63)
-//            printf("s->tasks[resource->task].nr_unlocks = %i\n", s->tasks[resource->task].nr_unlocks);
         s->count_deps += 1;
     }
 
@@ -1276,62 +1222,19 @@ for(i = 0; i < s->count_res; i++ )
         s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ i ] ][k];
         deps_new_key[s->count_deps] = resource->task;
         s->tasks[usage_list[ res[ i ] ][k]].wait_init += 1;
-//        if(usage_list[res[i]][k] == 63)
-//            printf("task %i should unlock task 63\n", resource->task);
         s->tasks[resource->task].nr_unlocks += 1;
-//        s->count_deps += 1;
     }
     if(!tasks_assigned[resource->task]) {
         s->count_deps += num_deps[resource->task];
         tasks_assigned[resource->task] = 1;
     }
-  //r(k = 0; k < s->tasks[66].nr_unlocks; k++)
-//        printf("%i ", s->tasks[66].unlocks[k]);
-//    printf("\n");
-//    printf("s->tasks[66].unlocks - deps_new = %i\n", s->tasks[66].unlocks - deps_new);
 }
-//printf("s->deps = %i, deps_new = %i\n", s->deps, deps_new);
 free(s->deps);
 free(s->deps_key);
 s->deps = deps_new;
 s->deps_key = deps_new_key;
 s->flags &= ~qsched_flag_dirty;
 toc2 += getticks() - tic;
- //   printf("s->tasks[50].unlocks - s->deps = %i\n", s->tasks[50].unlocks - s->deps);
-
-/*printf("Dependencies leading from task 50's\n");
-for(i = 0; i < s->tasks[50].nr_unlocks + 5; i++)
-    printf("%i ", s->tasks[50].unlocks[i]);
-printf("\n");
-
-printf("s->tasks[50].nr_unlocks = %i, num_deps[50] = %i\n", s->tasks[50].nr_unlocks, num_deps[50]);
-
-free(num_deps);
-printf("Printing dependencies here:\n");
-for(i = 0; i < s->count; i++)
-{*/
-   // printf("Task ID: %i, type = %i: ", i, s->tasks[i].type);
-    /*if(s->tasks[i].type < 100)
-    {
-        for(k = 0; k < s->count_res; k++)
-        {
-            if(s->res[k].task == i || s->res[k].utask == i )
-            {
-                printf(" resource ID = %i", k);
-                if(s->res[k].parent != -1)
-                    printf(" parent resource ID = %i", s->res[k].parent);
-                printf(": ");
-            }
-        }
-    }*/
-   /* for(k = 0; k < s->tasks[i].nr_unlocks; k++)
-        printf("%i ", s->tasks[i].unlocks[k]);
-    printf("\n");
-}
-    printf("----------------\n");*/
-toc_run = getticks();
-    //message( "Setting up dependencies took %.3f ms" , toc2 * itpms );
-//error("Got to here");
 }
 
 
@@ -1349,7 +1252,6 @@ for(i = 0; i < s->count_res; i++)
         continue;
     if(s->res[i].task >= 0)
         continue;
-//    cudaMalloc( &s->res[ i ].gpu_data, s->res[i].size );
     task = qsched_addtask( s , type_load , task_flag_none , &i , sizeof(int) , 0 );
     s->res[i].task = task;
     unload = qsched_addtask( s , type_unload, task_flag_none , &i, sizeof(int), 0 );
@@ -1383,8 +1285,6 @@ for(i = 0; i < s->count_res; i++)
 
         }
     }
-     //   qsched_adduse(s, task , i);
-       // qsched_adduse(s, unload , i);
 }
 
 
@@ -1410,7 +1310,6 @@ int *data;
 char *sdata;
 
-/* Lock the sched. */
-    //lock_lock( &s->lock );
 
     /* Get a pointer to the tasks, set the count. */
     tasks = s->tasks;
@@ -1448,15 +1347,10 @@ char *sdata;
             }
         
         /* All cleaned-up now! */
-        //s->flags &= ~qsched_flag_dirty;
     
 
         }
 
-    double itpms = 1000.0 / CPU_TPS;
-ticks tic, toc_run ;
-    tic = getticks();
-//qsched_ensure(s, s->count + 2*s->count_res, s->count_res, s->count_deps + 2*s->count_res, s->count_locks, s->count_uses + 2*s->count_res, s->size_data );
 
  for ( k = 0 ; k < count ; k++ ) {
         t = &tasks[k];
@@ -1470,8 +1364,6 @@ ticks tic, toc_run ;
 #ifndef NO_LOADS
 qsched_prepare_loads(s);
 #endif
-toc_run = getticks();
-//    message( "prepare_loads took %.3f ms" , ((double)(toc_run - tic)) * itpms );
     
 
     
@@ -1480,41 +1372,18 @@ toc_run = getticks();
     tasks = s->tasks;
     count = s->count;
     
-    /* If the sched is dirty... */
-    if ( 1 /*s->flags & qsched_flag_dirty*/ ) {
-    
-        /* Do the sorts in parallel, if possible. */
-       // #pragma omp parallel
-        //{
-    
-            /* Sort the unlocks. */
-            //#pragma omp single nowait
-           // qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 );
+        qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 );
 
-            /* Sort the locks. */
-        //    #pragma omp single nowait
-            qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 );
-
-            /* Sort the uses. */
-           // #pragma omp single nowait
-           // qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 );
-            
-      //  }
         
         /* Run through the tasks and link the locks. */
-//        tasks[0].unlocks = s->deps;
         tasks[0].locks = s->locks;
- //       tasks[0].uses = s->uses;
         for ( k = 1 ; k < count ; k++ ) {
-      //      tasks[k].unlocks = &tasks[k-1].unlocks[ tasks[k-1].nr_unlocks ];
             tasks[k].locks = &tasks[k-1].locks[ tasks[k-1].nr_locks ];
-        //    tasks[k].uses = &tasks[k-1].uses[ tasks[k-1].nr_uses ];
             }
         
         /* All cleaned-up now! */
         s->flags &= ~qsched_flag_dirty;
     
-        }
     /* Init the queues. */
     for ( k = 0 ; k < s->nr_queues ; k++ )
         queue_init( &s->queues[k] , count );
@@ -1525,24 +1394,11 @@ toc_run = getticks();
         tasks[k].wait = 0;
     }    
 
-    /* Run through the tasks and set the waits... */
-/*    for ( k = 0 ; k < count ; k++ ) {
-        t = &tasks[k];
-        if ( !( t->flags & task_flag_skip ) )
-            for ( j = 0 ; j < t->nr_unlocks ; j++ )
-            {
-                tasks[ t->unlocks[j] ].wait += 1;
-            }
-        }*/
 
     for( k = 0; k < count; k++ )
     {
         t = &tasks[k];
         t->wait = t->wait_init;
-//        if(t->wait != t->wait_init)
-  //      {
-    //        printf("Task ID %i has wait %i and wait_init %i and type %i\n", k, t->wait, t->wait_init, t->type);
-      //  }
     }
 
     /* Sort the tasks topologically. */
@@ -1566,23 +1422,11 @@ toc_run = getticks();
         }
 
 
-        /* Print all dependencies */
-       // for(i = 0; i < count; i++ )
-//        {
-//            printf("Task ID: %i, type=%i, wait= %i,  ", tid[i], tasks[tid[i]].type, tasks[tid[i]].wait);
-  //          printf("Task ID: %i, type=%i, wait= %i,  ", i, tasks[i].type, tasks[i].wait);
-//            for(j = 0; j < tasks[tid[i]].nr_unlocks; j++)
-//            {
-//                printf("%i ", tasks[tid[i]].unlocks[j]);
-//            }
-//            printf("\n");
-//        }
     if ( k < count )
     {
         for(i = 0; i < count; i++)
             if(tasks[tid[i]].wait != 0)
                 printf("Task %i has nonzero wait, wait is %i, wait_init is %i\n", tid[i], tasks[tid[i]].wait, tasks[tid[i]].wait_init);
-        //printf("tasks[63].wait = %i, tasks[63].wait_init = %i\n", tasks[63].wait, tasks[63].wait_init);
         printf("k = %i, count = %i\n", k, count);
         printf("tid[k-1] = %i, tid[k-1].wait = %i, initial wait = %i\n", tid[k], tasks[tid[k]].wait, tasks[tid[k]].wait_init);
 
@@ -1613,8 +1457,6 @@ toc_run = getticks();
         else
             t->weight = t->cost + maxweight - 1000;
 
-//        if(t->weight < -100)
-  //          printf("%i\n", t->weight);
         }
 #endif
 #ifndef PRIQ
@@ -1622,10 +1464,6 @@ toc_run = getticks();
     {
         t = &tasks[k];
         t->wait = t->wait_init;
-//        if(t->wait != t->wait_init)
-  //      {
-    //        printf("Task ID %i has wait %i and wait_init %i and type %i\n", k, t->wait, t->wait_init, t->type);
-      //  }
     }
 #endif
         
@@ -1712,82 +1550,7 @@ int *data2;
 struct queue_cuda queues[ cuda_numqueues ];
 
 
-/*#ifdef PRIQ
-qsize = max(2*s->count, 512);
-    if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
-            error("Failed to copy queue size to the device.");
-
-        if ( ( data = (int *)malloc( sizeof(int) * qsize ) ) == NULL )
-            error("Failed to allocate data buffer.");
-        if( ( data2 = (int *) malloc( sizeof(int) * qsize ) ) == NULL )
-            error("Failed to allocate data2 buffer.");
-    queues[0].count = 0;
-
-    for(i = 0; i < s->count; i++)
-    {
-        if(s->tasks[i].wait == 0)
-        {
-            if(s->tasks[i].type != type_load)
-            {          
-                printf("i = %i\n", i);    
-               for(k = 0; k < s->count; k++)
-                {
-                    for(j = 0; j < s->tasks[k].nr_unlocks; j++)
-                    {
-                        if(s->tasks[k].unlocks[j] == i)
-                        printf("Should be unlocked by %i\n", k);
-                    }
-                    for(j = 0; j < s->tasks[k].nr_uses; j++)
-                    {
-                        if(s->tasks[k].uses[j] == 256)
-                        printf("Task %i uses resource 256\n", k);
-                    }
-                    for(j = 0; j < s->tasks[k].nr_locks; j++)
-                    {
-                        if(s->tasks[k].locks[j] == 256)
-                        printf("Task %i locks resource 256\n", k);
-                    }
-                }
-               for(k = 0; k < s->count_res; k++)
-                {
-                    if(s->res[k].utask == i)
-                        printf("resource = %i\n", k);
-                }
-               printf("%i\n", i);
-            }       
-            data[queues[0].count++] = i;
-            data2[queues[0].count-1] = -temp[i].weight;
-        }
-    }
-    qsched_sort(data, data2, queues[0].count, minVal(data2,queues[0].count), maxVal(data2, queues[0].count));
-    free(data2);
-     for ( i = queues[0].count ; i < qsize ; i++ )
-        data[i] = -1; 
-
-    if ( cudaMalloc( &queues[0].data , sizeof(int) * qsize ) != cudaSuccess )
-        error("Failed to allocate queue data on the device.");
-    if ( cudaMemcpy( (void *)queues[0].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
-        error("Failed to copy queue data pointer to the device");
-
-    for ( k = 0; k < qsize; k++ )
-            data[k] = -1;
-            
-    if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * qsize ) != cudaSuccess )
-        error("Failed to allocate queue data on the device.");
-    if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
-        error("Failed to copy queue data pointer to the device");
-
-        
-    queues[0].first = 0;
-    queues[0].last = queues[0].count;
-    queues[0].nr_avail_tasks = queues[0].last;
-    queues[0].rec_count = 0;  
-    queues[0].count = s->count;
-
-        if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
-            error("Failed to copy the queues to the device");   
-#else*/
- qsize = max(2*s->count / nr_queues, 256);
+ qsize = max(s->count, 256);
     if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
             error("Failed to copy queue size to the device.");
             
@@ -1826,17 +1589,17 @@ qsize = max(2*s->count, 512);
             data[k] = -1;
             
     /* Allocate and copy the recycling data. */
-    if ( cudaMalloc( &queues[1].rec_data , sizeof(int) * qsize ) != cudaSuccess )
+    if ( cudaMalloc( &queues[1].rec_data , sizeof(int) * max( qsize , (int)(s->count*1.2) ) ) != cudaSuccess )
         error("Failed to allocate queue data on the device.");
     if ( cudaMemcpy( (void *)queues[1].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
         error("Failed to copy queue data pointer to the device");
     /* Allocate and copy the recycling data. */
-    if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * qsize ) != cudaSuccess )
+    if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * max( qsize , (int)(s->count*1.2) ) ) != cudaSuccess )
         error("Failed to allocate queue data on the device.");
     if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
         error("Failed to copy queue data pointer to the device");
 /* Allocate and copy the recycling data. */
-    if ( cudaMalloc( &queues[2].rec_data , sizeof(int) * qsize ) != cudaSuccess )
+    if ( cudaMalloc( &queues[2].rec_data , sizeof(int) * max( qsize , (int)(s->count*1.2) ) ) != cudaSuccess )
         error("Failed to allocate queue data on the device.");
     if ( cudaMemcpy( (void *)queues[2].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
         error("Failed to copy queue data pointer to the device");
diff --git a/src/qsched.h b/src/qsched.h
index dfa9cde..97f58fe 100644
--- a/src/qsched.h
+++ b/src/qsched.h
@@ -163,8 +163,9 @@ struct qsched {
 
 /* Function prototypes. */
 /* Internal functions. */
+void qsched_quicksort ( int *data , int *ind , int N , int min , int max );
 void qsched_sort ( int *data , int *ind , int N , int min , int max );
-void qsched_sort_rec ( int *data , int *ind , int N , int min , int max );
 struct task *qsched_gettask ( struct qsched *s , int qid );
 void qsched_done ( struct qsched *s , struct task *t );
 void *qsched_getdata( struct qsched *s , struct task *t );
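
The prototype block now exposes qsched_quicksort alongside qsched_sort. As used throughout cuda_queue.cu, qsched_sort orders data[] by the keys in ind[], with min and max bounding the key range; a minimal usage sketch under that assumed (ascending) semantics:

    /* Sketch: sort a payload array by its key array -- assumed semantics. */
    int data[4] = { 10 , 11 , 12 , 13 };   /* payload, e.g. task ids */
    int ind[4]  = { 3 , 1 , 2 , 0 };       /* keys, e.g. priorities  */
    qsched_sort( data , ind , 4 , 0 , 3 ); /* min/max bound the key range */
    /* afterwards: ind = { 0, 1, 2, 3 }, data = { 13, 11, 12, 10 } */
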
-- 
GitLab