Commit f0ab955c authored by aidan

Code cleanup

parent 320d4466
@@ -40,6 +40,7 @@ extern "C"{
}
#include "cuda_queue.h"
/** Task types. */
enum task_type {
task_type_self = 0,
@@ -59,12 +60,12 @@ unsigned short int split, sorted;
int parts, firstchild, sibling;
int res, resz, resm, com_tid;
};//__attribute__((aligned(64)));
};
#define const_G 1
/* Required variables to obtain cells. */
#define cell_maxparts 128
#define cell_maxparts 256
#define CELL_STRETCH 2
#define INITIAL_CELLS 256
struct cell *cell_pool = NULL;
@@ -101,20 +102,7 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
int count_i = ci->count, count_j = cj->count;
int parts_i = ci->parts, parts_j = cj->parts;
double xi[3];
float dx[3], ai[3], mi, mj, r2, w, ir;
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
__shared__ float4 parts_am[cell_maxparts];
/*if(threadIdx.x == 0)
printf("%f, %f, %f, %f, %i, %f, %f, %f, %f, %i\n", ci->h, ci->loc_xy.x, ci->loc_xy.y, ci->loc_z, ci->split,
cj->h, cj->loc_xy.x, cj->loc_xy.y, cj->loc_z, cj->split);*/
/* Load particles of cell j into shared memory */
/*for(k = parts_j + threadIdx.x, j = threadIdx.x; k < parts_j + count_j; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}*/
float dx[3], ai[3], mj, r2, w, ir;
/* Loop over cell i.*/
for(i = parts_i + threadIdx.x; i < parts_i + count_i; i+= blockDim.x) {
@@ -124,7 +112,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
mi = parts_a_m[i].w;
for(j = parts_j; j < parts_j + count_j; j++) {
r2 = 0.0f;
@@ -136,16 +123,12 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
r2 += dx[2] * dx[2];
// ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_a_m[j].w;
for(k = 0; k < 3; k++) {
ai[k] -= dx[k] * mj * w;
}
// atomicAdd(&parts_a_m[j].x, w*dx[0]*mi);
// atomicAdd(&parts_a_m[j].y, w*dx[1]*mi);
// atomicAdd(&parts_a_m[j].z, w*dx[2]*mi);
}
atomicAdd(&parts_a_m[i].x, ai[0]);
atomicAdd(&parts_a_m[i].y, ai[1]);
@@ -154,11 +137,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
}
/* Load particles of cell i into shared memory */
/*for(k = parts_i + threadIdx.x, j = threadIdx.x; k < parts_i + count_i; k+= blockDim.x, j += blockDim.x ) {
parts_xy[j] = parts_pos_xy[k];
parts_z[j] = parts_pos_z[k];
parts_am[j] = parts_a_m[k];
}*/
/*Loop over cell j. */
for(i = parts_j + threadIdx.x; i < parts_j + count_j; i+= blockDim.x) {
xi[0] = parts_pos_xy[i].x;
@@ -167,7 +145,6 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
for(k = 0; k < 3; k++) {
ai[k] = 0.0f;
}
mi = parts_a_m[i].w;
for(j = parts_i; j < parts_i + count_i; j++) {
r2 = 0.0f;
@@ -196,29 +173,15 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
__device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell *cj) {
int i, k;
int i;
double2 j_com_xy;
double j_com_z;
float j_com_mass;
int count = leaf->count;
int parts = leaf->parts;
int cell_j = cj - cells;
int temp;
float r2, dx[3], ir, w;
// if(cell_j < 0)
// {
// if(threadIdx.x == 0)
// printf("cell_j = %i, leaf = %i, threadIdx.x == %i\n", cell_j, leaf-cells, threadIdx.x);
// __syncthreads();
// asm("trap;");
// }
// if(threadIdx.x == 0)
// printf("%f, %f, %f\n", cj->loc_xy.x, cj->loc_xy.y, cj->loc_z);
temp = cell_j;
/* Init the com's data.*/
j_com_xy = com_xy[cell_j];
@@ -236,18 +199,10 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
ir = rsqrtf(r2);
w = j_com_mass * const_G * ir * ir * ir;
/* __threadfence();
if(!isfinite(w * dx[0])){
printf("Error in make_interact_pc, j_com_mass = %f, cell_j = %i, temp = %i, i = %i, threadIdx.x=%i\n", j_com_mass, cell_j, temp, i, threadIdx.x); asm("trap;");}
if(!isfinite(w * dx[1])){
printf("Error in make_interact_pc\n"); asm("trap;");}
if(!isfinite(w * dx[2])){
printf("Error in make_interact_pc\n"); asm("trap;");}*/
atomicAdd( &parts_a_m[i].x , w * dx[0]);
atomicAdd( &parts_a_m[i].y , w * dx[1]);
atomicAdd( &parts_a_m[i].z , w * dx[2]);
}
//__syncthreads();
}
/**
@@ -255,7 +210,6 @@ __device__ __forceinline__ void make_interact_pc(struct cell *leaf, struct cell
*/
__device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, struct cell *cj) {
int k;
float dx[3];
double cih = ci->h, cjh = cj->h;
@@ -274,7 +228,7 @@ __device__ __forceinline__ int are_neighbours_different_size(struct cell *ci, st
}
__device__ __forceinline__ int are_neighbours(struct cell *ci, struct cell *cj) {
int k;
float dx[3];
float min_dist = ci->h;
float center_i = ci->loc_xy.x;
@@ -296,13 +250,6 @@ __device__ __forceinline__ int is_inside(struct cell *leaf, struct cell *c) {
__device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf) {
struct cell *cp ,*cps;
int leafnum = leaf - cells;
//if(threadIdx.x == 0 && leafnum == 23)
// printf("cj = %i\n", cj - cells);
// printf("%i\n", leafnum);
// if(threadIdx.x == 0)
/// printf("ci = %i, cj = %i, leaf = %i\n", ci - cells, cj - cells, leaf - cells);
for(cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
if(is_inside(leaf, cp)) break;
@@ -318,16 +265,12 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
}
} else {
make_interact_pc(leaf, cps);
// if(threadIdx.x == 0 && leafnum == 23)
// printf("leafnum = %i with cps = %i here\n", leafnum, cps - cells);
__syncthreads();
}
}
}else{
for(cps = &cells[cj->firstchild]; cps!= &cells[cj->sibling]; cps = &cells[cps->sibling]) {
// if(threadIdx.x == 0 && leafnum == 23)
// printf("leafnum = %i with cps = %i\n", leafnum, cps - cells);
make_interact_pc(leaf, cps);
}
@@ -347,17 +290,6 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
struct cell *cp, *cps;
/*if(leaf->split)
{
printf("Leaf split = 1, oh dear.");
asm("trap;");
}
if(c->split > 1)
{
printf("Cell had split > 1\n");
asm("trap;");
}*/
/* Find the subcell of c the leaf is in.*/
/*cp = c;
@@ -402,31 +334,13 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
__device__ void iact_self_direct(int cellID) {
struct cell *c = &cells[cellID];
double xi[3] = {0.0, 0.0, 0.0};
float ai[3] = {0.0, 0.0, 0.0 }, mi, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
__shared__ double2 parts_xy[cell_maxparts];
__shared__ double parts_z[cell_maxparts];
__shared__ float4 parts_am[cell_maxparts];
float ai[3] = {0.0, 0.0, 0.0 }, mj, dx[3] = {0.0,0.0,0.0}, r2, ir, w;
int parts;
int count;
int i,j,k;
//if(threadIdx.x == 0)
// printf("%f, %f, %f, %f, %i\n", c->h, c->loc_xy.x, c->loc_xy.y, c->loc_z, c->split);
//If cell is split, interact each child with itself, and with each of its siblings.
/*if(c->split) {
//TODO
} else {*/
parts = c->parts;
count = c->count;
int z = threadIdx.x;
/* Load particle data into shared memory*/
/*for(k = threadIdx.x + parts; k < parts + count; k += blockDim.x , z += blockDim.x) {
parts_xy[z] = parts_pos_xy[k];
parts_z[z] = parts_pos_z[k];
parts_am[z] = parts_a_m[k];
}
__syncthreads();*/
for(i = parts+threadIdx.x; i < parts+count; i += blockDim.x)
{
xi[0] = parts_pos_xy[i].x;
@@ -435,9 +349,7 @@ __device__ void iact_self_direct(int cellID) {
for(k = 0; k < 3; k++) {
ai[k] = 0.0;
}
mi = parts_a_m[i].w;
//for(j = i+1; j!= i; j = (j+1)%count)
for(j = parts; j < parts+count; j++)
{
if(i != j){
@@ -453,7 +365,6 @@ __device__ void iact_self_direct(int cellID) {
/* Apply the gravitational acceleration. */
//ir = 1.0f / sqrtf(r2);
ir = rsqrtf(r2);
w = const_G * ir * ir * ir;
mj = parts_a_m[j].w;
@@ -479,7 +390,7 @@ __device__ void iact_self_direct(int cellID) {
*/
static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
int k;
// int k;
float dx[3];
#ifdef SANITY_CHECKS
@@ -508,7 +419,7 @@ static inline int are_neighbours_host(struct cell *ci, struct cell *cj) {
struct cell *cell_get()
{
struct cell *res;
// struct cell *res;
if(num_cells == 0)
{
@@ -645,9 +556,7 @@ void cell_split(int c, struct qsched *s) {
int left[8], right[8];
double pivot[3];
static int root = -1;
// struct cell *progenitors[8];
int progenitors[8];
int c1 = c;
double2 *temp_xy;
double *temp_z;
float4 *temp_a_m;
@@ -662,7 +571,6 @@ void cell_split(int c, struct qsched *s) {
{
if( cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[c].parts], 0) != cudaSuccess )
error("Failed to get host device pointer.");
// printf("tempxy = %p\n", temp_xy);
cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
sizeof(double2) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts);
}
@@ -680,7 +588,6 @@ void cell_split(int c, struct qsched *s) {
cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m,
sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts);
}
// error("Cell has no resource");*///TODO
if(count > cell_maxparts )
{
@@ -836,13 +743,9 @@ void cell_split(int c, struct qsched *s) {
/* Otherwise we're at a leaf so we need to make the cell's particle-cell task. */
} else {
// struct cell *data[2] = {root, c};
int data[2] = {root, c};
int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
2 * sizeof(int), 3000);
/*qsched_adduse(s, tid, cell_pool[root].res);
qsched_adduse(s, tid, cell_pool[root].resz);
qsched_adduse(s, tid, cell_pool[root].resm);*/
qsched_addlock(s, tid, cell_pool[c].res);
qsched_addlock(s, tid, cell_pool[c].resz);
qsched_addlock(s, tid, cell_pool[c].resm);
@@ -864,8 +767,7 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj){
qsched_task_t tid;
int data[2];
struct cell /**data[2],*/ *cp, *cps;
int cpi;
struct cell *cp, *cps;
if(cj == NULL)
@@ -967,11 +869,8 @@ qsched_funtype func;
void test_bh(int N, int runs, char *fileName) {
int i, k;
struct cell *root;
struct part *parts;
FILE *file;
struct qsched s;
ticks tic, toc_run, tot_setup = 0, tot_run = 0;
int countMultipoles = 0, countPairs = 0, countCoMs = 0;
struct cell *gpu_ptr_cells;
cudaFree(0);
@@ -1142,23 +1041,6 @@ float *comm_temp;
if( cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
error("Failed to copy com pointer to the GPU");
/* for(i = 0; i < s.count; i++){
int *idata = (int*)&s.data[s.tasks[i].data];
if(s.tasks[i].type == task_type_self)
printf("Self task with data[0] = %i\n", idata[0]);
if(s.tasks[i].type == task_type_pair) {
printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
}
if(s.tasks[i].type == task_type_self_pc) {
printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
}
}*/
// printf("com_mass_host[152] = %f\n", com_mass_host[152]);
//Run code.
// printf("gpu_data = %p\n", (int*)s.res[0].gpu_data);
qsched_run_CUDA( &s , func );
qsched_print_cuda_timers(&s);
@@ -1169,16 +1051,12 @@ struct task* tasks = qsched_get_timers( &s , s.count );
for(i = 0; i < s.count; i++)
{
printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
// printf("\n");
}
}
/* Dump the particles to a file */
file = fopen("particle_dump.dat", "w");
/* fprintf(file,
"# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x "
"a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n");*/
for (k = 0; k < N; ++k)
fprintf(file, "%e, %e, %e, %e, %e, %e, %e\n",
parts_a_m_host[k].w, parts_pos_xy_host[k].x, parts_pos_xy_host[k].y, parts_pos_z_host[k],
@@ -1228,9 +1106,6 @@ int main(int argc, char *argv[]) {
/* Tree node information */
printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
/* Part information */
// printf("Size of part: %zu bytes.\n", sizeof(struct part));
/* Dump arguments. */
if (fileName[0] == 0) {
message("Computing the N-body problem over %i random particles using %i "
This diff is collapsed.
@@ -67,7 +67,6 @@ __device__ void quarter(float *data)
__global__ void Manual(float *src)
{
int i;
int datas = blockIdx.x;
cuda_data[datas*1000+threadIdx.x] = src[datas*1000+threadIdx.x];
@@ -109,13 +108,13 @@ __global__ void Setup()
int main ( int argc , char *argv[] ) {
float *array, *cuda_array, *cuda_array2, *device_array;
float *array, *cuda_array, *device_array;
int i, k=0;
qsched_funtype func;
struct qsched s;
qsched_task_t *tid;
qsched_res_t *rid;
ticks tic, toc_run, tot_setup, tot_run = 0;
ticks tic, toc_run, tot_run = 0;
qsched_init( &s , 1 , qsched_flag_none );
cudaDeviceReset();
cudaSetDevice(0);
@@ -134,11 +133,6 @@ int main ( int argc , char *argv[] ) {
array[i] = (float)i;
}
// if(cudaHostGetDevicePointer(&cuda_array , &array[k*1000] , 0) != cudaSuccess)
//error("Failed to get device pointer for data: %s", cudaGetErrorString(cudaPeekAtLastError()));
// if ( cudaMallocHost(&array, M*sizeof(float)) != cudaSuccess )
// error("Failed to allocate array");
/* Initialize the scheduler. */
@@ -193,16 +187,9 @@ printf("Starting second run\n");
printf("Second run complete\n");
struct task *completed_tasks = qsched_get_timers( &s, s.count );
/* tic = getticks();
qsched_run_CUDA( &s , func );
toc_run = getticks();
message( "qsched_run_CUDA took %lli ticks..." , toc_run - tic );
tot_run += toc_run - tic;*/
// if( cudaFreeHost( array) != cudaSuccess)
// error("Failed to free array");
cudaDeviceReset();
@@ -215,8 +202,6 @@ cudaDeviceReset();
error("Failed to get device pointer for data: %s",cudaGetErrorString(cudaPeekAtLastError()));
if( cudaMalloc(&device_array , M*sizeof(float) ) != cudaSuccess )
error("Failed to allocate device array: %s", cudaGetErrorString(cudaPeekAtLastError()));
/*if( cudaMemcpy( &device_array , array , M*sizeof(float), cudaMemcpyHostToDevice ) != cudaSuccess )
error("Failed to copy device array: %s", cudaGetErrorString(cudaPeekAtLastError()));*/
if( cudaMemcpyToSymbol( cuda_data , &device_array,sizeof(float*), 0 , cudaMemcpyHostToDevice) != cudaSuccess )
error("Failed to copy array pointer to device: %s", cudaGetErrorString(cudaPeekAtLastError()));
tic = getticks();
@@ -229,6 +214,4 @@ cudaDeviceReset();
if(array[i] != ((float)i)*((float)i)*0.5f)
printf("%i wrong, %.3f != %.3f\n", i, array[i], ((float)i)*((float)i)*0.5f );
//printf("%.3f\n", array[2]);
}
@@ -22,11 +22,8 @@ extern "C"{
int g_size;
enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT} ;
//#define TID threadIdx.x
#define numthreads 128
//#define tilesize 32
#define cuda_maxtasks 1000000
//#define CO(x,y,ldm) (((y)*ldm) + (x))
#define WARPS 4
__device__ float *GPU_matrix;
@@ -145,7 +142,7 @@ float* generateColumnMatrix(int size, unsigned long int m_z)
{
float* matrix;
cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size/**size*/);
cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size);
if(code != cudaSuccess)
printf("%s size = %i g_size = %i\n", cudaGetErrorString(code),size, g_size);
else
@@ -211,8 +208,6 @@ __device__ inline void reduceSumMultiWarp( float* value )
}
__threadfence();
*value = stuff[group * 32];
//printf("Not supported.\n");
//asm("trap;");
#endif
}
@@ -236,7 +231,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
int i, j;
float norm=0.0, sign, u1, tau, z;
int TID = threadIdx.x % 32;
//int set = threadIdx.x / 32;
float w;
/*Find the householder vector for each row. */
for(i = 0; i < tilesize; i++)
@@ -253,7 +247,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
else
sign = 1;
// sign = __shfl(sign, i, 32);
norm = sqrt(norm);
if(TID >= i )
w = cornerTile[ i*tilesize + TID ];
@@ -262,13 +255,8 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
// if(TID==i)
u1 = cornerTile[i*tilesize + i] - sign*norm;
// __syncthreads();
// u1 = __shfl(u1, i, 32);
// if(i==1)
// printf("%.3f\n", u1);
if(u1 != 0.0)
{
if(TID > i)
@@ -285,8 +273,6 @@ __device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float*
else
tau = 0.0;
//if(TID == 0)
// printf("%.3f\n", sign);
/*Store the below diagonal vector */
if(TID > i)
@@ -358,17 +344,9 @@ void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int
/* Below Diagonal!*/
/*Compute w'*A_j*/
z = w * rowTile[j*tilesize+TID];
//for(n = i; n < tileSize; n++)
//{
// z = z + w[n] * rowTile[j*tileSize+n];
//}
reduceSumMultiWarp(&z);
if(TID >= i)
rowTile[j*tilesize + TID] = rowTile[j*tilesize + TID] - tauMatrix[(kk*tilesize+i)*tauNum + kk] * w * z;
//for(n = i; n < tileSize; n++)
//{
// rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
//}
}
......@@ -393,16 +371,14 @@ void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int
*/
__device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, int tilesize, int ii, int kk, volatile float* tauMatrix, int tauNum )
{
int i, j, n;
int i, j;
float norm=0.0, sign, u1, tau, z;
float w, wupper;
int TID = threadIdx.x % 32;
//int set = threadIdx.x / 32;
/* For each column compute the householder vector. */
for(i = 0; i < tilesize; i++)
{
// norm = 0.0;
wupper = cornerTile[i*tilesize + i];
@@ -446,33 +422,6 @@ __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, in
columnTile[i*tilesize+TID] = w;
if(threadIdx.x == 0)
tauMatrix[(kk*tilesize+i)*tauNum+ii] = tau;
/* Apply to each row to the right.*/
// for(j = i; j < tilesize; j++)
// {
/* Find w'*A_j, w is 0s except for first value with upper tile.*/
// z = 1.0 * cornerTile[ j*tilesize+i ];
// for(n = 0; n < tilesize; n++)
// {
// z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ];
// }
/* Apply to upper tile.*/
// cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z;
// for(n = i+1; n < tilesize; n++)
// {
// cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z;
// }
/* Apply to lower tile.*/
// for(n = 0; n < tilesize; n++)
// {
// columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z;
// }
// }
/* Store w*/
// for(j = 0; j < tilesize; j++){
// columnTile[ i*tilesize+j ] = w[tilesize+j];
// }
// tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau;
}
}
@@ -495,7 +444,7 @@ __device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, in
*/
__device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,volatile float* rowTile, int tilesize, int ii, int jj, int kk,volatile float* tauMatrix, int tauNum )
{
int i, j, n;
int i, j;
float z;
float w;
int TID = threadIdx.x % 32;
@@ -520,30 +469,6 @@ __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,vo
cornerTile[j*tilesize+TID] = cornerTile[j*tilesize+TID] - tau*w*z;
}
__syncthreads();
/*for(j = 0; j < i; j++)
w[j] = 0.0;
w[i] = 1.0;
for(j = i+1; j < tilesize; j++)
w[j] = 0.0;
for(j = 0; j < tilesize; j++)
w[j+tilesize] = columnTile[i*tilesize +j];*/
/* Apply householder vector (w) to the tiles.*/
/* for(j = 0; j < tilesize; j++)
{
z = 0.0;*/
/* Compute w' * A_j */
/* for(n = 0; n < tilesize; n++)
{
z += w[n] * rowTile[j*tilesize+n];
z += w[n + tilesize] * cornerTile[j*tilesize+n];
}
for(n = 0; n < tilesize; n++)
{
rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z;
cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z;
}
}*/
}
}
@@ -551,37 +476,25 @@ __device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,vo
__device__ void runner ( int type , void *data ) {
__shared__ volatile float blockCache[(32*32) + 128];
volatile float *workVector;
workVector = blockCache + (32*32);
/* Decode the task data. */
int *idata = (int *)data;
int i = idata[0];
int j = idata[1];
int k = idata[2];
// int z;
// double buff[ 2*K*K ];
/* Decode and execute the task. */
switch ( type ) {
case task_SGEQRF:
//if(threadIdx.x == 0)
// printf("SGEQRF %i %i\n", k, (k*cuda_m+k)*32*32);
if(threadIdx.x < 32)
SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m);
break;
case task_SLARFT:
// if(threadIdx.x == 0)
// printf("SLARFT %i %i %i\n", k,j, (j*cuda_m+k)*32*32);
SLARFT( &GPU_matrix[(k*cuda_m +k)*32*32], &GPU_matrix[(j*cuda_m+k)*32*32], 32, j, k, GPU_tau, cuda_m);
break;
case task_STSQRF:
// if(threadIdx.x == 0)
// printf("STSQRF %i %i %i\n", k,i, (k*cuda_m+i)*32*32);
if(threadIdx.x < 32)
STSQRF( &GPU_matrix[(k*cuda_m+k)*32*32], &GPU_matrix[(k*cuda_m + i)*32*32], 32, i, k, GPU_tau, cuda_m);
break;
case task_SSSRFT:
// if(threadIdx.x == 0)
// printf("SSSRFT %i %i %i %i\n", k,j, i, (j*cuda_m+i)*32*32);
SSSRFT( &GPU_matrix[(j*cuda_m+i)*32*32],&GPU_matrix[(k*cuda_m + i)*32*32],&GPU_matrix[(j*cuda_m+k)*32*32], 32, i, j, k, GPU_tau, cuda_m );
break;
default:
@@ -630,8 +543,6 @@ float* columnToTile( float* columnMatrix, int size , int m , int n )
cudaMallocHost(&TileMatrix, sizeof(float) * size );
if(TileMatrix == NULL)
error("failed to allocate TileMatrix");
int rows = m*32;
int columns = n*32;
int i,j,k,l;
for( i = 0; i < n ; i++ )
@@ -691,8 +602,6 @@ float* createIdentity(int m, int n)
cudaMallocHost(&Matrix, sizeof(float) * m*n*32*32);
if(Matrix == NULL)
error("Failed to allocate Matrix");
int rows = m*32;
int columns = n*32;
int i, j;
memset ( Matrix, 0, sizeof(float)*m*n*32*32 );
@@ -729,24 +638,6 @@ void printMatrix(float* Matrix, int m, int n)
}
/*void printTileMatrix(float* Matrix, int m, int n)
{
int i,j;
float *tempMatrix, *tempMatrix2;
for(i = 0; i < m*32; i++)
{
int tiled = i/32;
tempMatrix = &Matrix[tiled*32*32];
for(j = 0; j < n*32; j++)
{
int tile = j/32;
tempMatrix2 = &tempMatrix[tile*32*32*m + i%32];
printf(" %.1f ", tempMatrix2[j*32]);
//printf(" %.1f,%i ", tempMatrix2[j*32], &tempMatrix2[j*32]-Matrix);
}
printf("\n");
}
}*/
qsched_funtype func;
@@ -759,7 +650,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
qsched_task_t *tid, tid_new;
qsched_res_t *rid;
int data[3];
ticks tic, toc_run, tot_setup, tot_run = 0;
// ticks tic, tot_run = 0;
#ifdef NO_LOADS
ticks tic_loads;
#endif
@@ -776,22 +667,15 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
/* Allocate and fill the original matrix. */
if(cudaMallocHost(&A, sizeof(float) * m * n * K * K ) != cudaSuccess ||
cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess /*||
cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K ) != cudaSuccess*/ )
cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess )
error("Failed to allocate matrices.");
g_size = g_size + sizeof(float) * m * n * K * K + sizeof(float) * m * n * K;
//cudaFreeHost(A_orig);
// for ( k = 0 ; k < m * n * K * K ; k++ )
// A_orig[k] = 2.0f*((float)rand()) / RAND_MAX - 1.0f;
A_orig = generateColumnMatrix(m*n*K*K, 35532);
// printMatrix(A_orig, m, n);
float *temp = columnToTile(A_orig, m * n * K *K, m , n);
cudaFreeHost(A_orig);
A_orig = temp;
temp = tileToColumn(A_orig, m*n*K*K, m , n, K);
// printMatrix(temp, m, n);
// printTileMatrix(A_orig, m, n);
memcpy( A , A_orig , sizeof(float) * m * n * K * K );
bzero( tau , sizeof(float) * m * n * K );
@@ -814,7 +698,6 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
error("Failed to copy matrix pointer to the device");
/* Allocate and init the task ID and resource ID matrix. */
tic = getticks();
/* Allocate and init the task ID and resource ID matrix. */
if( cudaMallocHost(&tid , sizeof(qsched_task_t) * m*n ) != cudaSuccess )
error("Failed to allocate tid");
@@ -860,7 +743,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
data[0] = i; data[1] = k; data[2] = k;
tid_new = qsched_addtask( &s , task_STSQRF , task_flag_none , data , sizeof(int)*3 , 3 );
qsched_addlock(&s, tid_new, rid[k * m + i]);
qsched_adduse(&s, tid_new, rid[k * m + k]);
qsched_addlock(&s, tid_new, rid[k * m + k]);
qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
tid[k * m + i] = tid_new;
@@ -871,7 +754,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
tid_new = qsched_addtask( &s , task_SSSRFT , task_flag_none , data , sizeof(int)*3 , 5 );
qsched_addlock(&s, tid_new, rid[j * m + i]);
qsched_adduse(&s, tid_new, rid[k * m + i]);
qsched_adduse(&s, tid_new, rid[j * m + k]);
qsched_addlock(&s, tid_new, rid[j * m + k]);
qsched_addunlock(&s, tid[k * m + i], tid_new);
qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
@@ -905,15 +788,12 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
// printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n);
/* float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
float *Q = computeQ(tempMatrix, m*K, K, tau, m);
float *R = getR(tempMatrix, m*K);
cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
m*K, R, m*K, 0.0, tempMatrix, m*K);
free(Q);
// printMatrix(tempMatrix, m, n);
// printf("\n\n\n\n");
Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
for(i = 0; i < m * n * K * K; i++)
{
@@ -922,15 +802,16 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i],
tempMatrix[i]);
}
printf("Checked for correctness\n");
free(tempMatrix);
free(Q);
free(R);*/
free(R);
// cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyHostToDevice);
// A = tileToColumn(A,m * n * K * K, m, n, K);
// printMatrix(A, m, n);
// printTileMatrix(A, m , n);
struct task* tasks = qsched_get_timers( &s , s.count );
/* struct task* tasks = qsched_get_timers( &s , s.count );
for(i = 0; i < s.count; i++)
{
printf("%i %lli %lli %i ", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
@@ -947,7 +828,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
printf("0 0");
printf("\n");
}
free(tasks);
free(tasks);*/
cudaDeviceReset();
}
@@ -956,7 +837,6 @@ void test()
{
int m = 2, n = 2;
float* Matrix = createIdentity(m,n);
//printMatrix(Matrix, m, n);
float* MatrixTile = columnToTile(Matrix, m*n*32*32, m, n);
printMatrix(&MatrixTile[0], 1, 1);
printf("\n \n \n");
@@ -966,7 +846,6 @@ void test()
printf("\n \n \n");
printMatrix(&MatrixTile[3*32*32], 1, 1);
printf("\n \n \n");
//printTileMatrix(MatrixTile, m, n);
free(Matrix);
Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n, 32);
printMatrix(Matrix, m, n);
@@ -988,11 +867,9 @@ __global__ void SGEQRF_test(float* cornerTile)
for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
{
// printf("Copying value %i\n", i);
tile[i] = cornerTile[i];
}
// printf("blockDim.x = %i\n", blockDim.x);
__syncthreads();
if(threadIdx.x < 32)
SGEQRF(tile, 32, tau, 0, 1);
@@ -1029,7 +906,6 @@ __global__ void SLARFT_test(float* cornerTile, float* rowTile)
SGEQRF(tile, 32, tau, 0, 2);
__syncthreads();
// if(threadIdx.x < 32)
SLARFT(tile, tile2, 32, 1, 0, tau, 2);
__syncthreads();
for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
@@ -1165,12 +1041,6 @@ int main ( int argc , char *argv[] ) {
int c, nr_threads=128;
int M = 4, N = 4, runs = 1, K = 32;
/* Get the number of threads. */
//#pragma omp parallel shared(nr_threads)
// {
// if ( omp_get_thread_num() == 0 )
// nr_threads = omp_get_num_threads();
// }
/* Parse the options */
while ( ( c = getopt( argc , argv , "m:n:k:r:t:D" ) ) != -1 )
@@ -1194,7 +1064,6 @@ int main ( int argc , char *argv[] ) {
case 't':
if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
error( "Error parsing number of threads." );
//omp_set_num_threads( nr_threads );
break;
case 'D':
runTests();
@@ -1206,9 +1075,6 @@ int main ( int argc , char *argv[] ) {
exit( EXIT_FAILURE );
}
/* Dump arguments. */
// message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
// 32*M , 32*N , nr_threads , runs );
test_qr( M , N , K , nr_threads , runs );
#!/bin/bash
FLAGS2="-Xcompiler=-fsanitize=address -Xcompiler=-fno-omit-frame-pointer"
DEBUG_FLAGS="-G -DDEBUG_GPU"
FLAGS="-O3 -g -G -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
FLAGS="-O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx -Xptxas -dlcm=cg --maxrregcount=32 -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -lgomp -DWITH_CUDA -DTIMERS -ccbin=/usr/bin/gcc-4.8"
# -DGPU_locks -Xptxas -dlcm=cg -Xptxas="-v""
# -DNO_LOADS
@@ -36,3 +36,7 @@ cd ../examples
/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_2.cu -lprofiler
/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_2.o ../src/.libs/libquicksched_cuda.a -o test_bh_2 -lprofiler
/home/aidan/cuda_6.0/bin/nvcc $FLAGS -dc -m64 -I../src -dc -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda test_bh_3.cu -lprofiler
/home/aidan/cuda_6.0/bin/nvcc $FLAGS -m64 -I../src -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -Xnvlink -v test_bh_3.o ../src/.libs/libquicksched_cuda.a -o test_bh_3 -lprofiler
This diff is collapsed.
@@ -163,8 +163,9 @@ struct qsched {
/* Function prototypes. */
/* Internal functions. */
void qsched_quicksort ( int *data , int *ind , int N , int min , int max );
void qsched_sort ( int *data , int *ind , int N , int min , int max );
void qsched_sort_rec ( int *data , int *ind , int N , int min , int max );
//void qsched_sort_rec ( int *data , int *ind , int N , int min , int max );
struct task *qsched_gettask ( struct qsched *s , int qid );
void qsched_done ( struct qsched *s , struct task *t );
void *qsched_getdata( struct qsched *s , struct task *t );