diff --git a/examples/test_qr.cu b/examples/test_qr.cu
index 1e8be2456b665d6f53395e0a07ed1604c2d60687..6f075d4b58b3b2c03c6d2912747102c543f3d333 100644
--- a/examples/test_qr.cu
+++ b/examples/test_qr.cu
@@ -14,6 +14,7 @@
 /* Local includes. */
 extern "C"{
 #include "quicksched.h"
+#include "res.h"
 #include <cblas.h>
 }
 #include "cuda_queue.h"
@@ -565,7 +566,7 @@ __device__ void runner ( int type , void *data ) {
                 //if(threadIdx.x == 0)
                   //  printf("SGEQRF %i %i\n", k, (k*cuda_m+k)*32*32);
                 if(threadIdx.x < 32)
-                SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m);
+                    SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m);
                 break;
             case task_SLARFT:
         //        if(threadIdx.x == 0)
@@ -759,6 +760,9 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
     qsched_res_t *rid;
     int data[3];
     ticks tic, toc_run, tot_setup, tot_run = 0;
+    #ifdef NO_LOADS
+    ticks tic_loads;
+    #endif
  /* Initialize the scheduler. */
     qsched_init( &s , 1 , qsched_flag_none );
     cudaDeviceReset();
@@ -800,6 +804,12 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 
     if( cudaMalloc( &device_array , sizeof(float) * m * n * K * K ) != cudaSuccess )
         error("Failed to allocate the matrix on the device");
+    #ifdef NO_LOADS
+    tic = getticks();
+    if( cudaMemcpy( A, device_array, sizeof(float) * m * n * K *K, cudaMemcpyHostToDevice ) != cudaSuccess)
+        error("Failed to copy matrix to device");
+    tic_loads = getticks() - tic;
+    #endif
     if( cudaMemcpyToSymbol( GPU_matrix , &device_array , sizeof(float *) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
         error("Failed to copy matrix pointer to the device");
 
@@ -820,6 +830,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
         rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none , &A[k*32*32], sizeof(float) * 32 * 32, device_array + k * 32 *32);
         }
 
+
  /* Build the tasks. */
     for ( k = 0 ; k < m && k < n ; k++ ) {
     
@@ -882,9 +893,18 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 
         qsched_run_CUDA( &s , func );
 
+    #ifdef NO_LOADS
+    tic = getticks();
+    if( cudaMemcpy( device_array, A, sizeof(float) * m * n * K *K, cudaMemcpyDeviceToHost ) != cudaSuccess)
+        error("Failed to copy matrix from device");
+    tic_loads += getticks() - tic;double itpms = 1000.0 / CPU_TPS;
+    printf("%.3f\n", ((double)(tic_loads)) * itpms );
+    #endif
     if(cudaMemcpy(  tau , tau_device , sizeof(float) * m * n * K , cudaMemcpyDeviceToHost ) != cudaSuccess )
         error("Failed to copy the tau data from the device.");
 
+
+    
    // printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n);
    /*float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
    float *Q = computeQ(tempMatrix, m*K, K, tau, m);
@@ -911,11 +931,24 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
    // printMatrix(A, m, n);
 //    printTileMatrix(A, m , n);
     struct task* tasks =  qsched_get_timers( &s , s.count );
-    /*for(i = 0; i < s.count; i++)
+/*    for(i = 0; i < s.count; i++)
     {
-        printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
+      printf("%i %lli %lli %i ", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
+        if(tasks[i].type < 0)
+        {
+        int *tempdata = (int*)s.data;
+        int *idata = &tempdata[tasks[i].data/4];
+        int ii = idata[0];
+        int row = ii/m;
+        int col = ii%m;
+           printf("%i %i", col, row);
+
+        }else
+            printf("0 0");
+        printf("\n");
     }*/
-
+    free(tasks);
+cudaDeviceReset();
 }
 
 
@@ -1174,8 +1207,8 @@ int main ( int argc , char *argv[] ) {
 	        }
             
     /* Dump arguments. */
-    message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
-        32*M , 32*N , nr_threads , runs );
+  //  message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
+    //    32*M , 32*N , nr_threads , runs );
         
     test_qr( M , N , K , nr_threads , runs );
     
diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu
index 45add67c108e3d9a0c2ddcfdb07d76c4eb58a1b8..5a684959016050b6e3d39bb77ce76c051fb0023d 100644
--- a/src/cuda_queue.cu
+++ b/src/cuda_queue.cu
@@ -63,7 +63,24 @@ __device__ qsched_funtype fun;
 
 
 
-#ifdef GPU_locks
+__device__ __inline__ int cuda_trymultilock ( volatile int *l ) {
+
+    int res = atomicAdd( (int *)l, 1);
+        printf("res = %i\n", res);
+    res = res <= 0;
+    if( res == 0)
+    {
+        atomicAdd( (int *)l, -1);
+        return 0;
+    }else
+        return 1;
+    
+}
+
+__device__ __inline__ void cuda_multiunlock ( volatile int *l ) {
+    atomicAdd( (int *)l, -1 );
+}
+
 __device__ __inline__ int cuda_trylock ( volatile int *l ) {
     int res = atomicCAS( (int *)l, 0 , 1 );
     return res;
@@ -78,7 +95,7 @@ __device__ __inline__ int cuda_unlock ( volatile int *l ) {
     int res = atomicCAS( (int *)l , 1 , 0) != 1 ;
     return res;
 }   
-#endif
+
 
 /**
  * @brief Copy bulk memory in a strided way.
@@ -92,12 +109,23 @@ __device__ __inline__ void cuda_memcpy_tasks ( void *dest , void *source , int c
 
     int k;
     int *idest = (int *)dest, *isource = (int *)source;
-    
+    int val1, val2, val3, val4;
     
     /* Copy the data in chunks of sizeof(int). */
-    for ( k = threadIdx.x ; k < count/sizeof(int) ; k += blockDim.x ){
-        idest[k] = isource[k];
+    for ( k = threadIdx.x ; k + 4*blockDim.x < count/sizeof(int) ; k += 4*blockDim.x ){
+        val1 = isource[k];
+        val2 = isource[k+blockDim.x];
+        val3 = isource[k+2*blockDim.x];
+        val4 = isource[k+3*blockDim.x];
+        idest[k] = val1;
+        idest[k+blockDim.x] = val2;
+        idest[k+2*blockDim.x] = val3;
+        idest[k+3*blockDim.x] = val4;
         }
+    for( ; k < count/sizeof(int); k+=blockDim.x)
+    {
+        idest[k] = isource[k];
+    }
         
     }
  
@@ -344,6 +372,7 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
  * This routine blocks until a valid task is picked up, or the
  * specified queue is empty.
  */
+#ifndef PRIQ
 __device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
 
     int tid = -1;
@@ -368,6 +397,147 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
     return tid;
 
     }
+#endif
+#ifdef PRIQ
+
+__device__ int get_best_task(struct queue_cuda *q )
+{
+
+    int ind1, ind2, tid1 = -1, tid2 = -1;
+    
+    /* Don't even try... */
+   // if ( q->rec_count == q->count )
+     //   return -1;
+
+    /* Get the index of the next task. */
+    ind1 = atomicAdd( &q->first , 1 );
+
+    if( atomicAdd((int*)&q->nr_avail_tasks, -1) <=0 )
+    {
+        atomicAdd((int*)&q->nr_avail_tasks, 1);
+        ind1 %= cuda_queue_size;
+        /* Loop until there is a valid task at that index. */
+        while ( tot_num_tasks > 0 && ( tid1 = q->data[ind1] ) < 0);
+        
+        if(tasks_cuda[tid1].type == type_load)
+            cuda_trymultilock(&res_cuda[tasks_cuda[tid1].locks[0]].lock);   
+        /* Scratch the task from the queue */
+        if ( tid1 >= 0 )
+        {
+            q->data[ind1] = -1;
+            atomicAdd((int*) &tot_num_tasks, -1);
+        }
+        /* Return the acquired task ID. */
+        return tid1;
+    }
+    ind2 = atomicAdd( &q->first, 1);
+    /* Wrap the index. */
+    ind1 %= cuda_queue_size; 
+    ind2 %= cuda_queue_size; 
+    /* Loop until there is a valid task at that index. */
+    while ( tot_num_tasks > 0 && ( tid1 = q->data[ind1] ) < 0);
+         //   atomicAdd((int*) &tot_num_tasks, -1);
+    while ( tot_num_tasks > 0 && ( tid2 = q->data[ind2] ) < 0);
+
+    if(tid1 < 0 && tid2 < 0)
+        return -1;
+
+    int pri1, pri2;
+        pri1 = tasks_cuda[tid1].weight;
+        pri2 = tasks_cuda[tid2].weight;
+
+
+    int res1, res2;
+    if( (tid1 >= 0 && tasks_cuda[tid1].type == type_load) )
+    {
+        res1 = cuda_trymultilock(&res_cuda[tasks_cuda[tid1].locks[0]].lock);
+        //printf("%i %i\n", res1, res_cuda[tasks_cuda[tid1].locks[0]].lock);
+    }
+    else
+        res1 = 1;
+
+    if( tid2 >= 0 && tasks_cuda[tid2].type == type_load && (tid1 >= 0 && tasks_cuda[tid1].type != type_load ))
+        res2 = cuda_trymultilock(&res_cuda[tasks_cuda[tid2].locks[0]].lock);
+    else
+        res2 = 1;
+
+    if(res1 == 0 && res2 == 0)
+    {
+        cuda_multiunlock(&res_cuda[tasks_cuda[tid1].locks[0]].lock);
+        q->data[ind1] = -1;
+        q->data[ind2] = -1;
+        cuda_queue_puttask( q, tid1);
+        cuda_queue_puttask( q, tid2);
+        return -1;
+    }
+
+    if(res1 == 0)
+    {
+           cuda_multiunlock(&res_cuda[tasks_cuda[tid1].locks[0]].lock);
+            q->data[ind1] = -1;
+            q->data[ind2] = -1;
+            atomicAdd((int*) &tot_num_tasks, -1);
+            cuda_queue_puttask( q, tid1);
+            return tid2;
+    }
+
+    if(res2 == 0)
+    {
+           cuda_multiunlock(&res_cuda[tasks_cuda[tid2].locks[0]].lock);
+            q->data[ind1] = -1;
+            q->data[ind2] = -1;
+            atomicAdd((int*) &tot_num_tasks, -1);
+            cuda_queue_puttask( q, tid2);
+            return tid1;
+    }
+    
+
+
+
+
+    if(pri1 >= pri2)
+    {
+
+        q->data[ind1] = -1;
+        q->data[ind2] = -1;
+        atomicAdd((int*) &tot_num_tasks, -1);
+        cuda_queue_puttask( q, tid2);
+//        atomicAdd((int*)&q->nr_avail_tasks, 1);
+
+        return tid1;
+    }else
+    {
+        q->data[ind1] = -1;
+        q->data[ind2] = -1;
+        atomicAdd((int*) &tot_num_tasks, -1);
+        cuda_queue_puttask( q, tid1);
+  //      atomicAdd((int*)&q->nr_avail_tasks, 1);
+        return tid2;
+    }
+    
+}
+
+
+__device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
+
+    int tid = -1;
+    if( atomicAdd((int*)&q->nr_avail_tasks, -1) <= 0)
+    {
+        atomicAdd((int*)&q->nr_avail_tasks, 1);
+        return -1;
+    }
+//    atomicAdd((int*)&q->nr_avail_tasks, -1);
+    tid = get_best_task(q);
+        
+    /* Put this task into the recycling queue, if needed. */
+    if ( tid >= 0 ) {
+            q->rec_data[ atomicAdd( (int *)&q->rec_count , 1 ) ] = tid;
+        }
+    /* Return whatever we got. */
+    return tid;
+
+    }
+#endif
 
 
 #endif
@@ -383,19 +553,39 @@ __global__ void qsched_device_kernel ( )
     /* Pull a task from the queues*/
     while ( 1 ) {
         __syncthreads();
+        /*if(threadIdx.x == 0 && blockIdx.x == 0)
+        {
+            printf("%i %i %i \n", cuda_queues[0].nr_avail_tasks, cuda_queues[0].data[0], cuda_queues[0].data[1]);
+            break;
+        }
+        else
+            break;*/
         if ( threadIdx.x == 0 ) {
              tid = -1;
                 /* Highest priority queue, holds the unload tasks. */
+#ifndef PRIQ
+#ifndef NO_LOADS
                 if(cuda_queues[2].nr_avail_tasks > 0 )
                 tid = runner_cuda_gettask( &cuda_queues[2] );
+#endif
+
 
              /* Middle priority queue, contains user-specifed tasks. */
              if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 )
                 tid = runner_cuda_gettask ( &cuda_queues[0]);
-
+#ifndef NO_LOADS
              /* Low priority queue, contains the load tasks. */
-             if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0)
+             if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0 /*&& blockIdx.x < 1*/ )
+{
                 tid = runner_cuda_gettask ( &cuda_queues[1]);
+
+}
+#endif
+
+#else
+            if(tot_num_tasks > 0 && cuda_queues[0].nr_avail_tasks > 0)
+                tid = runner_cuda_gettask ( &cuda_queues[0] );
+#endif
             }
             
         /*Everyone wait for us to get a task id*/
@@ -409,6 +599,7 @@ __global__ void qsched_device_kernel ( )
         if(tid < 0 && cuda_queues[0].nr_avail_tasks == 0 && cuda_queues[1].nr_avail_tasks == 0 && cuda_queues[2].nr_avail_tasks == 0)
             break;
         #endif
+
         /* If we couldn't find a task but some are not completed, try again. */
         if(tid < 0)
             continue;
@@ -427,6 +618,11 @@ __global__ void qsched_device_kernel ( )
             src = (int*)res_cuda[d[0]].data;
             dest = (int*)res_cuda[d[0]].gpu_data;
             cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, tid);
+            #ifdef PRIQ
+            __syncthreads();
+            if(threadIdx.x == 0)
+                cuda_multiunlock(&res_cuda[tasks_cuda[tid].locks[0]].lock);
+            #endif
         }else if( tasks_cuda[tid].type == type_unload )
         {
         int *d = (int*)&data_cuda[tasks_cuda[tid].data];
@@ -452,12 +648,16 @@ __global__ void qsched_device_kernel ( )
             if( atomicSub( &tasks_cuda[tasks_cuda[tid].unlocks[i]].wait , 1 ) == 1 && !( tasks_cuda[tasks_cuda[tid].unlocks[i]].flags & task_flag_skip ))
             {
                 /* Place unloads into highest priority queue, any other task goes to normal priority queue. Load tasks are never unlocked.*/
+                #ifdef PRIQ
+                    cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] );
+                #else
                 if(tasks_cuda[tasks_cuda[tid].unlocks[i]].type != type_unload)
                 {
                     cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] );
                 }
                 else
                     cuda_queue_puttask( &cuda_queues[2] , tasks_cuda[tid].unlocks[i] );
+                #endif
             }
         }
     }
@@ -477,15 +677,21 @@ __global__ void qsched_device_kernel ( )
             
 
             /* Reset values.*/
+            /*printf("cuda_queues[0].first = %i, nr_avail_tasks = %i, tot_num_tasks = %i, data[first] = %i\n", cuda_queues[0].first, cuda_queues[0].nr_avail_tasks ,tot_num_tasks, cuda_queues[0].data[cuda_queues[0].first] );
+            printf("cuda_queues[0].last = %i, data[last-1] = %i\n", cuda_queues[0].last ,cuda_queues[0].data[cuda_queues[0].last] );
+            for(tid = cuda_queues[0].first; tid < cuda_queues[0].first + cuda_queues[0].nr_avail_tasks; tid++)
+            {   
+                printf("task =%i, type = %i\n", cuda_queues[0].data[tid%cuda_queue_size], tasks_cuda[cuda_queues[0].data[tid%cuda_queue_size]].type ); 
+            } */
             cuda_queues[0].first = 0;
             cuda_queues[0].last = 0;
             cuda_queues[0].rec_count = 0;
-            cuda_queues[1].first = 0;
+            /*cuda_queues[1].first = 0;
             cuda_queues[1].last = cuda_queues[1].count;
             cuda_queues[1].rec_count = 0;
             cuda_queues[2].first = 0;
             cuda_queues[2].last = 0;
-            cuda_queues[2].rec_count = 0;
+            cuda_queues[2].rec_count = 0;*/
             }
 
         //TODO
@@ -527,7 +733,7 @@ int minVal( int *array, int size )
    return maxi ;
 }
 
-void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, int parent, int *res, int *sorted )
+void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, int parent, int *res, int *sorted)
 {
     int i,j;
     int task, utask;
@@ -788,7 +994,7 @@ if(s->res[0].task != -1)
     return;
 }
 double itpms = 1000.0 / CPU_TPS;
-ticks tic, toc_run ;
+ticks tic, toc_run, toc2 ;
     tic = getticks();
 
 /* Expand the deps array so we can add new dependencies in place. */
@@ -807,7 +1013,7 @@ j = 0;
 if ( ( deps_new = (int*) malloc(s->size_deps * sizeof(int) ) ) == NULL ||
     ( deps_new_key = (int*) malloc(  s->size_deps * sizeof(int) ) ) == NULL )
         error( "Failed to allocate new deps lists." );
-
+tic = getticks();
 /* Copy the dependency list to the new list, leaving a space between each task to fit unload dependencies in.*/
 for(i = 0; i < s->count; i++)
 {
@@ -837,7 +1043,7 @@ for(i = 0; i < s->count; i++)
         free(deps_new_key);
         deps_new = temp1;
         deps_new_key = temp2;
-//        printf("Stretch at line 828. m = %i\n", m);
+       // printf("Stretch at line 828. m = %i\n", m);
     t = &s->tasks[i];
     }
     int start_j = j;
@@ -870,6 +1076,7 @@ for(i = 0; i < s->count; i++)
 
 s->count_deps = j;
 
+toc2 = getticks() - tic;
 
 /* Store number of children for each resource*/
 sorted = (int*) malloc(sizeof(int) * (s->count_res+1));
@@ -1012,6 +1219,7 @@ for( i = sorted[s->count_res]; i < s->count_res; i++ )
 }
 
 toc_run = getticks();
+tic = getticks();
 //	message( "Creating load tasks took %.3f ms" , ((double)(toc_run - tic)) * itpms );
 
 
@@ -1049,7 +1257,7 @@ num_uses = (int*) malloc(sizeof(int) * s->count_res );
 size_uses = (int*) malloc(sizeof(int) * s->count_res);
 for(i = 0; i < s->count_res; i++ )
 {
-    usage_list[i] = (int*) malloc(sizeof(int) * s->count_uses / s->count_res + 1);
+    usage_list[i] = (int*) malloc(sizeof(int) * (s->count_uses / s->count_res + 1));
     num_uses[i] = 0;
     size_uses[i] = s->count_uses / s->count_res + 1;   
 }
@@ -1064,6 +1272,7 @@ for(i = 0; i < s->count; i++)
     for(j = 0; j < t->nr_uses; j++)
     {
         t->unlocks[t->nr_unlocks] = s->res[t->uses[j]].utask;
+        s->tasks[s->res[t->uses[j]].utask].wait_init +=1 ;
         deps_new_key[(t->unlocks - deps_new) + t->nr_unlocks] = i;
         t->nr_unlocks++;
         if(num_uses[t->uses[j]] == size_uses[t->uses[j]])
@@ -1080,6 +1289,7 @@ for(i = 0; i < s->count; i++)
     for(j = 0; j < t->nr_locks; j++)
     {
         t->unlocks[t->nr_unlocks] = s->res[t->locks[j]].utask;
+        s->tasks[s->res[t->locks[j]].utask].wait_init +=1 ;
         deps_new_key[(t->unlocks - deps_new) + t->nr_unlocks] = i;
         t->nr_unlocks++;
         if(num_uses[t->locks[j]] == size_uses[t->locks[j]])
@@ -1134,7 +1344,7 @@ for(i = 0; i < s->count_res; i++ )
             free(deps_new_key);
             deps_new = temp1;
             deps_new_key = temp2;
-//        printf("Stretch at line 1102, m = %i.\n", m);
+      //  printf("Stretch at line 1102, m = %i.\n", m);
         }
         for(j = 0; j < numChildren; j++)
         {
@@ -1143,6 +1353,7 @@ for(i = 0; i < s->count_res; i++ )
             if( child->utask != resource->utask )
             {
                 s->tasks[resource->utask].unlocks[ s->tasks[resource->utask].nr_unlocks ] = child->utask;
+                s->tasks[child->utask].wait_init += 1;
                 deps_new_key[s->count_deps] = resource->utask;
                 s->tasks[resource->utask].nr_unlocks += 1;
                 s->count_deps += 1;
@@ -1183,11 +1394,12 @@ for(i = 0; i < s->count_res; i++ )
                     free(deps_new_key);
                     deps_new = temp1;
                     deps_new_key = temp2;
-//                    printf("Stretch at line 1151.\n");
+        //            printf("Stretch at line 1151.\n");
                 }                
                 for(k = 0; k < num_uses[ res[ sorted[ ID ] + j ] ]; k++)
                 {
                     s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ sorted[ ID ] +j ] ][k];
+                    s->tasks[ usage_list[ res[ sorted[ ID ] +j ] ][k] ].wait_init += 1;
                     deps_new_key[s->count_deps] = resource->task;
                     s->tasks[resource->task].nr_unlocks += 1;
                     s->count_deps += 1;
@@ -1217,12 +1429,13 @@ for(i = 0; i < s->count_res; i++ )
             free(deps_new_key);
             deps_new = temp1;
             deps_new_key = temp2;
-//        printf("Stretch at line 1185.\n");
+   //     printf("Stretch at line 1185.\n");
     }
     if( parent > 0 )
     {
         s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = s->res[parent].task;
         deps_new_key[s->count_deps] = resource->task;
+        s->tasks[s->res[parent].task].wait_init += 1;
         s->tasks[resource->task].nr_unlocks += 1;
         s->count_deps += 1;
     }
@@ -1231,6 +1444,7 @@ for(i = 0; i < s->count_res; i++ )
     {
         s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ i ] ][k];
         deps_new_key[s->count_deps] = resource->task;
+        s->tasks[usage_list[ res[ i ] ][k]].wait_init += 1;
         s->tasks[resource->task].nr_unlocks += 1;
         s->count_deps += 1;
     }
@@ -1242,7 +1456,8 @@ free(s->deps_key);
 s->deps = deps_new;
 s->deps_key = deps_new_key;
 s->flags &= ~qsched_flag_dirty;
-tic = getticks();
+toc2 += getticks() - tic;
+
 //printf("Number tasks = %i\n", s->count);
 //printf("Number dependencies = %i\n", s->count_deps);
 /* Set up dependencies with the rest of the system.*/
@@ -1388,8 +1603,20 @@ tic = getticks();
 }*/
 //qsched_prepare_deps( s );
 //printf("Number dependencies = %i\n", s->count_deps);
+#ifdef PRIQ
+int PCI_res;
+PCI_res = qsched_addres(s , qsched_owner_none , qsched_res_none , NULL, 0 , NULL);
+s->res[PCI_res].lock = PCIEX;
+for(i = 0; i < s->count; i++)
+{
+    if(s->tasks[i].type == type_load)
+    {
+        qsched_addlock(s, i, PCI_res);
+    }
+}
+#endif
 toc_run = getticks();
-//	message( "Setting up dependencies took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+	//message( "Setting up dependencies took %.3f ms" , toc2 * itpms );
 //error("Got to here");
 }
 
@@ -1526,7 +1753,9 @@ ticks tic, toc_run ;
             }
         }
 
+#ifndef NO_LOADS
 qsched_prepare_loads(s);
+#endif
 toc_run = getticks();
 //	message( "prepare_loads took %.3f ms" , ((double)(toc_run - tic)) * itpms );
     
@@ -1539,7 +1768,7 @@ toc_run = getticks();
     count = s->count;
     
     /* If the sched is dirty... */
-    if ( s->flags & qsched_flag_dirty ) {
+    if ( 1 /*s->flags & qsched_flag_dirty*/ ) {
     
         /* Do the sorts in parallel, if possible. */
        // #pragma omp parallel
@@ -1551,7 +1780,7 @@ toc_run = getticks();
 
             /* Sort the locks. */
         //    #pragma omp single nowait
-         //   qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 );
+            qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 );
 
             /* Sort the uses. */
            // #pragma omp single nowait
@@ -1583,25 +1812,30 @@ toc_run = getticks();
         tasks[k].wait = 0;
     }    
 
-    int* store_waits;
-    if( (store_waits = (int*) malloc(sizeof(int)*s->count) ) == NULL)
-        error("Failed to allocate store_waits");
-
     /* Run through the tasks and set the waits... */
-    for ( k = 0 ; k < count ; k++ ) {
+/*    for ( k = 0 ; k < count ; k++ ) {
         t = &tasks[k];
         if ( !( t->flags & task_flag_skip ) )
             for ( j = 0 ; j < t->nr_unlocks ; j++ )
             {
                 tasks[ t->unlocks[j] ].wait += 1;
             }
-        }
+        }*/
+
+    for( k = 0; k < count; k++ )
+    {
+        t = &tasks[k];
+        t->wait = t->wait_init;
+//        if(t->wait != t->wait_init)
+  //      {
+    //        printf("Task ID %i has wait %i and wait_init %i and type %i\n", k, t->wait, t->wait_init, t->type);
+      //  }
+    }
         
     /* Sort the tasks topologically. */
     int *tid = (int *)malloc( sizeof(int) * count );
     for ( j = 0 , k = 0 ; k < count ; k++ )
     {
-        store_waits[k] = tasks[k].wait;
         if ( tasks[k].wait == 0 ) {
             tid[j] = k;
             j += 1;
@@ -1671,19 +1905,36 @@ toc_run = getticks();
      }   
     /* Run through the topologically sorted tasks backwards and
        set their weights, re-setting the waits while we're at it. */
+#ifdef PRIQ
     for ( k = count-1 ; k >= 0 ; k-- ) {
         int maxweight = 0;
         t = &tasks[ tid[k] ];
-        t->wait = store_waits[tid[k]];
+        t->wait = t->wait_init;
         for ( j = 0 ; j < t->nr_unlocks ; j++ ) {
             if ( tasks[ t->unlocks[j] ].weight > maxweight )
                 maxweight = tasks[ t->unlocks[j] ].weight;
             }
-        t->weight = t->cost + maxweight;
-        }
-
-    free(store_waits);
+        if(t->type != type_load)
+            t->weight = t->cost + maxweight;
+        else
+            t->weight = t->cost + maxweight - 1000;
 
+//        if(t->weight < -100)
+  //          printf("%i\n", t->weight);
+        }
+#endif
+#ifndef PRIQ
+    for( k = 0; k < count; k++ )
+    {
+        t = &tasks[k];
+        t->wait = t->wait_init;
+//        if(t->wait != t->wait_init)
+  //      {
+    //        printf("Task ID %i has wait %i and wait_init %i and type %i\n", k, t->wait, t->wait_init, t->type);
+      //  }
+    }
+#endif
+        
 
 /*Allocate temporary tasks to setup device tasks*/
 temp = (struct task *) malloc(s->count * sizeof(struct task));
@@ -1762,10 +2013,95 @@ if( cudaMemcpyToSymbol ( tasks_cuda, &cuda_t ,  sizeof(struct task *) , 0 ,  cud
     error("Failed to copy task pointer to the device.");
 
 /* Initialize the queues. */
+#ifdef PRIQ
+int nr_queues= 1, qsize;
+#else
 int nr_queues= 3,qsize;
+#endif
 int *data2;
 struct queue_cuda queues[ cuda_numqueues ];
 
+
+#ifdef PRIQ
+qsize = max(2*s->count, 512);
+    if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
+            error("Failed to copy queue size to the device.");
+
+    /* Allocate a temporary buffer for the queue data. */
+        if ( ( data = (int *)malloc( sizeof(int) * qsize ) ) == NULL )
+            error("Failed to allocate data buffer.");
+        if( ( data2 = (int *) malloc( sizeof(int) * qsize ) ) == NULL )
+            error("Failed to allocate data2 buffer.");
+    queues[0].count = 0;
+
+    for(i = 0; i < s->count; i++)
+    {
+        if(s->tasks[i].wait == 0)
+        {
+            if(s->tasks[i].type != type_load)
+            {          
+                printf("i = %i\n", i);    
+               for(k = 0; k < s->count; k++)
+                {
+                    for(j = 0; j < s->tasks[k].nr_unlocks; j++)
+                    {
+                        if(s->tasks[k].unlocks[j] == i)
+                        printf("Should be unlocked by %i\n", k);
+                    }
+                    for(j = 0; j < s->tasks[k].nr_uses; j++)
+                    {
+                        if(s->tasks[k].uses[j] == 256)
+                        printf("Task %i uses resource 256\n", k);
+                    }
+                    for(j = 0; j < s->tasks[k].nr_locks; j++)
+                    {
+                        if(s->tasks[k].locks[j] == 256)
+                        printf("Task %i locks resource 256\n", k);
+                    }
+                }
+               for(k = 0; k < s->count_res; k++)
+                {
+                    if(s->res[k].utask == i)
+                        printf("resource = %i\n", k);
+                }
+               printf("%i\n", i);
+            }       
+            data[queues[0].count++] = i;
+            data2[queues[0].count-1] = -temp[i].weight;
+        }
+    }
+    qsched_sort(data, data2, queues[0].count, minVal(data2,queues[0].count), maxVal(data2, queues[0].count));
+    free(data2);
+     for ( i = queues[0].count ; i < qsize ; i++ )
+        data[i] = -1; 
+
+    /* Allocate and copy the data. */
+    if ( cudaMalloc( &queues[0].data , sizeof(int) * qsize ) != cudaSuccess )
+        error("Failed to allocate queue data on the device.");
+    if ( cudaMemcpy( (void *)queues[0].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy queue data pointer to the device");
+
+    for ( k = 0; k < qsize; k++ )
+            data[k] = -1;
+            
+    /* Allocate and copy the recyling data. */
+    if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * qsize ) != cudaSuccess )
+        error("Failed to allocate queue data on the device.");
+    if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy queue data pointer to the device");
+
+        
+    /* Set some other values. */
+    queues[0].first = 0;
+    queues[0].last = queues[0].count;
+    queues[0].nr_avail_tasks = queues[0].last;
+    queues[0].rec_count = 0;  
+    queues[0].count = s->count;
+
+       /* Copy the queue structures to the device. */
+        if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
+            error("Failed to copy the queues to the device");   
+#else
  qsize = max(2*s->count / nr_queues, 256);
     if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
             error("Failed to copy queue size to the device.");
@@ -1784,6 +2120,10 @@ struct queue_cuda queues[ cuda_numqueues ];
             data[queues[1].count++] = i;
             data2[queues[1].count-1] = -temp[i].weight;
         }
+        if(temp[i].type != type_load && temp[i].wait == 0)
+        {
+            printf("%i %i\n", temp[i].type, i);
+        }   
     
     }
     qsched_sort(data, data2, queues[1].count, minVal(data2,queues[1].count), maxVal(data2, queues[1].count));
@@ -1872,7 +2212,7 @@ struct queue_cuda queues[ cuda_numqueues ];
        /* Copy the queue structures to the device. */
         if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
             error("Failed to copy the queues to the device");
-
+#endif
     /* Clean up. */
     free( tid );
         
@@ -1919,22 +2259,28 @@ struct task* qsched_get_timers( struct qsched *s, int numtasks )
 void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) {
 
 #ifdef WITH_CUDA
-    ProfilerStart("/home/aidan/quicksched-code/examples/profiler.out");
+    //ProfilerStart("/home/aidan/quicksched-code/examples/profiler.out");
     double itpms = 1000.0 / CPU_TPS;
     ticks tic, toc_run ;
     tic = getticks();
     qsched_prepare_cuda( s );
   toc_run = getticks(); 
+    printf("%.3f ", ((double)(toc_run - tic)) * itpms );
 //	message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms );
     cudaMemcpyToSymbol( fun , &func , sizeof(qsched_funtype));
     tic = getticks();
-    ProfilerStop();
+  //  ProfilerStop();
     cudaMemcpyToSymbol( tot_num_tasks, &s->count, sizeof(int) );
     qsched_device_kernel<<<128, 128 >>> (  );
     if( cudaDeviceSynchronize() != cudaSuccess )
         error("Failed to execute kernel:%s", cudaGetErrorString(cudaPeekAtLastError()));
   toc_run = getticks(); 
-    message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+    #ifdef NO_LOADS
+    printf("%.3f ", ((double)(toc_run - tic)) * itpms ); 
+    #else
+        printf("%.3f\n", ((double)(toc_run - tic)) * itpms );
+    #endif
+   // message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms );
 
 #else
         error("QuickSched was not compiled with CUDA support.");
diff --git a/src/lock.h b/src/lock.h
index 5c14abfe76e7bac02f6e15512e7276b01ccd5fe3..d739a64c6c256ba0e84ba6ec25a3d12096329ee2 100644
--- a/src/lock.h
+++ b/src/lock.h
@@ -28,6 +28,7 @@
 
 #ifdef WITH_CUDA
 #define static
+#define PCIEX -16
 #endif
 
 #ifdef PTHREAD_LOCK
diff --git a/src/qsched.c b/src/qsched.c
index eebbfdfc1be395b66e6d1602fe6bdb3ebeaf2ff9..4e813c4ac107068066996720539a818d402dabe9 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -1396,6 +1396,7 @@ void qsched_addunlock ( struct qsched *s , int ta , int tb ) {
     s->deps[ s->count_deps ] = tb;
     s->deps_key[ s->count_deps ] = ta;
     s->tasks[ta].nr_unlocks += 1;
+    s->tasks[tb].wait_init += 1;
     
     /* Increase the deps counter. */
     s->count_deps += 1;
@@ -1486,6 +1487,7 @@ int qsched_addtask ( struct qsched *s , int type , unsigned int flags , void *da
     t->nr_unlocks = 0;
     t->nr_locks = 0;
     t->nr_uses = 0;
+    t->wait_init = 0;
     
     /* Add a relative pointer to the data. */
     memcpy( &s->data[ s->count_data ] , data , data_size );
diff --git a/src/task.h b/src/task.h
index 99cd36dcb957f473c799036d2223f7c557dd441e..3878cddc61eb47890d83dabe0576590c9bdb05b0 100644
--- a/src/task.h
+++ b/src/task.h
@@ -35,7 +35,7 @@ struct task {
     int data;
     
     /* Task wait counter. */
-    int wait;
+    int wait, wait_init;
     
     /* Number of potential conflicts. */
     int nr_conflicts;