diff --git a/examples/test_qr_mpi.c b/examples/test_qr_mpi.c
index c77c46c773da1b775217bdf73cbcc0f6abb73155..2a235235fee9146fbc9b8e82169b189b07b888fa 100644
--- a/examples/test_qr_mpi.c
+++ b/examples/test_qr_mpi.c
@@ -28,7 +28,7 @@
 #include <math.h>
 #include <float.h>
 #include <limits.h>
-#include <omp.h>
+//#include <omp.h>
 #include <fenv.h>
 #include <mpi.h>
 
@@ -909,9 +909,9 @@ int main(int argc, char* argv[]) {
       case 't':
         if (sscanf(optarg, "%d", &nr_threads) != 1)
           error("Error parsing number of threads.");
-        omp_set_num_threads(nr_threads);
-        message("omp_get_max_threads() = %i\n", omp_get_max_threads());
-        message("omp_get_num_procs() = %i\n", omp_get_num_procs());
+        // omp_set_num_threads(nr_threads);
+        // message("omp_get_max_threads() = %i\n", omp_get_max_threads());
+        // message("omp_get_num_procs() = %i\n", omp_get_num_procs());
         break;
       case '?':
         fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
diff --git a/src/qsched.c b/src/qsched.c
index 2be6b1751cfaf78c563d166035ceec80e4408e42..c3615f23ec7d8d7998bcfd5b5c1e811d99abe842 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -146,6 +146,8 @@ void* qsched_getresdata( struct qsched *s, qsched_res_t res_id )
             }
         }
     }
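+    /* Fail loudly rather than returning a NULL data pointer. */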
+    if(s->res[getindex(res_id, s)].data == NULL)
+        error("Data is NULL");
     return s->res[getindex(res_id, s)].data;
 }
 #endif
@@ -2225,15 +2227,15 @@ void qsched_integrate( struct qsched *s, struct tsched *ts)
 
 
     /* Once this is done, we need to sort out the arrays and fix the pointers to the locks/unlocks arrays.*/
-     #pragma omp parallel
+     //#pragma omp parallel
         {
     
             /* Sort the unlocks. */
-            #pragma omp single nowait
+            //#pragma omp single nowait
                 qsched_quicksort( s->deps , s->deps_key , s->count_deps , 0 ,0xFFFFFFFFFFFFFFF );
 
             /* Sort the locks. */
-            #pragma omp single nowait
+            //#pragma omp single nowait
                 qsched_quicksort( s->locks , s->locks_key , s->count_locks , 0 , 0xFFFFFFFFFFFFFFF);
 
         }
@@ -2253,7 +2255,9 @@ void qsched_partition_create_sends( struct qsched *s, long long int *tid) {
 
 int i, j, k, l;
 int count = s->task_ranks[s->count_ranks];
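+/* Time how long creating the send/recv tasks takes; reported below. */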
+ticks tic, toc;
 
+tic = getticks();
 /* We need a temporary scheduler! */
 struct tsched ts;
 
@@ -2317,7 +2321,7 @@ int sends_added = 0;
 for(i = 0; i < count; i++)
 {
     struct task *t = &s->tasks[gettaskindex(tid[i], s)];
-    
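+    /* Cache this task's id; it is compared against the parents' unlock lists below. */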
+    long long int ttid = t->id;
     
     /* If we don't execute this task locally then we can skip it!*/
     if( t->flags & task_flag_skip )
@@ -2328,31 +2332,34 @@ for(i = 0; i < count; i++)
         /* Loop through the locks, and for each resource that is not local create a send/recv pair. */
         for(j = 0; j < t->nr_locks; j++){
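+            /* Cache the locked resource's index and pointer rather than repeating the getindex() lookups. */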
 
+            int lock_index = getindex(t->locks[j],s);
+            struct res *res = &s->res[lock_index];
+
             /* We need to get each resource that is non-local.*/
-            if(s->res[getindex(t->locks[j],s)].node != s->rank){
+            if(res->node != s->rank){
                 /* If this data isn't already sent to us:*/
-                if(first_recv[getindex(t->locks[j], s)] == -1){
+                if(first_recv[lock_index] == -1){
                 /* Find out the task data*/
-                task_data[0] = s->res[getindex(t->locks[j],s)].node;
+                task_data[0] = res->node;
                 task_data[1] = s->rank;
-                task_data[2] = (int)(s->res[getindex(t->locks[j],s)].ID >> 32);
-                task_data[3] = (int)(s->res[getindex(t->locks[j],s)].ID & 0xFFFFFFFF);
+                task_data[2] = (int)(res->ID >> 32);
+                task_data[3] = (int)(res->ID & 0xFFFFFFFF);
                 task_data[4] = sends_added;
 
                 /* Create the send task. */
-                send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->locks[j],s)].size );
+                send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , res->size );
                 /* The send task needs to lock the resource. */
-                tsched_addlock( &ts, send_task_id, s->res[getindex(t->locks[j], s)].ID);
+                tsched_addlock( &ts, send_task_id, res->ID);
                 /* Create the recv task*/
-                recv_task_id = tsched_addtask(&ts, task_type_recv , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->locks[j],s)].size );
+                recv_task_id = tsched_addtask(&ts, task_type_recv , 0 , task_data , 5 * sizeof(int) , res->size );
                 /* The recv task unlocks this task locally. */
                 tsched_addunlock( &ts, recv_task_id, t->id);
                 /* The recv task needs to lock the resource. */
-                tsched_addlock( &ts, recv_task_id, s->res[getindex(t->locks[j], s)].ID);
+                tsched_addlock( &ts, recv_task_id, res->ID);
                 sends_added +=1;
-                first_recv[getindex(t->locks[j], s)] = recv_task_id;                
+                first_recv[lock_index] = recv_task_id;
                 }else{
-                    tsched_addunlock( &ts, first_recv[getindex(t->locks[j], s)], t->id);
+                    tsched_addunlock( &ts, first_recv[lock_index], t->id);
                 }
 
             }
@@ -2362,30 +2369,34 @@ for(i = 0; i < count; i++)
 
         /* Loop through the uses, and for each resource that is not local create a send/recv pair. */
         for(j = 0; j < t->nr_uses; j++){
-            if(s->res[getindex(t->uses[j],s)].node != s->rank){
+
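+            /* Cache the used resource's index and pointer, as for the locks above. */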
+            int use_index = getindex(t->uses[j],s);
+            struct res *res = &s->res[use_index];
+
+            if(res->node != s->rank){
                 /* Find out the task data*/
                 /* If this data isn't already sent to us:*/
-                if(first_recv[getindex(t->uses[j], s)] == -1){
-                task_data[0] = s->res[getindex(t->uses[j],s)].node;
+                if(first_recv[use_index] == -1){
+                task_data[0] = res->node;
                 task_data[1] = s->rank;
-                task_data[2] = (int)(s->res[getindex(t->uses[j],s)].ID >> 32);
-                task_data[3] = (int)(s->res[getindex(t->uses[j],s)].ID & 0xFFFFFFFF);
+                task_data[2] = (int)(res->ID >> 32);
+                task_data[3] = (int)(res->ID & 0xFFFFFFFF);
                 task_data[4] = sends_added;
 
                 /* Create the send task. */
-                send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->uses[j],s)].size );
+                send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , res->size );
                 /* The send task needs to lock the resource. */
-                tsched_addlock( &ts, send_task_id, s->res[getindex(t->uses[j], s)].ID);
+                tsched_addlock( &ts, send_task_id, res->ID);
                 /* Create the recv task*/
-                recv_task_id = tsched_addtask(&ts, task_type_recv , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->uses[j],s)].size );
+                recv_task_id = tsched_addtask(&ts, task_type_recv , 0 , task_data , 5 * sizeof(int) , res->size );
                 /* The recv task unlocks this task locally. */
                 tsched_addunlock( &ts, recv_task_id, t->id);
                 /* The recv task needs to lock the resource. */
-                tsched_addlock( &ts, recv_task_id, s->res[getindex(t->uses[j], s)].ID);
+                tsched_addlock( &ts, recv_task_id, res->ID);
                 sends_added +=1;
-                first_recv[getindex(t->uses[j], s)] = recv_task_id; 
+                first_recv[use_index] = recv_task_id;
                 }else{
-                    tsched_addunlock( &ts, first_recv[getindex(t->uses[j], s)], t->id);
+                    tsched_addunlock( &ts, first_recv[use_index], t->id);
                 }   
 
         
@@ -2397,12 +2408,14 @@ for(i = 0; i < count; i++)
     /* If the task has dependencies*/
     /*Find the parents. */
     num_parents = 0;
-    for(j = 0; j < i && num_parents < wait_init[gettaskindex(t->id, s)]; j++)
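+    /* Hoist the wait count lookup out of the loop condition. */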
+    int max = wait_init[gettaskindex(t->id, s)];
+    for(j = 0; j < i && num_parents < max; j++)
     {
         struct task *t2 = &s->tasks[gettaskindex(tid[j], s)];
-        for(k = 0; k < t2->nr_unlocks; k++)
+        int nr_un = t2->nr_unlocks;
+        for(k = 0; k < nr_un; k++)
         {
-            if(t2->unlocks[k] == t->id)
+            if(t2->unlocks[k] == ttid)
             {
                 /* If parents array is full we need to extend it.*/
                 if(num_parents == size_parents -1)
@@ -2430,22 +2443,30 @@ for(i = 0; i < count; i++)
     /* Loop over the locked resources */
     for(j = 0; j < t->nr_locks; j++)
     {
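+        /* Only non-local resources were given send/recv tasks above, so skip locks on local resources. */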
+        int lock_index = getindex(t->locks[j],s);
+        struct res *res = &s->res[lock_index];
+
+        if(res->node == s->rank)
+            continue;
         /* Find the parents that unlock this resource. */
         num_current = 0;
         int num_skipped = 0;
         int last_index = -1;
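+        /* The handle of the locked resource, compared against the parents' locks below. */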
+        long long int tlock = t->locks[j];
         for(k = num_parents-1; k >= 0; k--)
         {
             struct task *t2 = &s->tasks[ gettaskindex(tid[parents[k]], s)];
             int found = 0;
             for(l = 0; l < t2->nr_locks; l++)
             {
-                if(t2->locks[l] == t->locks[j])
+                if(t2->locks[l] == tlock)
                 {
                     found = 1;
                     current_parents[num_current++] = parents[k];
-                    if(k > last_index)
-                        last_index = k;
+                    if(parents[k] > last_index)
+                        last_index = parents[k];
                     /* If we skipped the parent task, increment the number we skipped. */
                     if(t2->flags & task_flag_skip)
                         num_skipped++;
@@ -2455,12 +2476,12 @@ for(i = 0; i < count; i++)
                 while(r->parent != -1)
                 {
                     r = &s->res[getindex(r->parent,s)];
-                    if(r->ID == t->locks[j])
+                    if(r->ID == tlock)
                     {
                         found = 1;
                         current_parents[num_current++] = parents[k];
-                        if(k > last_index)
-                            last_index = k;
+                        if(parents[k] > last_index)
+                            last_index = parents[k];
                         /* If we skipped the parent task, increment the number we skipped. */
                         if(t2->flags & task_flag_skip)
                             num_skipped++;
@@ -2470,7 +2491,7 @@ for(i = 0; i < count; i++)
                 if(found)
                     continue;
 
-                r = &s->res[getindex(t->locks[l],s)];
+                r = res;
                 while(r->parent != -1)
                 {
                     r = &s->res[getindex(r->parent,s)];
@@ -2478,8 +2499,8 @@ for(i = 0; i < count; i++)
                     {
                         found = 1;
                         current_parents[num_current++] = parents[k];
-                        if(k > last_index)
-                            last_index = k;
+                        if(parents[k] > last_index)
+                            last_index = parents[k];
                         /* If we skipped the parent task, increment the number we skipped. */
                         if(t2->flags & task_flag_skip)
                             num_skipped++;
@@ -2523,6 +2544,7 @@ for(i = 0; i < count; i++)
             continue;
         }/* If no current parents.*/
 
+
         /* Check how many parents we skipped. */
         if(num_skipped > 0)
         {
@@ -2631,6 +2653,8 @@ for(i = 0; i < count; i++)
 /* Loop over the used resources */
     for(j = 0; j < t->nr_uses; j++)
     {
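+        /* As above, skip used resources that are local to this rank. */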
+        if(s->res[getindex(t->uses[j],s)].node == s->rank)
+            continue;
         /* Find the parents that unlock this resource. */
         num_current = 0;
         int num_skipped = 0;
@@ -2721,7 +2745,6 @@ for(i = 0; i < count; i++)
             continue;
         }/* If no current parents.*/
 
-
         /* Check how many parents we skipped. */
         if(num_skipped > 0)
         {
@@ -2849,12 +2872,20 @@ for(i = 0; i < count; i++)
 free(parents);
 free(current_parents);
 
+    toc = getticks();
+    message("qsched_partition_create_sends_creation took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
+    message("tsched_count = %i", ts.count);
+
+    tic = getticks();
     /* All the nodes have created new tasks, we need to synchronize this across nodes now.*/
     tsched_synchronize( &ts, s);
 
        /* Once we have synchronized, we need to integrate the temporary scheduler into quicksched's scheduler. */
     qsched_integrate( s, &ts);
 
+    toc = getticks();
+    message("qsched_partition_create_sends_synchronize took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
+
     tsched_free( &ts) ;
 }
 
@@ -3293,7 +3324,7 @@ void qsched_prepare_mpi( struct qsched *s){
     struct task *t, *tasks;
     
     TIMER_TIC
-//    ticks tic, toc;
+ //   ticks tic, toc;
 
     /* Lock the sched. */
     lock_lock( &s->lock );
@@ -3325,11 +3356,11 @@ void qsched_prepare_mpi( struct qsched *s){
 //        count = s->res_ranks[s->count_ranks];
         /* Do the sorts in parallel, if possible. */
 //        tic = getticks();
-        #pragma omp parallel
+        //#pragma omp parallel
         {
     
             /* Sort the unlocks. */
-            #pragma omp single nowait
+            //#pragma omp single nowait
             qsched_quicksort( s->deps , s->deps_key , s->count_deps , 0 , 0xFFFFFFFFFFFFFFF);
             /*for(k = 1; k < s->count_deps; k++)
             {
@@ -3340,17 +3371,17 @@ void qsched_prepare_mpi( struct qsched *s){
                 }
             }*/
             /* Sort the locks. */
-            #pragma omp single nowait
+            //#pragma omp single nowait
             qsched_quicksort( s->locks , s->locks_key , s->count_locks , 0 , 0xFFFFFFFFFFFFFFF);
 
             /* Sort the uses. */
-            #pragma omp single nowait
+            //#pragma omp single nowait
             qsched_quicksort( s->uses , s->uses_key , s->count_uses , 0 , 0xFFFFFFFFFFFFFFF );
 
-            #pragma omp single nowait
+            //#pragma omp single nowait
             qsched_quicksort( s->users, s->users_key, s->count_users, 0, 0xFFFFFFFFFFFFFFF);
 
-            #pragma omp single nowait
+            //#pragma omp single nowait
             qsched_quicksort( s->lockers, s->lockers_key, s->count_lockers, 0, 0xFFFFFFFFFFFFFFF);
             
         }
@@ -3375,10 +3406,10 @@ void qsched_prepare_mpi( struct qsched *s){
 //        toc = getticks();
    //     message("Cleaning up scheduler took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
         }
-    tic = getticks();
+//    tic = getticks();
     qsched_partition(s);
-    toc = getticks();
-    message("qsched_partition took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
+//    toc = getticks();
+//    message("qsched_partition took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
 
 long long int *tid = malloc(sizeof(long long int) * s->task_ranks[s->count_ranks]);
 if(tid == NULL)
@@ -3700,7 +3731,12 @@ printf("Rank[%i]: qsched_prepare_mpi took %lli (= %.3fms) ticks\n", s->rank,
             error( "Failed to create pthread." );
         #if defined( HAVE_LIBNUMA)
         CPU_ZERO(&cpuset);
-        CPU_SET(cpuid[tid], &cpuset);
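+        /* With SHMEM defined, the CPU index is offset by the MPI rank
+         * (for runs where several ranks share a node). */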
+#define SHMEM
+        #ifdef SHMEM
+        CPU_SET(cpuid[tid+s->rank], &cpuset);
+        #else
+        CPU_SET(cpuid[tid], &cpuset);        
+        #endif
         if ( pthread_setaffinity_np(s->runners[tid].thread, sizeof(cpu_set_t), &cpuset ) != 0){
             error("Failed to set thread affinity.");
         }
@@ -3722,12 +3758,24 @@ printf("Rank[%i]: qsched_prepare_mpi took %lli (= %.3fms) ticks\n", s->rank,
 
 toc = getticks();
 
-        printf("Rank[%i]: Execution took %lli (= %e) ticks\n", s->rank,
-         toc - tic, (float)(toc - tic));
+/*        printf("Rank[%i]: Execution took %lli (= %e) ticks\n", s->rank,
+         toc - tic, (float)(toc - tic));*/
     //TIMINGS BASED ON COSMA4!
     message("Execution took %.3f milliseconds.", (toc-tic)/2.67e6);
     MPI_Barrier(s->comm);
-            
+
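+    /* Reduce the per-rank wall-clock times: the maximum is the slowest rank,
+     * and the sum scaled by the thread count approximates total CPU time. */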
+    float max = (float)(toc-tic)/2.67e6;
+    float total = (float)(toc-tic)/2.67e6;
+         
+    MPI_Allreduce(MPI_IN_PLACE, &max, 1, MPI_FLOAT, MPI_MAX, s->comm);
+    MPI_Allreduce(MPI_IN_PLACE, &total, 1, MPI_FLOAT, MPI_SUM, s->comm);
+
+    if(s->rank == 0)
+    {
+        message("Max runtime = %fms", max);
+        message("Total runtime = %fms", total*nr_threads);
+    }
+   
 #else
     #if defined( HAVE_OPENMP )
 MPI_Barrier(s->comm);
@@ -3737,7 +3785,7 @@ MPI_Barrier(s->comm);
     message("Beginning execution of quicksched");
     tic = getticks();
 /* Parallel loop. */
-    #pragma omp parallel num_threads( nr_threads )
+    //#pragma omp parallel num_threads( nr_threads )
     {
         /* Local variable. */
         struct task *t;
@@ -3802,7 +3850,7 @@ void qsched_run_openmp ( struct qsched *s , int nr_threads , qsched_funtype fun
     qsched_prepare( s );
 
     /* Parallel loop. */
-    #pragma omp parallel num_threads( nr_threads )
+    //#pragma omp parallel num_threads( nr_threads )
     {
         /* Local variable. */
         struct task *t;
@@ -4577,13 +4625,13 @@ void qsched_quicksort (long long int *restrict data ,long long int *restrict ind
 
             /* Recurse on the left? */
             if ( j > 0  && pivot > min ) {
-                #pragma omp task untied
+                //#pragma omp task untied
                 qsched_quicksort( data , ind , j+1 , min , pivot );
                 }
 
             /* Recurse on the right? */
             if ( i < N && pivot+1 < max ) {
-                #pragma omp task untied
+                //#pragma omp task untied
                 qsched_quicksort( &data[i], &ind[i], N-i , pivot+1 , max );
                 }
 
@@ -4628,19 +4676,19 @@ void qsched_prepare ( struct qsched *s ) {
     if ( s->flags & qsched_flag_dirty ) {
     
         /* Do the sorts in parallel, if possible. */
-        #pragma omp parallel
+        //#pragma omp parallel
         {
     
             /* Sort the unlocks. */
-            #pragma omp single nowait
+            //#pragma omp single nowait
             qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 );
 
             /* Sort the locks. */
-            #pragma omp single nowait
+            //#pragma omp single nowait
             qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 );
 
             /* Sort the uses. */
-            #pragma omp single nowait
+            //#pragma omp single nowait
             qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 );
             
         }