diff --git a/examples/test_bh_mpi.c b/examples/test_bh_mpi.c
index ed72750192b6a1601843f02f7d7d6194f4655858..1e80224b1b8d58387bee4994a961fe5a04df7bcb 100644
--- a/examples/test_bh_mpi.c
+++ b/examples/test_bh_mpi.c
@@ -39,7 +39,7 @@
 
 /* Some local constants. */
 #define cell_pool_grow 1000
-#define cell_maxparts 50
+#define cell_maxparts 512
 #define task_limit 1
 #define const_G 1    // 6.6738e-8
 #define dist_min 0.5 /* Used for legacy walk only */
@@ -49,20 +49,20 @@
 #define SANITY_CHECKS
 #define NO_COM_AS_TASK
 #define NO_COUNTERS
-#define EXACT
+#define NO_EXACT
 
 
 
 /** Data structure for the particles. */
 struct part {
   double x[3];
-  //union {
+  union {
     float a[3];
     float a_legacy[3];
     #ifndef EXACT
     float a_exact[3];
     #endif
-  //};
+  };
     #ifdef EXACT
     float a_exact[3];
     #endif
@@ -1244,7 +1244,7 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
 //  for (k = 0; k < task_type_count; k++) task_timers[k] = 0;
 
   /* Initialize the scheduler. */
-    qsched_init(&s, nr_threads, qsched_flag_noreown, MPI_COMM_WORLD);
+    qsched_init(&s, nr_threads, qsched_flag_yield | qsched_flag_pthread, MPI_COMM_WORLD);
 
   /* Init and fill the particle array. */
 //  if ((parts = (struct part *)malloc(sizeof(struct part) * N)) == NULL)
@@ -1294,7 +1294,6 @@ if(s.rank == 0)
   root->parts = parts;
   root->res_parts = parts_res;
   cell_split(root, &s);
-
   /* Iterate over the cells and get the average number of particles
      per leaf. */
   struct cell *c = root;
@@ -1319,7 +1318,6 @@ if(s.rank == 0)
 
 }
    qsched_sync_resources(&s);
-
 #ifdef EXACT
     if(s.rank == 0)
     {
@@ -1359,7 +1357,7 @@ if(s.rank == 0)
     }
     message("leaf_depth = %i", leaf_depth);
     message("tasks before = %i", s.count);
-    create_pcs(&s, root, NULL, 0, 1);    
+    create_pcs(&s, root, NULL, 0, leaf_depth-1);
     message("tasks after = %i", s.count);
 }
 printf("s.count = %i\n", s.count);
@@ -1371,6 +1369,7 @@ qsched_run_MPI(&s, nr_threads, runner);
     printf("Hello world from processor %s, rank = %i, count_ranks = %i\n",
            processor_name, s.rank, s.count_ranks);
 
+#ifdef OUTPUT_PARTS
 if(s.rank == 0)
 {
     file = fopen("particle_dump.out", "w");
@@ -1391,6 +1390,7 @@ for(i = 0; i < s.count_ranks; i++)
     }
     MPI_Barrier(s.comm);
 }
+#endif
 
 //Need to clean up everything.
 //    free(parts);
@@ -1413,7 +1413,7 @@ int main(int argc, char *argv[]) {
   char fileName[100] = {0};
 
   /* Die on FP-exceptions. */
-  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+//  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
 /* Get the number of threads. */
 /*#pragma omp parallel shared(nr_threads)
@@ -1436,8 +1436,6 @@ int main(int argc, char *argv[]) {
         if (sscanf(optarg, "%d", &nr_threads) != 1)
           error("Error parsing number of threads.");
         omp_set_num_threads(nr_threads);
-        message("omp_get_max_threads() = %i\n", omp_get_max_threads());
-        message("omp_get_num_procs() = %i\n", omp_get_num_procs());
         break;
       case 'f':
         if (sscanf(optarg, "%s", &fileName[0]) != 1)
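For reference, a minimal standalone sketch (not part of the patch) of what re-enabling the anonymous union in struct part implies: with EXACT now undefined (the NO_EXACT toggle above), a, a_legacy and a_exact occupy the same storage, so writing one overwrites the others. Only the fields visible in the hunk are reproduced, under a made-up name; anonymous unions need C11 or GCC's unnamed-member extension (the build uses -std=gnu99).

/* Standalone sketch, not taken from test_bh_mpi.c: shows the aliasing that
 * the re-enabled anonymous union introduces when EXACT is undefined. */
#include <stdio.h>

struct part_sketch {          /* hypothetical name; mirrors the hunk only */
  double x[3];
  union {
    float a[3];
    float a_legacy[3];
    float a_exact[3];         /* inside the union only when EXACT is undefined */
  };
};

int main(void) {
  struct part_sketch p = {{0.0, 0.0, 0.0}};
  p.a[0] = 1.0f;
  /* a_exact[0] reads back 1.0f because it shares storage with a[0]. */
  printf("a_exact[0] = %.1f, sizeof(struct part_sketch) = %zu\n",
         p.a_exact[0], sizeof(struct part_sketch));
  return 0;
}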
diff --git a/src/Makefile.am b/src/Makefile.am
index b40775a0defa29b3e1aee5b501424fa80c6d2bf9..4d39c981ed5423256ec3562b3f1dfa6aa12461f5 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -36,7 +36,7 @@ libquicksched_la_SOURCES = qsched.c queue.c
 #libquickschedMPI_la_CC = mpicc
 libquickschedMPI_la_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
     -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) -DTIMERS -std=gnu99 \
-    -DWITH_MPI
+    -DWITH_MPI 
 #-DTASK_TIMERS
 libquickschedMPI_la_SOURCES = qsched.c queue.c
 
diff --git a/src/qsched.c b/src/qsched.c
index e41df559a414329798031b09b47447585bc8706a..b520f48f8e5ff4a1ed9e5e263d93889c9cb7883d 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -418,6 +418,7 @@ void qsched_sync_schedulers( struct qsched *s){
         tasks_local[i] = s->tasks[i];
     }
     int number = sizeof(struct task) * s->task_ranks[s->count_ranks];
+    //message("number=%i\n", number);
     number = number / sizeof(int);
     errors = MPI_Allreduce(MPI_IN_PLACE, tasks_new, number, MPI_INT, MPI_SUM, s->comm);
     if(errors != MPI_SUCCESS)
@@ -1900,6 +1901,7 @@ void tsched_synchronize(struct tsched *s, struct qsched *qs){
     /* Synchronize the deps. */
     memset(temp, 0, sizeof(int) * (qs->count_ranks+1));
     temp[qs->rank+1] = s->count_deps;
+//    message("temp = %i", s->count_deps);
     errors = MPI_Allreduce(MPI_IN_PLACE, temp, qs->count_ranks+1, MPI_INT, MPI_SUM, qs->comm);
     if(errors != MPI_SUCCESS)
     {
@@ -1930,6 +1932,7 @@ void tsched_synchronize(struct tsched *s, struct qsched *qs){
     }
 
     size = sizeof(long long int) * temp[qs->count_ranks];
+  //  printf("size=%i\n", size);
     size = size / sizeof(int);
 
     errors = MPI_Allreduce(MPI_IN_PLACE, deps_new, size, MPI_INT, MPI_SUM, qs->comm);
@@ -2332,7 +2335,6 @@ for(i = 0; i < count; i++)
                 task_data[3] = (int)(s->res[getindex(t->locks[j],s)].ID & 0xFFFFFFFF);
                 task_data[4] = sends_added;
 
-
                 /* Create the send task. */
                 send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->locks[j],s)].size );
                 /* The send task needs to lock the resource. */
@@ -2365,7 +2367,6 @@ for(i = 0; i < count; i++)
                 task_data[2] = (int)(s->res[getindex(t->uses[j],s)].ID >> 32);
                 task_data[3] = (int)(s->res[getindex(t->uses[j],s)].ID & 0xFFFFFFFF);
                 task_data[4] = sends_added;
-    
 
                 /* Create the send task. */
                 send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->uses[j],s)].size );
@@ -2498,7 +2499,6 @@ for(i = 0; i < count; i++)
                 task_data[3] = (int)(s->res[getindex(t->locks[j],s)].ID & 0xFFFFFFFF);
                 task_data[4] = sends_added;
 
-
                 /* Create the send task. */
                 send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->locks[j],s)].size );
                 /* The send task needs to lock the resource. */
@@ -2532,7 +2532,6 @@ for(i = 0; i < count; i++)
                 task_data[3] = (int)(s->res[getindex(t->locks[j],s)].ID & 0xFFFFFFFF);
                 task_data[4] = sends_added;
 
-
                 /* Create the send task. */
                 send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->locks[j],s)].size );
                 /* The send task needs to lock the resource. */
@@ -2698,7 +2697,6 @@ for(i = 0; i < count; i++)
                 task_data[3] = (int)(s->res[getindex(t->uses[j],s)].ID & 0xFFFFFFFF);
                 task_data[4] = sends_added;
 
-
                 /* Create the send task. */
                 send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->uses[j],s)].size );
                 /* The send task needs to lock the resource. */
@@ -2730,7 +2728,7 @@ for(i = 0; i < count; i++)
                 task_data[2] = (int)(s->res[getindex(t->uses[j],s)].ID >> 32);
                 task_data[3] = (int)(s->res[getindex(t->uses[j],s)].ID & 0xFFFFFFFF);
                 task_data[4] = sends_added;
-    
+
                 /* Create the send task. */
                 send_task_id = tsched_addtask(&ts, task_type_send , 0 , task_data , 5 * sizeof(int) , s->res[getindex(t->uses[j],s)].size );
                 /* The send task needs to lock the resource. */
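The send-task hunks above all pack the 64-bit resource ID into task_data[2] and task_data[3] as the high and low 32-bit halves. Below is a small round-trip sketch of that packing; split_id and join_id are made up for illustration and are not qsched API, and the unsigned cast on the low word is there to avoid sign-extension when the halves are recombined.

/* Illustrative round trip for the packing used in the send-task hunks:
 * task_data[2] = (int)(ID >> 32), task_data[3] = (int)(ID & 0xFFFFFFFF). */
#include <stdio.h>

static void split_id(long long int ID, int *hi, int *lo) {
  *hi = (int)(ID >> 32);          /* high 32 bits, as in task_data[2] */
  *lo = (int)(ID & 0xFFFFFFFF);   /* low 32 bits, as in task_data[3] */
}

static long long int join_id(int hi, int lo) {
  /* Reinterpret the low word as unsigned so a set sign bit in lo does not
   * sign-extend into the upper half when the ID is rebuilt. */
  return ((long long int)hi << 32) | (unsigned int)lo;
}

int main(void) {
  long long int ID = 0x12345678ABCDEF01LL;  /* arbitrary example ID */
  int hi, lo;
  split_id(ID, &hi, &lo);
  printf("rebuilt ID matches: %d\n", join_id(hi, lo) == ID);
  return 0;
}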
@@ -2809,19 +2807,11 @@ for(i = 0; i < count; i++)
                             
                         }*/
 
-                        //TODO What are we actually doing here?.
-                        //TODO We want to look through ALL of ts.unlocks (as I don't think they have been sorted yet - maybe we're lucky and they're in order but i'm not sure thats the case).
-                        //TODO We want to find if the current_parent unlocks this task.
                         if(ts.unlockers_key[l] == temp->id && ts.unlockers[l] == tid[current_parents[k]])
                         {
                             found = 1;
                             break;
                         }
-/*                        if(temp->unlocks[l] == tid[current_parents[k]] && ts.deps_key[l] == t->uses[j])
-                        {
-                            found = 1;
-                            break;
-                        }       */
                     }
                     if(!found)
                     {
@@ -3072,7 +3062,7 @@ for(i = 0; i < node_count; i++)
 //    message("METIS_PartGraphKway took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
         //TODO Check the costs.
 //        message("node_count = %i", node_count);
-        int count_me = 0;
+        long long int count_me = 0;
         for(i = 0; i < node_count; i++)
         {
             if(nodeIDs[i] == s->rank)
@@ -3083,7 +3073,7 @@ for(i = 0; i < node_count; i++)
             }
         }
 //        printf("\n");
-        message("My \"cost\" = %i", count_me);
+        message("My \"cost\" = %lli", count_me);
 
 
 
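The change above widens the per-rank cost accumulator count_me from int to long long int and the format specifier from %i to %lli. A small standalone illustration of why, using made-up counts and costs large enough for the running sum to pass INT_MAX:

/* Standalone illustration, not from qsched.c: summing many per-task costs
 * into a 32-bit int overflows once the total exceeds INT_MAX (~2.1e9);
 * a long long accumulator printed with %lli holds the full total. */
#include <limits.h>
#include <stdio.h>

int main(void) {
  long long int count_me = 0;      /* widened accumulator, as in the patch */
  const int node_count = 1000000;  /* hypothetical node count */
  const int cost = 5000;           /* hypothetical per-task cost */
  for (int i = 0; i < node_count; i++)
    count_me += cost;              /* total 5e9, fine in 64 bits */
  printf("My \"cost\" = %lli (INT_MAX = %d)\n", count_me, INT_MAX);
  return 0;
}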
@@ -3453,16 +3443,45 @@ if(tid == NULL)
     for ( k = 0 ; k < s->nr_queues ; k++ )
         queue_init( &s->queues[k] , count );
 
+    int num_send = 0, num_recv = 0;
     /* Run through the tasks and enqueue the non-waiting ones. */
     for ( k = 0 ; k < s->task_ranks[s->count_ranks] ; k++ ) {
         t = &tasks[gettaskindex(tid[k],s)];
 
         if ( t->wait == 0 && !( t->flags & task_flag_skip ) )
         {
+            if(t->type == task_type_send)
+                num_send++;
+            else if (t->type == task_type_recv)
+                num_recv++;
             qsched_enqueue( s , t );
         }
+        //TODO Remove this debug stuff.
+      /* if(t->flags & task_flag_skip && t->type == task_type_send && t->id == 886394)
+        {
+            int* data = (int*)&s->data[ t->data ];
+            int to = data[1];
+            int from = data[0];
+            int tag = data[4];
+            long long int resid = (((long long int)data[2]) << 32) + data[3];
+            struct res *res = &s->res[getindex(resid, s)];
+            int res_owner = res->node;
+            if(to != s->rank)
+            {
+                message("Not sending task to %i from %i with tag %i, id = %lli", to, from,tag, t->id);
+            }
+            if(to == s->rank)
+            {
+                message("This task should come to %i from %i with tag %i, id = %lli", from , to,tag, t->id);
+            }
+                message("Resource owner is node %i", res_owner);
         }
 
+
+    message("Initial setup %i send tasks", num_send);
+    message("Initial setup %i recv tasks", num_recv);*/
+
+        }
     /* Clean up. */
     free( tid );
         
@@ -3479,14 +3498,6 @@ if(tid == NULL)
     /* Set the ready flag. */
     s->flags |= qsched_flag_ready;
 
-    for(k = 0; k < s->res_ranks[s->count_ranks]; k++)
-    {
-        if(s->res[k].lock != 0)
-            error("Initial lock value not 0 for res = %lli\n.", s->res[k].ID);
-        if(s->res[k].hold != 0)
-            error("Initial hold value not 0 for res = %lli\n.", s->res[k].ID);
-    }
-
 
     /* Unlock the sched. */
     lock_unlock_blind( &s->lock );
@@ -3650,14 +3661,15 @@ printf("Rank[%i]: qsched_prepare_mpi took %lli (= %e) ticks\n", s->rank,
 
     if(numa_available() >= 0){
         /* Ascending NUMA distance. Use Angus' Bubblesort implementation. */
-        int home = numa_node_of_cpu(sched_getcpu()), half = nr_cores / 2;
+        //int home = numa_node_of_cpu(sched_getcpu());
+        int half = nr_cores / 2;
         int done = 0;
         while(!done) {
             done = 1;
             for(i = 1; i < nr_cores ; i++)
             {
-                int node_a = numa_node_of_cpu(cpuid[i-1]);
-                int node_b = numa_node_of_cpu(cpuid[i]);
+//                int node_a = numa_node_of_cpu(cpuid[i-1]);
+//                int node_b = numa_node_of_cpu(cpuid[i]);
       
 
                 /* Try to avoid hyperthreads... */