diff --git a/examples/test_bh_mpi.c b/examples/test_bh_mpi.c
index 1e80224b1b8d58387bee4994a961fe5a04df7bcb..1f0689919be42d0818e757e9d771192241f8194f 100644
--- a/examples/test_bh_mpi.c
+++ b/examples/test_bh_mpi.c
@@ -33,6 +33,9 @@
 #include <fenv.h>
 #include <mpi.h>
 
+#define NO_TASK_TIMERS
+#define NO_LOAD_BALANCE_EXACT
+
 /* Local includes. */
 #include "quicksched.h"
 #include "res.h"
@@ -53,6 +56,8 @@
 
 
 
+
+
 /** Data structure for the particles. */
 struct part {
   double x[3];
@@ -814,6 +819,9 @@ void create_pcs(struct qsched *s, struct cell *ci, struct cell *cj, int depth, i
   qsched_task_t data[2];
   qsched_task_t cp, cps;
   struct cell *cp1, *cp2;
+  #ifdef LOAD_BALANCE_EXACT
+  ticks tic, toc;
+  #endif
 
     #ifdef SANITY_CHECKS
         if(cj!= NULL && ci->h != cj->h)
@@ -874,9 +882,15 @@ void create_pcs(struct qsched *s, struct cell *ci, struct cell *cj, int depth, i
                 /* Create the task. */
                 data[0] = ci->res;
                 data[1] = cj->res;
+                #ifdef LOAD_BALANCE_EXACT
+                tic = getticks();
+                iact_pair_pc(s, ci, cj);
+                toc = getticks();
+                tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(qsched_task_t) * 2, (toc-tic) / 100);
+                #else
                 tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
 			           sizeof(qsched_task_t) * 2, ci->count * 8 );
-          
+                #endif
                 /* Add the resource and dependance */
                 qsched_addlock(s, tid, ci->res_parts);
                 qsched_adduse(s, tid, ci->res);
@@ -885,9 +899,15 @@ void create_pcs(struct qsched *s, struct cell *ci, struct cell *cj, int depth, i
                 /* Create the task. */
                 data[0] = cj->res;
                 data[1] = ci->res;
+                #ifdef LOAD_BALANCE_EXACT
+                tic = getticks();
+                iact_pair_pc(s, cj, ci);
+                toc = getticks();
+                tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data, sizeof(qsched_task_t) * 2, (toc-tic) / 100);
+                #else
                 tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
 			           sizeof(qsched_task_t) * 2, cj->count * 8 );
-          
+                #endif          
                 /* Add the resource and dependance */
                 qsched_addlock(s, tid, cj->res_parts);
                 qsched_adduse(s, tid, cj->res);
@@ -911,6 +931,9 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
   qsched_task_t data[2];
   qsched_task_t cp, cps;
   struct cell *cp1, *cp2;
+  #ifdef LOAD_BALANCE_EXACT
+  ticks tic, toc;
+  #endif
 
 #ifdef SANITY_CHECKS
 
@@ -947,9 +970,15 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
       data[1] = -1;
 
       /* Create the task. */
+      #ifdef LOAD_BALANCE_EXACT
+      tic = getticks();
+      iact_self_direct(s, ci);
+      toc = getticks();
+      tid = qsched_addtask(s, task_type_self, task_flag_none, data, sizeof(qsched_task_t) * 2, (toc-tic) / 100);
+      #else
       tid =  qsched_addtask(s, task_type_self, task_flag_none, data,
 			    sizeof( qsched_task_t) * 2, ci->count * ci->count / 2);
-
+      #endif
       /* Add the resource (i.e. the cell) to the new task. */
       qsched_addlock(s, tid, ci->res_parts);
       qsched_adduse(s, tid, ci->res);
@@ -988,53 +1017,6 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
     	}
     }
 
-#ifdef OLD_SETUP
-      /* Create the task. */
-      data[0] = ci->res;
-      data[1] = cj->res;
-      tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
-			   sizeof(qsched_task_t) * 2, ci->count * 8 );
-      
-      /* Add the resource and dependance */
-      qsched_addlock(s, tid, ci->res_parts);
-      qsched_adduse(s, tid, ci->res);
-      qsched_adduse(s, tid, cj->res);
-      for(cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling)
-      {
-            cp1 = (struct cell*) qsched_getresdata(s, cp);
-            qsched_adduse(s, tid, cp1->res);
-      }
-      for(cp = cj->firstchild; cp != cj->sibling; cp = cp1->sibling)
-      {
-            cp1 = (struct cell*) qsched_getresdata(s, cp);
-            qsched_adduse(s, tid, cp1->res);
-      }
-
-      /* Create the task. */
-      data[0] = cj->res;
-      data[1] = ci->res;
-      tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
-			   sizeof(qsched_task_t) * 2, cj->count * 8);
-
-      qsched_addlock(s, tid, cj->res_parts);
-      qsched_adduse(s, tid, ci->res);
-      qsched_adduse(s, tid, cj->res);
-      for(cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling)
-      {
-            cp1 = (struct cell*) qsched_getresdata(s, cp);
-            qsched_adduse(s, tid, cp1->res);
-      }
-      for(cp = cj->firstchild; cp != cj->sibling; cp = cp1->sibling)
-      {
-            cp1 = (struct cell*) qsched_getresdata(s, cp);
-            qsched_adduse(s, tid, cp1->res);
-      }
-
-#endif
-      /* Add the resource and dependance */
-//      qsched_addunlock(s, ci->com_tid, tid);
-
-     
     } else {  /* Otherwise, at least one of the cells is not split, build a direct
 	       * interaction. */
 
@@ -1043,9 +1025,15 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
       data[1] = cj->res;
       
       /* Create the task. */
+      #ifdef LOAD_BALANCE_EXACT
+      tic = getticks();
+      iact_pair_direct(s, ci, cj);
+      toc = getticks();
+      tid = qsched_addtask(s, task_type_pair, task_flag_none, data, sizeof(qsched_task_t) * 2, (toc-tic) / 100);
+      #else
       tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
 			   sizeof(qsched_task_t) * 2, ci->count * cj->count);
-      
+      #endif
         struct part *part_j = (struct part*) qsched_getresdata(s, cj->res_parts );
         struct part *part_i = (struct part*) qsched_getresdata(s, ci->res_parts );
         ci->parts = part_i;
@@ -1330,6 +1318,15 @@ if(s.rank == 0)
 if(s.rank == 0)
 {
     create_tasks(&s, root, NULL);
+  #ifdef LOAD_BALANCE_EXACT
+  struct part *parts = (struct part *) qsched_getresdata(&s, root->res_parts);
+  for(i = 0; i < root->count; i++)
+  {
+    parts[i].a[0] = 0.0f;
+    parts[i].a[1] = 0.0f;
+    parts[i].a[2] = 0.0f;
+  }
+  #endif
     /* Compute the loweest depth of a leaf. */
     int depth = 1;
     int leaf_depth = 0xFFFFFFF;
@@ -1357,7 +1354,7 @@ if(s.rank == 0)
     }
     message("leaf_depth = %i", leaf_depth);
     message("tasks before = %i", s.count);
-    create_pcs(&s, root, NULL, 0, leaf_depth-1);    
+    create_pcs(&s, root, NULL, 0, leaf_depth-2);    
     message("tasks after = %i", s.count);
 }
 printf("s.count = %i\n", s.count);
@@ -1391,6 +1388,32 @@ for(i = 0; i < s.count_ranks; i++)
     MPI_Barrier(s.comm);
 }
 #endif
+#ifdef TASK_TIMERS
+//Each rank wants to loop through the tasks they executed and output the data, then synchronize.
+int j;
+if(s.rank == 0)
+{
+    file = fopen("task_timers.tks", "w");
+    fclose(file);
+}
+for(i = 0; i < s.count_ranks; i++)
+{
+    if(i == s.rank)
+    {
+        file = fopen("task_timers.tks", "a");
+        for(j = 0; j < s.task_ranks[s.count_ranks]; j++)
+        {
+            if(s.tasks[j].node_executed == s.rank)
+            {
+                struct task *t = &s.tasks[j];
+                fprintf(file, "%lli %i %llu %llu %i %i\n", t->id, t->type, t->task_start, t->task_finish, t->node_executed, t->thread_executed);
+            }   
+        }
+        fclose(file);
+    }
+    MPI_Barrier(s.comm);
+}
+#endif
 
 //Need to clean up everything.
 //    free(parts);
diff --git a/examples/test_qr_mpi.c b/examples/test_qr_mpi.c
index ca83d75c14013be19488347ab050365855f2033a..c77c46c773da1b775217bdf73cbcc0f6abb73155 100644
--- a/examples/test_qr_mpi.c
+++ b/examples/test_qr_mpi.c
@@ -720,7 +720,7 @@ for(i = 0; i < s.count_ranks; i++)
         file = fopen("task_timers.tks", "a");
         for(j = 0; j < s.task_ranks[s.count_ranks]; j++)
         {
-            if(s.tasks[j].node_executed == s.rank && s.tasks[j].type != -101)
+            if(s.tasks[j].node_executed == s.rank)
             {
                 struct task *t = &s.tasks[j];
                 fprintf(file, "%lli %i %llu %llu %i %i\n", t->id, t->type, t->task_start, t->task_finish, t->node_executed, t->thread_executed);
diff --git a/src/Makefile.am b/src/Makefile.am
index 4d39c981ed5423256ec3562b3f1dfa6aa12461f5..b40775a0defa29b3e1aee5b501424fa80c6d2bf9 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -36,7 +36,7 @@ libquicksched_la_SOURCES = qsched.c queue.c
 #libquickschedMPI_la_CC = mpicc
 libquickschedMPI_la_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
     -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) -DTIMERS -std=gnu99 \
-    -DWITH_MPI 
+    -DWITH_MPI
 #-DTASK_TIMERS
 libquickschedMPI_la_SOURCES = qsched.c queue.c
 
diff --git a/src/qsched.c b/src/qsched.c
index b520f48f8e5ff4a1ed9e5e263d93889c9cb7883d..2be6b1751cfaf78c563d166035ceec80e4408e42 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -1632,12 +1632,16 @@ void *temp;
     t->nr_locks = 0;
     t->nr_uses = 0;
     t->id = id;
+    #ifdef TASK_TIMERS
+        t->node_executed = -1;
+        t->thread_executed = -1;
+    #endif
     
     /* Add a relative pointer to the data. */
     memcpy( &ts->data[ ts->count_data ] , data , data_size );
     t->data = &ts->data[ ts->count_data ] - ts->data;
     ts->count_data += data_size2;
-    
+
     /* Increase the task counter. */
     ts->count += 1;
     
@@ -2565,6 +2569,8 @@ for(i = 0; i < count; i++)
 
 
                 /* Update data_pos to the latest parent task in the top order. */
+            /* We know we have the data correct as of the latest parent task in the topological order. */
+            if(data_pos[getindex(t->locks[j], s)] < last_index)
                 data_pos[getindex(t->locks[j], s)] = last_index;
                 
                 sends_added+=1;
@@ -2763,6 +2769,8 @@ for(i = 0; i < count; i++)
 
 
                 /* Update data_pos to the latest parent task in the top order. */
+            /* We know we have the data correct as of the latest parent task in the topological order. */
+            if(data_pos[getindex(t->uses[j], s)] < last_index)
                 data_pos[getindex(t->uses[j], s)] = last_index;
                 
                 sends_added+=1;
@@ -2857,15 +2865,15 @@ void qsched_partition( struct qsched *s){
     int i, j;
     struct task *t;
     int errors;
-//    ticks tic, toc;
+    ticks tic, toc;
    // struct res *r;
 
     res_costs = (idx_t*) calloc(s->res_ranks[s->count_ranks], sizeof(idx_t));
     
-//    tic = getticks();
+    tic = getticks();
     qsched_partition_compute_costs( s, res_costs);
-//    toc = getticks();
-//    message("qsched_partition_compute_costs took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+    toc = getticks();
+    message("qsched_partition_compute_costs took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
 
     //All reduce res_costs;
     #if IDXTYPEWIDTH == 32
@@ -2900,10 +2908,10 @@ void qsched_partition( struct qsched *s){
     for(i = 0; i < s->res_ranks[s->count_ranks]; i++)
         pos_in_nodelist[i] = -1;
 
-//    tic = getticks();
+    tic = getticks();
     qsched_partition_build_nodelist(s, nodelist, noderef, &node_count, res_costs, pos_in_nodelist);
-//    toc = getticks();
-   // message("qsched_partition_build_nodelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+    toc = getticks();
+    message("qsched_partition_build_nodelist took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
 
 
 //Build an edgelist where edges are of weight += task->weight for each task that locks both. If noderef doesn't contain, recurse until we find the ones it does contain (yuck). Build a set of adjacency lists.
@@ -2933,10 +2941,10 @@ for(i = 0; i < node_count; i++)
     edge_sizes[i] = initial_size;
 }
 
-//    tic = getticks();
+    tic = getticks();
     qsched_partition_build_edgelist(s, edge_vwgts, edge_lists, edge_counts, edge_sizes, node_count, noderef, pos_in_nodelist);
-//    toc = getticks();
-//    message("qsched_partition_build_edgelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+    toc = getticks();
+    message("qsched_partition_build_edgelist took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
 
 idx_t edgelist_size = 0;
 for(i = 0; i < node_count; i++)
@@ -2970,7 +2978,7 @@ for(i = 0; i < node_count; i++)
     tic = getticks();
     qsched_partition_build_edgelist(s, edgelist, node_count, noderef, pos_in_nodelist);
     toc = getticks();
-    message("qsched_partition_build_edgelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+    message("qsched_partition_build_edgelist took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
 
 #endif
  //   free(pos_in_nodelist);
@@ -3001,7 +3009,7 @@ for(i = 0; i < node_count; i++)
             edgelist_size++;
     }
     toc = getticks();
-    message("Checking number of elements in new edgelist took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+    message("Checking number of elements in new edgelist took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
 
 
     //Make the new contiguous edge list.
@@ -3023,7 +3031,7 @@ for(i = 0; i < node_count; i++)
     tic = getticks();
     qsched_partition_edgelist_squash(s, edgelist_pos, edgelist_new, edgelist_vwgt, &edgelist_count, edgelist, node_count);
     toc = getticks();
-    message("qsched_partition_edgelist_squash took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+    message("qsched_partition_edgelist_squash took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
 #endif
    
     /*#if IDXTYPEWIDTH == 32
@@ -3053,13 +3061,13 @@ for(i = 0; i < node_count; i++)
         if(nodeIDs == NULL)
             error("Failed to allocate nodeIDs");
         idx_t temp_count_ranks = s->count_ranks;
-//        tic = getticks();
+        tic = getticks();
         if(s->count_ranks > 1) {
             if( METIS_PartGraphKway(&node_count, &one, edgelist_pos, edgelist_new, nodelist, NULL, edgelist_vwgt, &temp_count_ranks, NULL, NULL,options, objval, nodeIDs) != METIS_OK)
                 error("Failed to partition\n");
         }
-//            toc = getticks();
-//    message("METIS_PartGraphKway took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+           toc = getticks();
+    message("METIS_PartGraphKway took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
         //TODO Check the costs.
 //        message("node_count = %i", node_count);
         long long int count_me = 0;
@@ -3077,7 +3085,7 @@ for(i = 0; i < node_count; i++)
 
 
 
-//    tic = getticks();
+    tic = getticks();
 if(s->count_ranks > 1)
 {
     MPI_Request *reqs;
@@ -3188,14 +3196,14 @@ if(s->count_ranks > 1)
             if(temp->node == s->rank && temp->data == NULL)
                 error("Local resource has data set to NULL");
         }
-//    toc = getticks();
-//    message("qsched_partition synchronizing resources took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+    toc = getticks();
+    message("qsched_partition synchronizing resources took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
         // Move the tasks time!
         // Tasks belong to the node of the resource they lock of highest size.
         // If they don't lock any resources, the resources they use of highest size.
         // Everyone does it for all tasks at the moment...
         // TODO ISSUE: Whoever is assigned bigger resources will run more tasks - not balanced well. Minimises communication, less relevant.
-//    tic = getticks();
+    tic = getticks();
         for(i = 0; i < s->task_ranks[s->count_ranks]; i++)
         {
             struct task *t = &s->tasks[i];
@@ -3229,8 +3237,8 @@ if(s->count_ranks > 1)
         }
 
 
-//    toc = getticks();
-//    message("qsched_partition task \"movement\" took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+    toc = getticks();
+    message("qsched_partition task \"movement\" took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
 
          //First we need to do a topological sort.
     int count = s->task_ranks[s->count_ranks];
@@ -3258,10 +3266,10 @@ if(s->count_ranks > 1)
     }
 
     //Now we just need to create the send/recv tasks from the dependencies.
-//    tic = getticks();
+    tic = getticks();
     qsched_partition_create_sends( s, tid);
-//    toc = getticks();
-//    message("qsched_partition_create_sends took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+    toc = getticks();
+    message("qsched_partition_create_sends took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
 
     free(edge_lists);
     free(edge_sizes);
@@ -3367,10 +3375,10 @@ void qsched_prepare_mpi( struct qsched *s){
 //        toc = getticks();
    //     message("Cleaning up scheduler took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
         }
-//    tic = getticks();
+    tic = getticks();
     qsched_partition(s);
-//    toc = getticks();
- //   message("qsched_partition took %lli (= %e) ticks\n", toc-tic, (float)(toc-tic));
+    toc = getticks();
+    message("qsched_partition took %lli (= %.3fms) ticks\n", toc-tic, (float)(toc-tic)/2.67e6);
 
 long long int *tid = malloc(sizeof(long long int) * s->task_ranks[s->count_ranks]);
 if(tid == NULL)
@@ -3456,30 +3464,6 @@ if(tid == NULL)
                 num_recv++;
             qsched_enqueue( s , t );
         }
-        //TODO Remove this debug stuff.
-      /* if(t->flags & task_flag_skip && t->type == task_type_send && t->id == 886394)
-        {
-            int* data = (int*)&s->data[ t->data ];
-            int to = data[1];
-            int from = data[0];
-            int tag = data[4];
-            long long int resid = (((long long int)data[2]) << 32) + data[3];
-            struct res *res = &s->res[getindex(resid, s)];
-            int res_owner = res->node;
-            if(to != s->rank)
-            {
-                message("Not sending task to %i from %i with tag %i, id = %lli", to, from,tag, t->id);
-            }
-            if(to == s->rank)
-            {
-                message("This task should come to %i from %i with tag %i, id = %lli", from , to,tag, t->id);
-            }
-                message("Resource owner is node %i", res_owner);
-        }
-
-
-    message("Initial setup %i send tasks", num_send);
-    message("Initial setup %i recv tasks", num_recv);*/
 
         }
     /* Clean up. */
@@ -3493,7 +3477,6 @@ if(tid == NULL)
         if(!(t->flags & task_flag_skip) )
             s->waiting++;
     }
-//    printf("s->waiting = %i, count = %i\n", s->waiting, count);
         
     /* Set the ready flag. */
     s->flags |= qsched_flag_ready;
@@ -3639,8 +3622,8 @@ void qsched_run_MPI ( struct qsched *s, int nr_threads, qsched_funtype fun ) {
     /* Prepare the scheduler*/
         qsched_prepare_mpi( s );
     ticks toc = getticks();
-printf("Rank[%i]: qsched_prepare_mpi took %lli (= %e) ticks\n", s->rank,
-         toc - tic, (float)(toc - tic));
+printf("Rank[%i]: qsched_prepare_mpi took %lli (= %.3fms) ticks\n", s->rank,
+         toc - tic, (float)(toc - tic)/2.67e6);
 
 #if defined( HAVE_PTHREAD )