diff --git a/examples/test_bh.c b/examples/test_bh.c
index afb42f4bd358aaa67dc98927ad25ee0d0b596bdd..f417eaeb3d6c54f27fcbc94fe7726950759b1f8a 100644
--- a/examples/test_bh.c
+++ b/examples/test_bh.c
@@ -54,8 +54,8 @@ struct part {
   union {
     float a[3];
     float a_legacy[3];
-    float a_exact[3];
   };
+    float a_exact[3];
   float mass;
   int id;
 };  // __attribute__((aligned(32)));
@@ -1472,7 +1472,7 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
   }
   message("Average number of parts per leaf is %f.", ((double)N) / nr_leaves);
 
-#if ICHECK > 0
+//#if ICHECK > 0
   printf("----------------------------------------------------------\n");
 
   /* Do a N^2 interactions calculation */
@@ -1485,7 +1485,7 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
          toc_exact - tic_exact, (float)(toc_exact - tic_exact));
 
   printf("----------------------------------------------------------\n");
-#endif
+//#endif
 
   /* Create the tasks. */
   tic = getticks();
@@ -1590,7 +1590,7 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
   printf("] ticks.\n");
 
   /* Dump the particles to a file */
-  file = fopen("particle_dump.dat", "w");
+  file = fopen("particle_dump.out", "w");
   fprintf(file,
           "# ID m x y z a_exact.x   a_exact.y    a_exact.z    a_legacy.x    "
           "a_legacy.y    a_legacy.z    a_new.x     a_new.y    a_new.z\n");
diff --git a/examples/test_bh_mpi.c b/examples/test_bh_mpi.c
index 9bf72a527e7bbbcc79807bdddde2b620e4f09228..ed72750192b6a1601843f02f7d7d6194f4655858 100644
--- a/examples/test_bh_mpi.c
+++ b/examples/test_bh_mpi.c
@@ -56,19 +56,19 @@
 /** Data structure for the particles. */
 struct part {
   double x[3];
-  union {
+  //union {
     float a[3];
     float a_legacy[3];
     #ifndef EXACT
     float a_exact[3];
     #endif
-  };
+  //};
     #ifdef EXACT
     float a_exact[3];
     #endif
   float mass;
   int id;
-};  // __attribute__((aligned(32)));
+};  // __attribute__((aligned(32))); TODO Why is part not aligned? Becuase we don't use it in inner loop I assume.
 
 struct part_local {
   float x[3];
@@ -369,22 +369,24 @@ static inline void iact_pair_direct(struct qsched *s, struct cell *ci, struct ce
       ++count_direct_unsorted;
 #endif
 
-#if ICHECK >= 0
-      if (parts_i[i].id == ICHECK)
+#if ICHECK != -1 && 0
+      if (parts_i[i].id == ICHECK  && j == 0)
         /*message(
             "[NEW] Interaction with particle id= %d (pair i) a=( %e %e %e )",
             cj->parts[j].id, -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);*/
-        message(
+        /*message(
             "%d %e %e %e",
-            cj->parts[j].id, -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);
+            cj->parts[j].id, -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);*/
+        message("Particle %d (=%i)interacts (pair_direct) with cell %.3f %.3f %.3f of size %.3f", ci->parts[i].id, ICHECK, cj->loc[0], cj->loc[1], cj->loc[2], cj->h);
 
-      if (cj->parts[j].id == ICHECK)
+      if (cj->parts[j].id == ICHECK && i == 0)
         /*message(
             "[NEW] Interaction with particle id= %d (pair j) a=( %e %e %e )",
             parts_i[i].id, mj * w * dx[0], mj * w * dx[1], mj * w * dx[2]);*/
-        message(
+/*        message(
             "%d %e %e %e",
-            parts_i[i].id, mj * w * dx[0], mj * w * dx[1], mj * w * dx[2]);
+            parts_i[i].id, mj * w * dx[0], mj * w * dx[1], mj * w * dx[2]);*/
+        message("Particle %i (=%i) interacts (pair_direct) with cell %.3f %.3f %.3f of size %.3f", cj->parts[j].id,ICHECK, ci->loc[0], ci->loc[1], ci->loc[2], ci->h);
 #endif
 
     } /* loop over every other particle. */
@@ -463,18 +465,16 @@ void iact_self_direct(struct qsched *s, struct cell *c) {
           ai[k] -= wdx * mj;
         }
 
-#if ICHECK >= 0
-        if (c->parts[i].id == ICHECK)
-/*          message("[NEW] Interaction with particle id= %d (self i)  a=( %e %e %e )",
-                  c->parts[j].id,  -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);*/
-          message("%d %e %e %e",
-                  c->parts[j].id,  -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);
-
-        if (c->parts[j].id == ICHECK)
-/*          message("[NEW] Interaction with particle id= %d (self j) a =( %e %e %e )",
-                  c->parts[i].id,  -mj * w * dx[0], -mj * w * dx[1], -mj * w * dx[2]);*/
-          message("%d %e %e %e",
-                  c->parts[i].id,  -mj * w * dx[0], -mj * w * dx[1], -mj * w * dx[2]);
+#if ICHECK != 0 && 0
+        if (c->parts[i].id == ICHECK && j == 0)
+/*          message("[NEW] Interaction with particle id= %d (self i)",
+                  c->parts[j].id);*/
+        message("Particle %d (=%i)interacts (self_direct) with cell %.3f %.3f %.3f of size %.3f", c->parts[i].id, ICHECK, c->loc[0], c->loc[1], c->loc[2], c->h); 
+
+        if (c->parts[j].id == ICHECK && i == 0)
+/*          message("[NEW] Interaction with particle id= %d (self j)",
+                  c->parts[i].id);*/
+        message("Particle %d (=%i)interacts (self_direct) with cell %.3f %.3f %.3f of size %.3f", c->parts[j].id, ICHECK, c->loc[0], c->loc[1], c->loc[2], c->h); 
 #endif
 
       } /* loop over every other particle. */
@@ -536,6 +536,10 @@ static inline void iact_pair_pc(struct qsched *s, struct cell *ci, struct cell *
     /* Clean up local parts? */
     for (int k = 0; k < part_count; k++) {
         for (int j = 0; j < 3; j++) ci->parts[k].a[j] += parts[k].a[j];
+      #if ICHECK != 0 && 0
+            if(ci->parts[k].id == ICHECK)
+            message("Part-cell (particle cell is %.3f %.3f %.3f h=%.3f) task with cell %.3f %.3f %.3f h = %.3f",ci->loc[0],ci->loc[1],ci->loc[2],ci->h, cj->loc[0], cj->loc[1], cj->loc[2], cj->h);
+      #endif
     }
     free(parts);
 }
@@ -821,70 +825,74 @@ void create_pcs(struct qsched *s, struct cell *ci, struct cell *cj, int depth, i
     {
         for (cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling) {
             cp1 = (struct cell*) qsched_getresdata(s, cp);
+            if(cp1->split /*&& depth < leaf_depth*/)
+                create_pcs(s, cp1, NULL, depth+1, leaf_depth);
                 /* Recurse over all pairs. */
             for (cps = cp1->sibling; cps != ci->sibling; cps = cp2->sibling)
             {
                 cp2 = (struct cell*) qsched_getresdata(s, cps);
-                create_pcs(s, cp1, cp2, depth+1, leaf_depth);
+                if(cp1->split && cp2->split)
+                    create_pcs(s, cp1, cp2, depth+1, leaf_depth);
             }
         }
         /* Once we get back here we're done!*/
-        return;
-    }
 
-    /* If we're not yet at the level of making tasks */
-    if(depth < leaf_depth)
-    {
-        for (cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling) {
-            cp1 = (struct cell*) qsched_getresdata(s, cp);
-                /* Recurse over all pairs. */
-            for (cps = cj->firstchild; cps != cj->sibling; cps = cp2->sibling)
-            {
-                cp2 = (struct cell*) qsched_getresdata(s, cps);
-                create_pcs(s, cp1, cp2, depth+1, leaf_depth);
-            }
-        }
     }else{
-        /* We can make tasks if they're not neighbours! */
-        if(are_neighbours(ci, cj))
+
+        /* If we're not yet at the level of making tasks */
+        if(depth < leaf_depth)
         {
-            /* If they are neighbours we need to recurse if possible. If either is not split then we do the pair interaction so we can stop.*/
-            if(ci->split && cj->split)
+            for (cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling) {
+                cp1 = (struct cell*) qsched_getresdata(s, cp);
+                    /* Recurse over all pairs. */
+                for (cps = cj->firstchild; cps != cj->sibling; cps = cp2->sibling)
+                {
+                    cp2 = (struct cell*) qsched_getresdata(s, cps);
+                    create_pcs(s, cp1, cp2, depth+1, leaf_depth);
+                }
+            }
+        }else{
+            /* We can make tasks if they're not neighbours! */
+            if(are_neighbours(ci, cj))
             {
-                for (cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling) 
+                /* If they are neighbours we need to recurse if possible. If either is not split then we do the pair interaction so we can stop.*/
+                if(ci->split && cj->split)
                 {
-                    cp1 = (struct cell*) qsched_getresdata(s, cp);
-                    /* Recurse over all pairs. */
-                    for (cps = cj->firstchild; cps != cj->sibling; cps = cp2->sibling)
+                    for (cp = ci->firstchild; cp != ci->sibling; cp = cp1->sibling) 
                     {
-                        cp2 = (struct cell*) qsched_getresdata(s, cps);
-                        create_pcs(s, cp1, cp2, depth+1, leaf_depth);
+                        cp1 = (struct cell*) qsched_getresdata(s, cp);
+                        /* Recurse over all pairs. */
+                        for (cps = cj->firstchild; cps != cj->sibling; cps = cp2->sibling)
+                        {
+                            cp2 = (struct cell*) qsched_getresdata(s, cps);
+                            create_pcs(s, cp1, cp2, depth+1, leaf_depth);
+                        }
                     }
                 }
+            }else
+            {
+                /* Create the task. */
+                data[0] = ci->res;
+                data[1] = cj->res;
+                tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
+			           sizeof(qsched_task_t) * 2, ci->count * 8 );
+          
+                /* Add the resource and dependance */
+                qsched_addlock(s, tid, ci->res_parts);
+                qsched_adduse(s, tid, ci->res);
+                qsched_adduse(s, tid, cj->res);
+
+                /* Create the task. */
+                data[0] = cj->res;
+                data[1] = ci->res;
+                tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
+			           sizeof(qsched_task_t) * 2, cj->count * 8 );
+          
+                /* Add the resource and dependance */
+                qsched_addlock(s, tid, cj->res_parts);
+                qsched_adduse(s, tid, cj->res);
+                qsched_adduse(s, tid, ci->res);
             }
-        }else
-        {
-            /* Create the task. */
-            data[0] = ci->res;
-            data[1] = cj->res;
-            tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
-			       sizeof(qsched_task_t) * 2, ci->count * 8 );
-      
-            /* Add the resource and dependance */
-            qsched_addlock(s, tid, ci->res_parts);
-            qsched_adduse(s, tid, ci->res);
-            qsched_adduse(s, tid, cj->res);
-
-            /* Create the task. */
-            data[0] = cj->res;
-            data[1] = ci->res;
-            tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
-			       sizeof(qsched_task_t) * 2, cj->count * 8 );
-      
-            /* Add the resource and dependance */
-            qsched_addlock(s, tid, cj->res_parts);
-            qsched_adduse(s, tid, cj->res);
-            qsched_adduse(s, tid, ci->res);
         }
     }
 }
@@ -1244,7 +1252,7 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
 if(s.rank == 0)
 {
   parts_res = qsched_addres(&s, qsched_owner_none, sizeof(struct part) * N, (void**) &parts);
-
+    srand(6);
   /* If no input file was specified, generate random particle positions. */
   if (fileName == NULL || fileName[0] == 0) {
     for (k = 0; k < N; k++) {
@@ -1291,12 +1299,19 @@ if(s.rank == 0)
      per leaf. */
   struct cell *c = root;
   int nr_leaves = 0;
-  while (c != NULL && (c->sibling > 0 || c->firstchild > 0)) {
+  while (c != NULL /*&& (c->sibling > 0 || c->firstchild > 0)*/) {
     if (!c->split) {
       nr_leaves++;
+      if(c->sibling > 0)
       c = (struct cell*) qsched_getresdata( &s, c->sibling );
+      else
+      c = NULL;
+
     } else {
+      if(c->firstchild > 0)
       c = (struct cell*) qsched_getresdata( &s, c->firstchild );
+      else
+      c = NULL;
     }
   }
   message("There are %i leaves.", nr_leaves);
@@ -1344,12 +1359,13 @@ if(s.rank == 0)
     }
     message("leaf_depth = %i", leaf_depth);
     message("tasks before = %i", s.count);
-    create_pcs(&s, root, NULL, 0, leaf_depth-1);    
+    create_pcs(&s, root, NULL, 0, 1);    
     message("tasks after = %i", s.count);
 }
 printf("s.count = %i\n", s.count);
 //Mark all the schedulers dirty so they actually synchronize.
 s.flags |= qsched_flag_dirty;
+message("nr_threads = %i", nr_threads);
 qsched_run_MPI(&s, nr_threads, runner);
   // Print off a hello world message
     printf("Hello world from processor %s, rank = %i, count_ranks = %i\n",
@@ -1359,7 +1375,7 @@ if(s.rank == 0)
 {
     file = fopen("particle_dump.out", "w");
   /*fprintf(file,
-          "ID m x y z a_exact.x   a_exact.y    a_exact.z    a_legacy.x    "
+          "# ID m x y z a_exact.x   a_exact.y    a_exact.z    a_legacy.x    "
           "a_legacy.y    a_legacy.z    a_new.x     a_new.y    a_new.z\n");*/
 //    fprintf(file, "");
     fclose(file);
diff --git a/examples/test_bh_new.c b/examples/test_bh_new.c
new file mode 100644
index 0000000000000000000000000000000000000000..4439b828c2f7652112c5a8b736a8f0f737a324c2
--- /dev/null
+++ b/examples/test_bh_new.c
@@ -0,0 +1,1657 @@
+    /*******************************************************************************
+ * This file is part of QuickSched.
+ * Coypright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+* *****************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Standard includes. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <omp.h>
+#include <fenv.h>
+
+/* Local includes. */
+#include "quicksched.h"
+
+/* Some local constants. */
+#define cell_pool_grow 1000
+#define cell_maxparts 50
+#define task_limit 1//e7
+#define const_G 1    // 6.6738e-8
+#define dist_min 0.5 /* Used for legacy walk only */
+#define dist_cutoff_ratio 1.5
+
+#define ICHECK -1
+#define NO_SANITY_CHECKS
+#define COM_AS_TASK
+#define NO_COUNTERS
+#define NO_QUADRUPOLES
+
+double itpms = 1000.0/3.1e9;
+
+/** Data structure for the particles. */
+struct part {
+  double x[3];
+  //  union {
+    float a[3];
+    float a_legacy[3];
+    float a_exact[3];
+  //  };
+  float mass;
+  int id;
+};  // __attribute__((aligned(32)));
+
+struct part_local {
+  float x[3];
+  float a[3];
+  float mass;
+} __attribute__((aligned(32)));
+
+struct multipole {
+
+#ifdef QUADRUPOLES
+  double I_xx;
+  double I_yy;
+  double I_zz;
+  double I_xy;
+  double I_xz;
+  double I_yz;
+#endif
+
+  double com[3];
+  float mass;
+};
+
+struct multipole_local {
+
+#ifdef QUADRUPOLES
+  float I_xx;
+  float I_yy;
+  float I_zz;
+  float I_xy;
+  float I_xz;
+  float I_yz;
+#endif
+
+  float com[3];
+  float mass;
+};
+
+
+/** Data structure for the BH tree cell. */
+struct cell {
+  double loc[3];
+  double h;
+
+  int count;
+  unsigned short int split, sorted;
+  struct part *parts;
+
+  struct cell *firstchild; /* Next node if opening */
+  struct cell *sibling;    /* Next node */
+
+  /* We keep both CoMs and masses to make sure the comp_com calculation is
+   * correct (use an anonymous union to keep variable names compact).  */
+  union {
+
+    /* Information for the legacy walk */
+    struct multipole legacy;
+
+    /* Information for the QuickShed walk */
+    struct multipole new;
+  };
+
+  int res, com_tid;
+  struct index *indices;
+
+} __attribute__((aligned(64)));
+
+/** Task types. */
+enum task_type {
+  task_type_self = 0,
+  task_type_pair,
+  task_type_pair_pc,
+  task_type_com,
+  task_type_count
+};
+
+#ifdef COUNTERS
+int count_direct_unsorted;
+int count_direct_sorted_pp;
+int count_direct_sorted_pm_i;
+int count_direct_sorted_pm_j;
+#endif
+
+/** Per-type timers. */
+ticks task_timers[task_type_count];
+
+/** Global variable for the pool of allocated cells. */
+struct cell *cell_pool = NULL;
+
+/**
+ * @brief Get a #cell from the pool.
+ */
+struct cell *cell_get() {
+
+  struct cell *res;
+  int k;
+
+  /* Allocate a new batch? */
+  if (cell_pool == NULL) {
+
+    /* Allocate the cell array. */
+    if ((cell_pool =
+             (struct cell *)calloc(cell_pool_grow, sizeof(struct cell))) ==
+        NULL)
+      error("Failed to allocate fresh batch of cells.");
+
+    /* Link them up via their progeny pointers. */
+    for (k = 1; k < cell_pool_grow; k++)
+      cell_pool[k - 1].firstchild = &cell_pool[k];
+  }
+
+  /* Pick a cell off the pool. */
+  res = cell_pool;
+  cell_pool = cell_pool->firstchild;
+
+  /* Clean up a few things. */
+  res->res = qsched_res_none;
+  res->firstchild = 0;
+
+  /* Return the cell. */
+  return res;
+}
+
+/**
+ * @brief Compute the center of mass of a given cell.
+ *
+ * @param c The #cell.
+ */
+void comp_com(struct cell *c) {
+
+  int k, count = c->count;
+  struct cell *cp;
+  struct part *parts = c->parts;
+  double com[3] = {0.0, 0.0, 0.0}, mass = 0.0, imass = 0.0;
+
+#ifdef QUADRUPOLES
+  double I_xx = 0.;
+  double I_yy = 0.;
+  double I_zz = 0.;
+  double I_xy = 0.;
+  double I_xz = 0.;
+  double I_yz = 0.;
+#endif
+
+#ifdef SANITY_CHECKS
+  if ( count == 0 )
+    error("CoM computed for an empty cell");
+#endif
+  
+  if (c->split) {
+
+    /* Loop over the projenitors and collect the multipole information. */
+    for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
+      float cp_mass = cp->new.mass;
+      com[0] += cp->new.com[0] * cp_mass;
+      com[1] += cp->new.com[1] * cp_mass;
+      com[2] += cp->new.com[2] * cp_mass;
+      mass += cp_mass;
+
+#ifdef QUADRUPOLES
+      I_xx += cp->new.I_xx;
+      I_yy += cp->new.I_yy;
+      I_zz += cp->new.I_zz;
+      I_xy += cp->new.I_xy;
+      I_xz += cp->new.I_xz;
+      I_yz += cp->new.I_yz;
+#endif 
+
+    }
+
+  /* Otherwise, collect the multipole from the particles. */
+  } else {
+
+    for (k = 0; k < count; k++) {
+      float p_mass = parts[k].mass;
+      com[0] += parts[k].x[0] * p_mass;
+      com[1] += parts[k].x[1] * p_mass;
+      com[2] += parts[k].x[2] * p_mass;
+      mass += p_mass;
+
+#ifdef QUADRUPOLES
+      I_xx += parts[k].x[0] * parts[k].x[0] * p_mass;
+      I_yy += parts[k].x[1] * parts[k].x[1] * p_mass;
+      I_zz += parts[k].x[2] * parts[k].x[2] * p_mass;
+      I_xy += parts[k].x[0] * parts[k].x[1] * p_mass;
+      I_xz += parts[k].x[0] * parts[k].x[2] * p_mass;
+      I_yz += parts[k].x[1] * parts[k].x[2] * p_mass;
+#endif 
+
+    }
+
+  }
+
+  /* Store the multipole data */
+  imass = 1.0f / mass;
+  c->new.mass = mass;
+  c->new.com[0] = com[0] * imass;
+  c->new.com[1] = com[1] * imass;
+  c->new.com[2] = com[2] * imass;
+
+
+#ifdef QUADRUPOLES
+  /* Store the quadrupole data */
+  c->new.I_xx = I_xx;
+  c->new.I_yy = I_yy;
+  c->new.I_zz = I_zz;
+  c->new.I_xy = I_xy;
+  c->new.I_xz = I_xz;
+  c->new.I_yz = I_yz;
+#endif
+
+}
+
+/**
+ * @brief Sort the parts into eight bins along the given pivots and
+ *        fill the multipoles. Also adds the hierarchical resources
+ *        to the sched.
+ *
+ * @param c The #cell to be split.
+ * @param N The total number of parts.
+ * @param s The #sched to store the resources.
+ */
+void cell_split(struct cell *c, struct qsched *s) {
+
+  int i, j, k, kk, count = c->count;
+  struct part temp, *parts = c->parts;
+  struct cell *cp;
+  int left[8], right[8];
+  double pivot[3];
+  static struct cell *root = NULL;
+  struct cell *progenitors[8];
+
+  /* Set the root cell. */
+  if (root == NULL) {
+    root = c;
+    c->sibling = 0;
+  }
+
+  /* Add a resource for this cell if it doesn't have one yet. */
+  if (c->res == qsched_res_none)
+    c->res = qsched_addres_local(s, qsched_owner_none, qsched_res_none);
+
+/* Attach a center-of-mass task to the cell. */
+#ifdef COM_AS_TASK
+  if (count > 0)
+    c->com_tid = qsched_addtask_local(s, task_type_com, task_flag_none, &c,
+                                sizeof(struct cell *), 1);
+#endif
+
+  /* Does this cell need to be split? */
+  if (count > cell_maxparts) {
+
+    /* Mark this cell as split. */
+    c->split = 1;
+
+    /* Create the progeny. */
+    for (k = 0; k < 8; k++) {
+      progenitors[k] = cp = cell_get();
+      cp->loc[0] = c->loc[0];
+      cp->loc[1] = c->loc[1];
+      cp->loc[2] = c->loc[2];
+      cp->h = c->h * 0.5;
+      cp->res = qsched_addres_local(s, qsched_owner_none, c->res);
+      if (k & 4) cp->loc[0] += cp->h;
+      if (k & 2) cp->loc[1] += cp->h;
+      if (k & 1) cp->loc[2] += cp->h;
+    }
+
+    /* Init the pivots. */
+    for (k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->h * 0.5;
+
+    /* Split along the x-axis. */
+    i = 0;
+    j = count - 1;
+    while (i <= j) {
+      while (i <= count - 1 && parts[i].x[0] < pivot[0]) i += 1;
+      while (j >= 0 && parts[j].x[0] >= pivot[0]) j -= 1;
+      if (i < j) {
+        temp = parts[i];
+        parts[i] = parts[j];
+        parts[j] = temp;
+      }
+    }
+    left[1] = i;
+    right[1] = count - 1;
+    left[0] = 0;
+    right[0] = j;
+
+    /* Split along the y axis, twice. */
+    for (k = 1; k >= 0; k--) {
+      i = left[k];
+      j = right[k];
+      while (i <= j) {
+        while (i <= right[k] && parts[i].x[1] < pivot[1]) i += 1;
+        while (j >= left[k] && parts[j].x[1] >= pivot[1]) j -= 1;
+        if (i < j) {
+          temp = parts[i];
+          parts[i] = parts[j];
+          parts[j] = temp;
+        }
+      }
+      left[2 * k + 1] = i;
+      right[2 * k + 1] = right[k];
+      left[2 * k] = left[k];
+      right[2 * k] = j;
+    }
+
+    /* Split along the z axis, four times. */
+    for (k = 3; k >= 0; k--) {
+      i = left[k];
+      j = right[k];
+      while (i <= j) {
+        while (i <= right[k] && parts[i].x[2] < pivot[2]) i += 1;
+        while (j >= left[k] && parts[j].x[2] >= pivot[2]) j -= 1;
+        if (i < j) {
+          temp = parts[i];
+          parts[i] = parts[j];
+          parts[j] = temp;
+        }
+      }
+      left[2 * k + 1] = i;
+      right[2 * k + 1] = right[k];
+      left[2 * k] = left[k];
+      right[2 * k] = j;
+    }
+
+    /* Store the counts and offsets. */
+    for (k = 0; k < 8; k++) {
+      progenitors[k]->count = right[k] - left[k] + 1;
+      progenitors[k]->parts = &c->parts[left[k]];
+    }
+
+    /* Find the first non-empty progenitor */
+    for (k = 0; k < 8; k++)
+      if (progenitors[k]->count > 0) {
+        c->firstchild = progenitors[k];
+        break;
+      }
+
+#ifdef SANITY_CHECKS
+    if (c->firstchild == NULL)
+      error("Cell has been split but all progenitors have 0 particles");
+#endif
+
+    /* Prepare the pointers. */
+    for (k = 0; k < 8; k++) {
+
+      /* Find the next non-empty sibling */
+      for (kk = k + 1; kk < 8; ++kk) {
+        if (progenitors[kk]->count > 0) {
+          progenitors[k]->sibling = progenitors[kk];
+          break;
+        }
+      }
+
+      /* No non-empty sibling ? Go back a level */
+      if (kk == 8) progenitors[k]->sibling = c->sibling;
+    }
+
+    /* Recurse. */
+    for (k = 0; k < 8; k++)
+      if (progenitors[k]->count > 0) cell_split(progenitors[k], s);
+
+#ifdef COM_AS_TASK
+    /* Link the COM tasks. */
+    for (k = 0; k < 8; k++)
+      if (progenitors[k]->count > 0)
+        qsched_addunlock(s, progenitors[k]->com_tid, c->com_tid);
+#endif
+
+  } /* does the cell need to be split? */
+
+
+#ifndef COM_AS_TASK
+  /* Compute the cell's center of mass. */
+  comp_com(c);
+#endif
+
+  /* Set this cell's resources ownership. */
+  qsched_res_own(s, c->res,
+                 s->nr_queues * (c->parts - root->parts) / root->count);
+}
+
+/* -------------------------------------------------------------------------- */
+/* New tree walk */
+/* -------------------------------------------------------------------------- */
+
+
+/**
+ * @brief Checks whether the cells are direct neighbours ot not. Both cells have
+ * to be of the same size
+ *
+ */
+static inline int are_neighbours(struct cell *ci, struct cell *cj) {
+
+#ifdef SANITY_CHECKS
+  if (ci->h != cj->h)
+    error(" Cells of different size in distance calculation.");
+#endif
+
+  /* Maximum allowed distance */
+  float min_dist = ci->h;
+
+  /* (Manhattan) Distance between the cells */
+  for (int k = 0; k < 3; k++) {
+    float center_i = ci->loc[k];
+    float center_j = cj->loc[k];
+    if (fabsf(center_i - center_j) > min_dist) return 0;
+  }
+
+  return 1;
+}
+
+
+
+
+/**
+ * @brief Interacts all count particles in parts
+ * with a series of multipoles
+ *
+ * @param parts An array of local particles
+ * @param count The number of particles in the array
+ * @param mults The multipoles to interact with
+ * @param mult_count The number of multipoles to interact with
+ */
+static inline void make_interact_pc(struct part_local *parts, int part_count, 
+				    struct multipole_local mults[8], int mult_count) {
+
+  int i, j, k;
+  float dx[3] = {0.0, 0.0, 0.0}, r2, ir, mrinv3;
+
+#ifdef SANITY_CHECKS
+
+  /* Sanity checks */
+  if (part_count == 0) error("Empty cell!");
+
+  /* Sanity check. */
+  for (j = 0; j < mult_count; j++)
+    if (mults[j].mass == 0.0) 
+      error("com %d does not seem to have been set.", j);
+
+#endif
+
+  /* Loop over every particle in leaf. */
+  for (i = 0; i < part_count; i++) {
+
+    /* Loop over the multipoles */
+    for (j = 0; j < mult_count; j++) {
+      
+      /* Compute the pairwise distance. */
+      for (r2 = 0.0, k = 0; k < 3; k++) {
+	dx[k] = mults[j].com[k] - parts[i].x[k];
+	r2 += dx[k] * dx[k];
+      }
+
+      /* Apply the gravitational acceleration. */
+      ir = 1.0f / sqrtf(r2);
+      mrinv3 = mults[j].mass * const_G * ir * ir * ir;
+
+#ifndef QUADRUPOLES
+
+      parts[i].a[0] += mrinv3 * dx[0];
+      parts[i].a[1] += mrinv3 * dx[1];
+      parts[i].a[2] += mrinv3 * dx[2];
+
+#else
+
+      /* Follows the notation in Bonsai */
+      float mrinv5 = mrinv3 * ir * ir;
+      float mrinv7 = mrinv5 * ir * ir;
+
+      float D1 = -mrinv3;
+      float D2 =  3.f * mrinv5;
+      float D3 = -15.f * mrinv7;
+
+      float q = mults[j].I_xx + mults[j].I_yy + mults[j].I_zz;
+      float qRx = mults[j].I_xx*dx[0] + mults[j].I_xy*dx[1] + mults[j].I_xz*dx[2];
+      float qRy = mults[j].I_xy*dx[0] + mults[j].I_yy*dx[1] + mults[j].I_yz*dx[2];
+      float qRz = mults[j].I_xz*dx[0] + mults[j].I_yz*dx[1] + mults[j].I_zz*dx[2];
+      float qRR = qRx * dx[0] + qRy * dx[1] + qRz * dx[2];
+      float C = D1 + 0.5f * D2 * q + 0.5f * D3 * qRR;
+
+      parts[i].a[0] -= C * dx[0] + D2 * qRx;
+      parts[i].a[1] -= C * dx[1] + D2 * qRy;
+      parts[i].a[2] -= C * dx[2] + D2 * qRz;
+
+
+#endif
+
+    } /* loop over every multipole. */
+  } /* loop over every particle. */
+}
+
+
+
+
+/**
+ * @brief Loops all siblings of cells ci and cj and launches all
+ * the cell-monopole interactions that can be done between the siblings
+ *
+ * @param ci The first cell
+ * @param cj The second cell
+ */
+static inline void iact_pair_pc(struct cell *ci, struct cell *cj) {
+
+  struct part_local *parts = NULL;
+  struct multipole_local mult_local[8]; 
+  struct cell *cp, *cps;
+  int part_count, mult_count;
+  
+#ifdef SANITY_CHECKS
+  if (ci->h != cj->h)
+    error(" Cells of different size !.");
+
+  if ( ! are_neighbours(ci, cj) ) 
+     error(" Cells that are not neighbours !");
+#endif
+
+  if(!are_neighbours(ci, cj) )
+  {
+        part_count = ci->count;
+        /* Get local copies of the particle data in cp. */     
+        if ((parts = (struct part_local *) malloc(sizeof(struct part_local) * part_count)) == NULL)
+	    error("Failed to allocate local parts.");
+    for (int k = 0; k < part_count; k++) {
+      for (int j = 0; j < 3; j++) {
+	parts[k].x[j] = ci->parts[k].x[j] - ci->loc[j];
+	parts[k].a[j] = 0.0f;
+      }
+      parts[k].mass = ci->parts[k].mass;
+    }
+   
+    mult_count = 1;
+    mult_local[mult_count].com[0] = cj->new.com[0] - ci->loc[0];
+	mult_local[mult_count].com[1] = cj->new.com[1] - ci->loc[1];
+	mult_local[mult_count].com[2] = cj->new.com[2] - ci->loc[2];
+	mult_local[mult_count].mass = cj->new.mass;
+        /* Perform the interaction between the particles and the list of multipoles */
+    make_interact_pc(parts, part_count, mult_local, mult_count);
+
+    /* Clean up local parts? */
+    for (int k = 0; k < part_count; k++) {
+      for (int j = 0; j < 3; j++) ci->parts[k].a[j] += parts[k].a[j];
+      #if ICHECK != 0 && 0
+            if(ci->parts[k].id == ICHECK)
+            message("Part-cell task with cell %.3f %.3f %.3f h = %.3f", cj->loc[0], cj->loc[1], cj->loc[2], cj->h);
+      #endif
+    }
+    free(parts);
+    }else{
+
+  /* Let's loop over all siblings of ci */
+  for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
+
+    part_count = cp->count;
+
+    /* Get local copies of the particle data in cp. */     
+    if ((parts = (struct part_local *) malloc(sizeof(struct part_local) * part_count)) == NULL)
+	error("Failed to allocate local parts.");
+
+    for (int k = 0; k < part_count; k++) {
+      for (int j = 0; j < 3; j++) {
+	parts[k].x[j] = cp->parts[k].x[j] - cp->loc[j];
+	parts[k].a[j] = 0.0f;
+      }
+      parts[k].mass = cp->parts[k].mass;
+    }
+   
+    mult_count = 0;
+
+    /* Now loop over all the other cells */
+    for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
+
+      /* And build the list of multipoles to interact with */
+      if ( ! are_neighbours(cp, cps) ) {
+
+	mult_local[mult_count].com[0] = cps->new.com[0] - cp->loc[0];
+	mult_local[mult_count].com[1] = cps->new.com[1] - cp->loc[1];
+	mult_local[mult_count].com[2] = cps->new.com[2] - cp->loc[2];
+	mult_local[mult_count].mass = cps->new.mass;
+
+#ifdef QUADRUPOLES
+
+	/* Final operation to get the quadrupoles */
+	double imass = 1. / cps->new.mass;
+	mult_local[mult_count].I_xx = cps->new.I_xx*imass - cps->new.com[0] * cps->new.com[0];
+	mult_local[mult_count].I_xy = cps->new.I_xy*imass - cps->new.com[0] * cps->new.com[1];
+	mult_local[mult_count].I_xz = cps->new.I_xz*imass - cps->new.com[0] * cps->new.com[2];
+	mult_local[mult_count].I_yy = cps->new.I_yy*imass - cps->new.com[1] * cps->new.com[1];
+	mult_local[mult_count].I_yz = cps->new.I_yz*imass - cps->new.com[1] * cps->new.com[2];
+	mult_local[mult_count].I_zz = cps->new.I_zz*imass - cps->new.com[2] * cps->new.com[2];
+#endif	
+
+	mult_count++;
+      }
+    }
+
+    /* Perform the interaction between the particles and the list of multipoles */
+    make_interact_pc(parts, part_count, mult_local, mult_count);
+
+    /* Clean up local parts? */
+    for (int k = 0; k < part_count; k++) {
+      for (int j = 0; j < 3; j++) cp->parts[k].a[j] += parts[k].a[j];
+      #if ICHECK != 0 && 0
+            if(ci->parts[k].id == ICHECK)
+            message("Part-cell task with cell %.3f %.3f %.3f h = %.3f", cj->loc[0], cj->loc[1], cj->loc[2], cj->h);
+      #endif
+    }
+    free(parts);
+  }
+    }
+}
+
+
+
+
+/**
+ * @brief Interacts all particles in a cell ci with all particles in another
+ * cell cj using a simple double for-loop.
+ *
+ * @param ci The first cell
+ * @param cj The second cell
+ */
+
+static inline void iact_pair_direct(struct cell *ci, struct cell *cj) {
+
+  int i, j, k;
+  int count_i = ci->count, count_j = cj->count;
+  float xi[3];
+  float dx[3], ai[3], mi, mj, r2, w, ir;
+  double com[3];
+  struct part_local parts_j[count_j];
+  struct part *parts_i = ci->parts;
+
+#ifdef SANITY_CHECKS
+
+  /* Bad stuff will happen if cell sizes are different */
+  if (ci->h != cj->h)
+    error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h);
+
+#endif
+
+  /* Find the center point of the interaction. */
+  for (k = 0; k < 3; k++) {
+    com[k] = 0.5 * (ci->loc[k] + cj->loc[k]);
+  }
+
+  /* Init the local copies of the particles. */
+  for (k = 0; k < count_j; k++) {
+    for (j = 0; j < 3; j++) {
+      parts_j[k].x[j] = cj->parts[k].x[j] - com[j];
+      parts_j[k].a[j] = 0.0f;
+    }
+    parts_j[k].mass = cj->parts[k].mass;
+  }
+
+  /* Loop over all particles in ci... */
+  for (i = 0; i < count_i; i++) {
+
+    /* Init the ith particle's data. */
+    for (k = 0; k < 3; k++) {
+      xi[k] = parts_i[i].x[k] - com[k];
+      ai[k] = 0.0f;
+    }
+    mi = parts_i[i].mass;
+
+    /* Loop over every particle in the other cell. */
+    for (j = 0; j < count_j; j++) {
+
+      /* Compute the pairwise distance. */
+      for (r2 = 0.0, k = 0; k < 3; k++) {
+        dx[k] = xi[k] - parts_j[j].x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Apply the gravitational acceleration. */
+      ir = 1.0f / sqrtf(r2);
+      w = const_G * ir * ir * ir;
+      mj = parts_j[j].mass;
+      for (k = 0; k < 3; k++) {
+        float wdx = w * dx[k];
+        parts_j[j].a[k] += wdx * mi;
+        ai[k] -= wdx * mj;
+      }
+
+#ifdef COUNTERS
+      ++count_direct_unsorted;
+#endif
+
+#if ICHECK != -1 && 0
+      if (parts_i[i].id == ICHECK && j == 0)
+        /*printf(
+            "[NEW] Interaction with particle id= %d (pair i) a=( %e %e %e )\n",
+            parts_j[j].id, -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);*/
+        message("Particle %d (=%i)interacts (pair_direct) with cell %.3f %.3f %.3f of size %.3f", ci->parts[i].id, ICHECK, cj->loc[0], cj->loc[1], cj->loc[2], cj->h); 
+
+      if (cj->parts[j].id == ICHECK && i == 0)
+/*        printf(
+            "[NEW] Interaction with particle id= %d (pair j) a=( %e %e %e )\n",
+            parts_i[i].id, mj * w * dx[0], mj * w * dx[1], mj * w * dx[2]);*/
+        message("Particle %i (=%i) interacts (pair_direct) with cell %.3f %.3f %.3f of size %.3f", cj->parts[j].id,ICHECK, ci->loc[0], ci->loc[1], ci->loc[2], ci->h);
+#endif
+
+    } /* loop over every other particle. */
+
+    /* Store the accumulated acceleration on the ith part. */
+    for (k = 0; k < 3; k++) parts_i[i].a[k] += ai[k];
+
+  } /* loop over all particles. */
+
+  /* Copy the local particle data back. */
+  for (k = 0; k < count_j; k++) {
+    for (j = 0; j < 3; j++) cj->parts[k].a[j] += parts_j[k].a[j];
+  }
+}
+
+
+
+/**
+ * @brief Decides whether two cells use the direct summation interaction or the
+ * multipole interactions
+ *
+ * @param ci The #cell.
+ * @param cj The other #cell.
+ */
+void iact_pair(struct cell *ci, struct cell *cj) {
+
+  struct cell *cp, *cps;
+
+#ifdef SANITY_CHECKS
+
+  /* Early abort? */
+  if (ci->count == 0 || cj->count == 0) error("Empty cell !");
+
+  /* Bad stuff will happen if cell sizes are different */
+  if (ci->h != cj->h)
+    error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h);
+
+  /* Sanity check */
+  if (ci == cj)
+    error("The impossible has happened: pair interaction between a cell and "
+          "itself.");
+
+  /* Are the cells direct neighbours? */
+  if (!are_neighbours(ci, cj)) 
+    error("Non-neighbouring cells !");
+
+#endif
+
+
+  /* Are both cells split ? */
+  if (ci->split && cj->split) {
+
+    /* Let's split both cells and build all possible pairs */
+    for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
+      for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
+	
+	/* If the cells are neighbours, recurse. */
+	if (are_neighbours(cp, cps)) {
+	  iact_pair(cp, cps);
+	}
+      }
+    }
+    
+    /* Build all possible cell-multipole interaction pairs */
+    iact_pair_pc(ci, cj);
+    iact_pair_pc(cj, ci);
+
+  } else { /* Otherwise, compute the interactions at this level directly. */
+    iact_pair_direct(ci, cj);
+  }
+}
+
+
+
+/**
+ * @brief Interacts all particles in cell c with themselves
+ * either by recursing or by direct summation
+ *
+ * @param c The #cell containing the particles
+ */
+void iact_self_direct(struct cell *c) {
+  int i, j, k, count = c->count;
+  float xi[3] = {0.0, 0.0, 0.0};
+  float ai[3] = {0.0, 0.0, 0.0}, mi, mj, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
+  struct cell *cp, *cps;
+
+#ifdef SANITY_CHECKS
+
+  /* Early abort? */
+  if (count == 0) error("Empty cell !");
+
+#endif
+
+  /* If the cell is split, interact each progeny with itself, and with
+     each of its siblings. */
+  if (c->split) {
+    for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
+      iact_self_direct(cp);
+      for (cps = cp->sibling; cps != c->sibling; cps = cps->sibling)
+        iact_pair(cp, cps);
+    }
+
+    /* Otherwise, compute the interactions directly. */
+  } else {
+
+    /* Init the local copies of the particles. */
+    double loc[3];
+    struct part_local parts[count];
+    loc[0] = c->loc[0];
+    loc[1] = c->loc[1];
+    loc[2] = c->loc[2];
+    for (k = 0; k < count; k++) {
+      for (j = 0; j < 3; j++) {
+        parts[k].x[j] = c->parts[k].x[j] - loc[j];
+        parts[k].a[j] = 0.0f;
+      }
+      parts[k].mass = c->parts[k].mass;
+    }
+
+    /* Loop over all particles... */
+    for (i = 0; i < count; i++) {
+
+      /* Init the ith particle's data. */
+      for (k = 0; k < 3; k++) {
+        xi[k] = parts[i].x[k];
+        ai[k] = 0.0;
+      }
+      mi = parts[i].mass;
+
+      /* Loop over every following particle. */
+      for (j = i + 1; j < count; j++) {
+
+        /* Compute the pairwise distance. */
+        for (r2 = 0.0, k = 0; k < 3; k++) {
+          dx[k] = xi[k] - parts[j].x[k];
+          r2 += dx[k] * dx[k];
+        }
+
+        /* Apply the gravitational acceleration. */
+        ir = 1.0f / sqrtf(r2);
+        w = const_G * ir * ir * ir;
+        mj = parts[j].mass;
+        for (k = 0; k < 3; k++) {
+          float wdx = w * dx[k];
+          parts[j].a[k] += wdx * mi;
+          ai[k] -= wdx * mj;
+        }
+
+#if ICHECK != 0 && 0
+        if (c->parts[i].id == ICHECK && j == 0)
+/*          message("[NEW] Interaction with particle id= %d (self i)",
+                  c->parts[j].id);*/
+        message("Particle %d (=%i)interacts (self_direct) with cell %.3f %.3f %.3f of size %.3f", c->parts[i].id, ICHECK, c->loc[0], c->loc[1], c->loc[2], c->h); 
+
+        if (c->parts[j].id == ICHECK && i == 0)
+/*          message("[NEW] Interaction with particle id= %d (self j)",
+                  c->parts[i].id);*/
+        message("Particle %d (=%i)interacts (self_direct) with cell %.3f %.3f %.3f of size %.3f", c->parts[j].id, ICHECK, c->loc[0], c->loc[1], c->loc[2], c->h); 
+#endif
+
+      } /* loop over every other particle. */
+
+      /* Store the accumulated acceleration on the ith part. */
+      for (k = 0; k < 3; k++) parts[i].a[k] += ai[k];
+
+    } /* loop over all particles. */
+
+    /* Copy the local particle data back. */
+    for (k = 0; k < count; k++) {
+      for (j = 0; j < 3; j++) c->parts[k].a[j] += parts[k].a[j];
+    }
+  } /* otherwise, compute interactions directly. */
+}
+
+
+/**
+ * @brief Create the tasks for the cell pair/self.
+ *
+ * @param s The #sched in which to create the tasks.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
+
+  qsched_task_t tid;
+  struct cell *data[2], *cp, *cps;
+
+#ifdef SANITY_CHECKS
+
+  /* If either cell is empty, stop. */
+  if (ci->count == 0 || (cj != NULL && cj->count == 0)) error("Empty cell !");
+
+#endif
+
+  /* Single cell? */
+  if (cj == NULL) {
+
+    /* Is this cell split and above the task limit ? */
+    if (ci->split && ci->count > task_limit / ci->count) {
+
+      /* Loop over each of this cell's progeny. */
+      for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
+
+        /* Make self-interaction task. */
+        create_tasks(s, cp, NULL);
+
+        /* Make all pair-interaction tasks. */
+        for (cps = cp->sibling; cps != ci->sibling; cps = cps->sibling)
+          create_tasks(s, cp, cps);
+      }
+
+      /* Otherwise, add a self-interaction task. */
+    } else {
+
+      /* Set the data. */
+      data[0] = ci;
+      data[1] = NULL;
+
+      /* Create the task. */
+      tid =  qsched_addtask_local(s, task_type_self, task_flag_none, data,
+			    sizeof(struct cell *) * 2, ci->count * ci->count / 2);
+
+      /* Add the resource (i.e. the cell) to the new task. */
+      qsched_addlock(s, tid, ci->res);
+
+/* Since this call might recurse, add a dependency on the cell's COM
+   task. */
+#ifdef COM_AS_TASK
+      if (ci->split) qsched_addunlock(s, ci->com_tid, tid);
+#endif
+    }
+
+    /* Otherwise, it's a pair. */
+  } else {
+
+    /* Are both cells split and we are above the task limit ? */
+    if (ci->split && cj->split && ci->count > task_limit / cj->count) {
+      
+      /* Let's split both cells and build all possible pairs */
+      for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
+	for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
+	  
+	  /* Recurse */
+	  if (are_neighbours(cp, cps)) {
+	    create_tasks(s, cp, cps);
+	  }
+	}
+      }
+
+      /* Let's also build particle-monopole tasks */
+	
+      /* Create the task. */
+      data[0] = ci;
+      data[1] = cj;
+      tid = qsched_addtask_local(s, task_type_pair_pc, task_flag_none, data,
+			   sizeof(struct cell *) * 2, ci->count * 8 );
+      
+      /* Add the resource and dependance */
+      qsched_addlock(s, tid, ci->res);
+
+#ifdef COM_AS_TASK
+      qsched_addunlock(s, cj->com_tid, tid);
+#endif
+
+
+      /* Create the task. */
+      data[0] = cj;
+      data[1] = ci;
+      tid = qsched_addtask_local(s, task_type_pair_pc, task_flag_none, data,
+			   sizeof(struct cell *) * 2, cj->count * 8);
+
+      /* Add the resource and dependance */
+      qsched_addunlock(s, ci->com_tid, tid);
+
+#ifdef COM_AS_TASK
+      qsched_addlock(s, tid, cj->res);
+#endif
+     
+    } else {  /* Otherwise, at least one of the cells is not split, build a direct
+	       * interaction. */
+
+      /* Set the data. */
+      data[0] = ci;
+      data[1] = cj;
+      
+      /* Create the task. */
+      tid = qsched_addtask_local(s, task_type_pair, task_flag_none, data,
+			   sizeof(struct cell *) * 2, ci->count * cj->count);
+      
+      /* Add the resources. */
+      qsched_addlock(s, tid, ci->res);
+      qsched_addlock(s, tid, cj->res);
+
+    }
+
+  }   /* Otherwise it's a pair */
+}
+
+/* -------------------------------------------------------------------------- */
+/* Legacy tree walk */
+/* -------------------------------------------------------------------------- */
+
+/**
+ * @brief Compute the center of mass of a given cell recursively.
+ *
+ * @param c The #cell.
+ */
+void legacy_comp_com(struct cell *c, int *countCoMs) {
+
+  int k, count = c->count;
+  struct cell *cp;
+  struct part *p, *parts = c->parts;
+  double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
+
+  ++(*countCoMs);
+
+  /* Is the cell split? */
+  if (c->split) {
+
+    /* Loop over the progeny. */
+    for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
+      /* Recurse */
+      legacy_comp_com(cp, countCoMs);
+
+      /* Collect multipole information */
+      float cp_mass = cp->legacy.mass;
+      com[0] += cp->legacy.com[0] * cp_mass;
+      com[1] += cp->legacy.com[1] * cp_mass;
+      com[2] += cp->legacy.com[2] * cp_mass;
+      mass += cp_mass;
+    }
+
+    /* Otherwise, collect the multipole from local data. */
+  } else {
+
+    for (k = 0; k < count; k++) {
+      p = &parts[k];
+      float p_mass = p->mass;
+      com[0] += p->x[0] * p_mass;
+      com[1] += p->x[1] * p_mass;
+      com[2] += p->x[2] * p_mass;
+      mass += p_mass;
+    }
+  }
+
+  /* Finish multipole calculation */
+  if (mass > 0.0) {
+    float imass = 1.0f / mass;
+    c->legacy.com[0] = com[0] * imass;
+    c->legacy.com[1] = com[1] * imass;
+    c->legacy.com[2] = com[2] * imass;
+    c->legacy.mass = mass;
+  } else {
+    c->legacy.com[0] = 0.0;
+    c->legacy.com[1] = 0.0;
+    c->legacy.com[2] = 0.0;
+    c->legacy.mass = 0.0;
+  }
+}
+
+/**
+ * @brief Interacts a particle with a cell recursively using the original B-H
+ * tree walk procedure
+ *
+ * @param parts The array of particles
+ * @param i The particle of interest
+ * @param root The root of the tree under which we will search.
+ * @param monitor If set to @c parts[i].id, will produce debug output when
+ *        ICHECK is set.
+ * @param cell The cell the particle interacts with
+ */
+void legacy_interact(struct part *parts, int i, struct cell *root, int monitor,
+                     int *countMultipoles, int *countPairs) {
+
+  int j, k;
+  float r2, dx[3], ir, w;
+  float a[3] = {0.0, 0.0, 0.0};
+  double pix[3] = {parts[i].x[0], parts[i].x[1], parts[i].x[2]};
+  int pid = parts[i].id;
+  struct cell *cell = root;
+
+  /* Traverse the cells of the tree. */
+  while (cell != NULL) {
+
+    /* Are we in a leaf ? */
+    if (!cell->split) {
+
+      /* Interact the particle with the particles in the leaf */
+      for (j = 0; j < cell->count; ++j) {
+        if (cell->parts[j].id == pid) continue;
+
+        /* Compute the pairwise distance. */
+        for (r2 = 0.0, k = 0; k < 3; k++) {
+          dx[k] = cell->parts[j].x[k] - pix[k];
+          r2 += dx[k] * dx[k];
+        }
+
+        /* Apply the gravitational acceleration. */
+        ir = 1.0f / sqrtf(r2);
+        w = cell->parts[j].mass * const_G * ir * ir * ir;
+        for (k = 0; k < 3; k++) a[k] += w * dx[k];
+
+        (*countPairs)++;
+
+#if ICHECK >= 0 && 0
+        if (pid == monitor)
+          message("[BH_] Interaction with particle id= %d a=( %e %e %e ) h= %f "
+                  "loc=( %e %e %e )\n",
+                  cell->parts[j].id, w * dx[0], w * dx[1], w * dx[2], cell->h,
+                  cell->loc[0], cell->loc[1], cell->loc[2]);
+#endif
+      }
+
+      cell = cell->sibling;
+    } else {
+
+      /* We are in a node */
+      for (r2 = 0.0, k = 0; k < 3; k++) {
+        dx[k] = cell->legacy.com[k] - pix[k];
+        r2 += dx[k] * dx[k];
+      }
+
+#if ICHECK >= 0 && 0
+      if (pid == monitor)
+        message("This is a node with %d particles h= %f. r= %f theta= %f",
+                cell->count, cell->h, sqrt(r2), dist_min);
+#endif
+
+      /* Is the cell far enough ? */
+      if (dist_min * dist_min * r2 < cell->h * cell->h) {
+
+#if ICHECK >= 0 && 0
+        if (pid == monitor) printf("Recursing...\n");
+#endif
+        cell = cell->firstchild;
+        continue;
+      }
+
+#if ICHECK >= 0 && 0  
+      if (pid == monitor)
+        message("[BH_] Can interact with the monopole. x= %f %f %f m= %f h= %f",
+                cell->legacy.com[0], cell->legacy.com[1], cell->legacy.com[2],
+                cell->legacy.mass, cell->h);
+#endif
+
+      /* Apply the gravitational acceleration. */
+      ir = 1.0f / sqrtf(r2);
+      w = cell->legacy.mass * const_G * ir * ir * ir;
+      for (k = 0; k < 3; k++) a[k] += w * dx[k];
+
+      (*countMultipoles)++;
+
+      /* Move to the next node */
+      cell = cell->sibling;
+    }
+  }
+
+  /* Store the locally computed acceleration back to the particle. */
+  for (k = 0; k < 3; k++) parts[i].a_legacy[k] += a[k];
+}
+
+/**
+ * @brief Does a tree walk as in the B-H original work for all particles
+ *
+ * @param N The number of particles
+ * @param parts The array of particles
+ * @param root The root cell of the tree
+ * @param monitor ID of the particle to monitor and output interactions to
+ *        stdout
+ */
+void legacy_tree_walk(int N, struct part *parts, struct cell *root, int monitor,
+                      int *countMultipoles, int *countPairs, int *countCoMs) {
+
+  int i;
+
+  /* Compute multipoles (recursively) */
+  legacy_comp_com(root, countCoMs);
+
+  //#pragma omp parallel for
+  for (i = 0; i < N; ++i) {
+    if (parts[i].id == monitor)
+      message("tree walk for particle %d x= %f %f %f", parts[i].id,
+              parts[i].x[0], parts[i].x[1], parts[i].x[2]);
+
+    legacy_interact(parts, i, root, monitor, countMultipoles, countPairs);
+
+    if (parts[i].id == monitor)
+      message("[check] legacy acceleration for particle %d a= %.3e %.3e %.3e",
+              parts[i].id, parts[i].a_legacy[0], parts[i].a_legacy[1],
+              parts[i].a_legacy[2]);
+  }
+}
+
+/* -------------------------------------------------------------------------- */
+/* Exact interaction */
+/* -------------------------------------------------------------------------- */
+
+/**
+ * @brief Solve the particle interactions using the stupid N^2 algorithm
+ *
+ * @param N The number of particles
+ * @param parts The array of particles
+ */
+void interact_exact(int N, struct part *parts, int monitor) {
+
+  int i, j, k;
+  float ir, w, r2, dx[3];
+
+  /* Loop over all particles. */
+  for (i = 0; i < N; ++i) {
+
+    /* Some things to store locally. */
+    double pix[3] = {parts[i].x[0], parts[i].x[1], parts[i].x[2]};
+    float mi = parts[i].mass;
+
+    /* Loop over every other particle. */
+    for (j = i + 1; j < N; ++j) {
+
+      /* Compute the pairwise distance. */
+      for (r2 = 0.0, k = 0; k < 3; k++) {
+        dx[k] = parts[j].x[k] - pix[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Apply the gravitational acceleration. */
+      ir = 1.0f / sqrtf(r2);
+      w = const_G * ir * ir * ir;
+
+      for (k = 0; k < 3; k++) {
+        float wdx = w * dx[k];
+        parts[j].a_exact[k] -= wdx * mi;
+        parts[i].a_exact[k] += wdx * parts[j].mass;
+      }
+    }
+  }
+
+#if ICHECK >= 0 && 0
+  for (i = 0; i < N; ++i)
+    if (parts[i].id == monitor)
+      message("[check] exact acceleration for particle %d a= %.3e %.3e %.3e\n",
+              parts[i].id, parts[i].a_exact[0], parts[i].a_exact[1],
+              parts[i].a_exact[2]);
+#endif
+}
+
+/**
+ * @brief Set up and run a task-based Barnes-Hutt N-body solver.
+ *
+ * @param N The number of random particles to use.
+ * @param nr_threads Number of threads to use.
+ * @param runs Number of force evaluations to use as a benchmark.
+ * @param fileName Input file name. If @c NULL or an empty string, random
+ *        particle positions will be used.
+ */
+void test_bh(int N, int nr_threads, int runs, char *fileName) {
+
+  int i, k;
+  struct cell *root;
+  struct part *parts;
+  FILE *file, *fileTime;
+  struct qsched s;
+  ticks tic, toc_run, tot_setup = 0, tot_run = 0;
+  int countMultipoles = 0, countPairs = 0, countCoMs = 0;
+  char buffer[200];
+
+
+  /* Runner function. */
+  void runner(int type, void * data) {
+
+    ticks tic = getticks();
+
+    /* Decode the data. */
+    struct cell **d = (struct cell **)data;
+
+    /* Decode and execute the task. */
+    switch (type) {
+      case task_type_self:
+        iact_self_direct(d[0]);
+        break;
+      case task_type_pair:
+        iact_pair(d[0], d[1]);
+        break;
+      case task_type_pair_pc:
+	iact_pair_pc(d[0], d[1]);
+        break;
+      case task_type_com:
+        comp_com(d[0]);
+        break;
+      default:
+        error("Unknown task type.");
+    }
+
+    atomic_add(&task_timers[type], getticks() - tic);
+  }
+
+  /* Initialize the per-task type timers. */
+  for (k = 0; k < task_type_count; k++) task_timers[k] = 0;
+
+  /* Initialize the scheduler. */
+  qsched_init(&s, nr_threads, qsched_flag_noreown | qsched_flag_pthread);
+    srand(6);
+  /* Init and fill the particle array. */
+  if ((parts = (struct part *)malloc(sizeof(struct part) * N)) == NULL)
+    error("Failed to allocate particle buffer.");
+
+  /* If no input file was specified, generate random particle positions. */
+  if (fileName == NULL || fileName[0] == 0) {
+    for (k = 0; k < N; k++) {
+      parts[k].id = k;
+      parts[k].x[0] = ((double)rand()) / RAND_MAX;
+      parts[k].x[1] = ((double)rand()) / RAND_MAX;
+      parts[k].x[2] = ((double)rand()) / RAND_MAX;
+      parts[k].mass = ((double)rand()) / RAND_MAX;
+      parts[k].a_legacy[0] = 0.0;
+      parts[k].a_legacy[1] = 0.0;
+      parts[k].a_legacy[2] = 0.0;
+    }
+
+    /* Otherwise, read them from a file. */
+  } else {
+    file = fopen(fileName, "r");
+    if (file) {
+      for (k = 0; k < N; k++) {
+        if (fscanf(file, "%d", &parts[k].id) != 1)
+          error("Failed to read ID of part %i.", k);
+        if (fscanf(file, "%lf%lf%lf", &parts[k].x[0], &parts[k].x[1],
+                   &parts[k].x[2]) !=
+            3)
+          error("Failed to read position of part %i.", k);
+        if (fscanf(file, "%f", &parts[k].mass) != 1)
+          error("Failed to read mass of part %i.", k);
+      }
+      fclose(file);
+    }
+  }
+
+  /* Init the cells. */
+  root = cell_get();
+  root->loc[0] = 0.0;
+  root->loc[1] = 0.0;
+  root->loc[2] = 0.0;
+  root->h = 1.0;
+  root->count = N;
+  root->parts = parts;
+  cell_split(root, &s);
+
+  /* Iterate over the cells and get the average number of particles
+     per leaf. */
+  struct cell *c = root;
+  int nr_leaves = 0;
+  while (c != NULL) {
+    if (!c->split) {
+      nr_leaves++;
+      c = c->sibling;
+    } else {
+      c = c->firstchild;
+    }
+  }
+  message("There are %i leaves.", nr_leaves);
+  message("Average number of parts per leaf is %f.", ((double)N) / nr_leaves);
+
+//#if ICHECK > 0
+  printf("----------------------------------------------------------\n");
+
+  /* Do a N^2 interactions calculation */
+
+  ticks tic_exact = getticks();
+#ifdef EXACT
+  interact_exact(N, parts, ICHECK);
+#endif
+  ticks toc_exact = getticks();
+
+  printf("Exact calculation (1 thread) took %lli (= %e) ticks\n",
+         toc_exact - tic_exact, (float)(toc_exact - tic_exact));
+
+  printf("----------------------------------------------------------\n");
+//#endif
+
+  /* Create the tasks. */
+  tic = getticks();
+  create_tasks(&s, root, NULL);
+  tot_setup += getticks() - tic;
+
+  /* Dump the number of tasks. */
+  message("total nr of tasks: %i.", s.count);
+  message("total nr of deps: %i.", s.count_deps);
+  message("total nr of res: %i.", s.count_res);
+  message("total nr of locks: %i.", s.count_locks);
+  message("total nr of uses: %i.", s.count_uses);
+  int counts[task_type_count];
+  for (k = 0; k < task_type_count; k++) counts[k] = 0;
+  for (k = 0; k < s.count; k++) counts[s.tasks[k].type] += 1;
+
+  if (nr_threads == 1) {
+
+    /* Prepare for timing test */
+    sprintf(buffer, "timings_legacy_%dM.dat", N/1000000);
+    fileTime = fopen(buffer, "a");
+    tot_run = 0.;
+    omp_set_num_threads(nr_threads);
+    
+    
+    /* Loop over the number of runs. */
+    for (k = 0; k < 1; k++) {
+      
+      countMultipoles = 0;
+      countPairs = 0;
+      countCoMs = 0;
+      
+      /* Execute the legacy walk. */
+      tic = getticks();
+      /*legacy_tree_walk(N, parts, root, ICHECK, &countMultipoles, &countPairs,
+		       &countCoMs);*/
+      toc_run = getticks();
+      
+      /* Dump some timings. */
+      message("%ith legacy run took %lli (= %e) ticks... = %e ms", k, toc_run - tic,
+	      (float)(toc_run - tic), (double)(toc_run - tic) * itpms);
+      tot_run += toc_run - tic;
+    }
+    
+    message("Time: %fms", (tot_run / runs) * itpms);
+    fprintf(fileTime, "%d %d %d %f\n", N, nr_threads, runs, (tot_run / runs) * itpms);
+    fclose(fileTime);
+
+  }
+
+  printf("task counts: [ %10s %10s %10s %10s %10s ]\n", "self", "pair", "m-poles",
+         "direct", "CoMs");
+  printf("task counts: [ %10i %10i %10i %10i %10i ] (legacy).\n", 0, 0,
+         countMultipoles, countPairs, countCoMs);
+  printf("task counts: [ %10i %10i %10i %10i %10i ] (new).\n", counts[0],
+         counts[1], counts[2], 0,  counts[3]);
+
+
+
+  /* Prepare for timing test */
+  sprintf(buffer, "timings_new_%dM.dat", N/1000000);
+  fileTime = fopen(buffer, "a");
+  tot_run = 0.;
+
+  /* Dry run */
+  //qsched_run(&s, nr_threads, runner);
+
+  /* Loop over the number of runs. */
+  for (k = 0; k < runs; k++) {
+
+    for (i = 0; i < N; ++i) {
+      parts[i].a[0] = 0.0;
+      parts[i].a[1] = 0.0;
+      parts[i].a[2] = 0.0;
+    }
+
+    /* Execute the tasks. */
+    tic = getticks();
+    qsched_run(&s, nr_threads, runner);
+    toc_run = getticks();
+    message("%ith run took %lli (= %e) ticks... = %e ms", k, toc_run - tic,
+            (float)(toc_run - tic), (double)(toc_run - tic) * itpms);
+    tot_run += toc_run - tic;
+  }
+
+  message("Time: %fms", (tot_run / runs) * itpms);
+  fprintf(fileTime, "%d %d %d %f\n", N, nr_threads, runs, (tot_run / runs) * itpms);
+  fclose(fileTime);
+
+
+  message("[check] root mass= %f %f", root->legacy.mass, root->new.mass);
+  message("[check] root CoMx= %f %f", root->legacy.com[0], root->new.com[0]);
+  message("[check] root CoMy= %f %f", root->legacy.com[1], root->new.com[1]);
+  message("[check] root CoMz= %f %f", root->legacy.com[2], root->new.com[2]);
+
+
+#if ICHECK >= 0 && 0
+  for (i = 0; i < N; ++i)
+    if (root->parts[i].id == ICHECK)
+      message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK,
+              root->parts[i].a[0], root->parts[i].a[1], root->parts[i].a[2]);
+#endif
+
+  /* Dump the tasks. */
+  FILE* taskFile = fopen("output_bh_new.out", "w");
+  for ( k = 0 ; k < s.count ; k++ )
+    fprintf(taskFile, " %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid ,
+     s.tasks[k].tic , s.tasks[k].toc );
+  fclose(taskFile);
+
+
+  /* Dump the costs. */
+  message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup,
+          tot_run / runs);
+
+  /* Dump the timers. */
+  for (k = 0; k < qsched_timer_count; k++)
+    message("timer %s is %lli ticks.", qsched_timer_names[k], s.timers[k] / runs);
+
+
+  /* Dump the per-task type timers. */
+  printf("task timers: [ ");
+  for (k = 0; k < task_type_count; k++) printf("%lli ", task_timers[k] / runs);
+  printf("] ticks.\n");
+
+  /* Dump the particles to a file */
+  file = fopen("particle_dump.dat", "w");
+  /*fprintf(file,
+          "# ID m x y z a_exact.x   a_exact.y    a_exact.z    a_legacy.x    "
+          "a_legacy.y    a_legacy.z    a_new.x     a_new.y    a_new.z\n");*/
+  for (k = 0; k < N; ++k)
+    fprintf(file, "%d %e %e %e %e %e %e %e %e %e %e %e %e %e\n", parts[k].id,
+            parts[k].mass, parts[k].x[0], parts[k].x[1], parts[k].x[2],
+            parts[k].a_exact[0], parts[k].a_exact[1], parts[k].a_exact[2],
+            parts[k].a_legacy[0], parts[k].a_legacy[1], parts[k].a_legacy[2],
+            parts[k].a[0], parts[k].a[1], parts[k].a[2]);
+  fclose(file);
+
+  /* Clean up. */
+  qsched_free(&s);
+  free(parts);
+}
+
+/**
+ * @brief Main function.
+ */
+
+int main(int argc, char *argv[]) {
+
+  int c, nr_threads;
+  int N = 1000, runs = 1;
+  char fileName[100] = {0};
+
+  /* Die on FP-exceptions. */
+  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+/* Get the number of threads. */
+#pragma omp parallel shared(nr_threads)
+  {
+    if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
+  }
+
+  /* Parse the options */
+  while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) {
+      case 'n':
+        if (sscanf(optarg, "%d", &N) != 1)
+          error("Error parsing number of particles.");
+        break;
+      case 'r':
+        if (sscanf(optarg, "%d", &runs) != 1)
+          error("Error parsing number of runs.");
+        break;
+      case 't':
+        if (sscanf(optarg, "%d", &nr_threads) != 1)
+          error("Error parsing number of threads.");
+        omp_set_num_threads(nr_threads);
+        break;
+      case 'f':
+        if (sscanf(optarg, "%s", &fileName[0]) != 1)
+          error("Error parsing file name.");
+        break;
+      case '?':
+        fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] "
+                        "[-c Nparts] [-i Niterations] \n",
+                argv[0]);
+        fprintf(stderr, "Solves the N-body problem using a Barnes-Hut\n"
+                        "tree code with N random particles read from a file in "
+                        "[0,1]^3 using"
+                        "nr_threads threads.\n"
+                        "A test of the neighbouring cells interaction with "
+                        "Nparts per cell is also run Niterations times.\n");
+        exit(EXIT_FAILURE);
+    }
+
+  /* Tree node information */
+  printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
+
+  /* Part information */
+  printf("Size of part: %zu bytes.\n", sizeof(struct part));
+
+  /* Dump arguments. */
+  if (fileName[0] == 0) {
+    message("Computing the N-body problem over %i random particles using %i "
+            "threads (%i runs).",
+            N, nr_threads, runs);
+  } else {
+    message("Computing the N-body problem over %i particles read from '%s' "
+            "using %i threads (%i runs).",
+            N, fileName, nr_threads, runs);
+  }
+
+  /* Run the BH test. */
+  test_bh(N, nr_threads, runs, fileName);
+
+  return 0;
+}