diff --git a/examples/test_fmm_sorted.c b/examples/test_fmm.c
similarity index 51%
rename from examples/test_fmm_sorted.c
rename to examples/test_fmm.c
index ec33c0d53f412d723f93c24563d7cd0af07afc27..906fcfd4b883399677b8a7e81c59815a2bd9e734 100644
--- a/examples/test_fmm_sorted.c
+++ b/examples/test_fmm.c
@@ -37,37 +37,36 @@
 
 /* Some local constants. */
 #define cell_pool_grow 1000
-#define cell_maxparts 1
+#define cell_maxparts 100
 #define task_limit 1e8
 #define const_G 1    // 6.6738e-8
 #define dist_min 0.5 /* Used for legacy walk only */
 #define dist_cutoff_ratio 1.5
-#define iact_pair_direct iact_pair_direct_sorted_multipole
+
 #define ICHECK -1
 #define NO_SANITY_CHECKS
 #define NO_COM_AS_TASK
-//#define COUNTERS
-
+#define NO_COUNTERS
 
 /** Data structure for the particles. */
 struct part {
   double x[3];
-  // union {
-  float a[3];
-  float a_legacy[3];
-  float a_exact[3];
-  // };
+  union {
+    float a[3];
+    float a_legacy[3];
+    float a_exact[3];
+  };
   float mass;
   int id;
-};  // __attribute__((aligned(64)));
+};  // __attribute__((aligned(32)));
 
-/** Data structure for the sorted particle positions. */
-struct index {
-  int ind;
-  float d;
-};
+struct part_local {
+  float x[3];
+  float a[3];
+  float mass;
+} __attribute__((aligned(32)));
 
-struct monopole {
+struct multipole {
   double com[3];
   float mass;
 };
@@ -89,10 +88,10 @@ struct cell {
   union {
 
     /* Information for the legacy walk */
-    struct monopole legacy;
+    struct multipole legacy;
 
     /* Information for the QuickShed walk */
-    struct monopole new;
+    struct multipole new;
   };
 
   int res, com_tid;
@@ -104,116 +103,11 @@ struct cell {
 enum task_type {
   task_type_self = 0,
   task_type_pair,
-  //task_type_self_pc,
-  task_type_pair_direct,
+  task_type_self_pc,
   task_type_com,
   task_type_count
 };
 
-
-
-
-/** Multipole stuff. */
-struct multipole {
-  float mass;
-  float mu[3]; /* Centre of Mass */
-  float sigma_00, sigma_11, sigma_22, sigma_01, sigma_02, sigma_12;
-};
-
-static inline void multipole_reset(struct multipole *d) {
-  d->mu[0] = 0.0;
-  d->mu[1] = 0.0;
-  d->mu[2] = 0.0;
-
-  d->sigma_00 = 0.0;
-  d->sigma_11 = 0.0;
-  d->sigma_22 = 0.0;
-  d->sigma_01 = 0.0;
-  d->sigma_02 = 0.0;
-  d->sigma_12 = 0.0;
-
-  d->mass = 0.0;
-}
-
-static inline void multipole_add(struct multipole *d, const float *x, float mass) {
-  d->mu[0] += x[0] * mass;
-  d->mu[1] += x[1] * mass;
-  d->mu[2] += x[2] * mass;
-  
-  d->sigma_00 += mass * x[0] * x[0];
-  d->sigma_11 += mass * x[1] * x[1];
-  d->sigma_22 += mass * x[2] * x[2];
-  d->sigma_01 += mass * x[0] * x[1];
-  d->sigma_02 += mass * x[0] * x[2];
-  d->sigma_12 += mass * x[1] * x[2];
-
-  d->mass += mass;
-}
-
-
-static inline void multipole_iact(struct multipole *d, const float *x, float* a) {
-
-  int k;
-  float r2,ir,w, dx[3];
-
-  /* early abort */
-  if (d->mass == 0.) return;
-
-  float inv_mass = 1. / d->mass;
-  
-  /* Temporary quantity */
-  dx[0] = d->sigma_00 - inv_mass*d->mu[0]*d->mu[0];  /* Abusing a so far unused */
-  dx[1] = d->sigma_11 - inv_mass*d->mu[1]*d->mu[1];  /* variable temporarily... */
-  dx[2] = d->sigma_22 - inv_mass*d->mu[2]*d->mu[2];
-
-  /* Construct quadrupole matrix */
-  float Q_00 = 2.0f*dx[0] - dx[1] - dx[2]; 
-  float Q_11 = 2.0f*dx[1] - dx[0] - dx[2]; 
-  float Q_22 = Q_00 + Q_11; /* Q_ij is traceless */
-  Q_22 *= -1.0f;
-
-  float Q_01 = d->sigma_01 - inv_mass*d->mu[0]*d->mu[1];
-  float Q_02 = d->sigma_02 - inv_mass*d->mu[0]*d->mu[2];
-  float Q_12 = d->sigma_12 - inv_mass*d->mu[1]*d->mu[2];
-  Q_01 *= 3.0f;
-  Q_02 *= 3.0f;
-  Q_12 *= 3.0f;
-
-  /* Compute the distance to the CoM. */
-  for (r2 = 0.0f, k = 0; k < 3; k++) {
-    dx[k] = x[k] - d->mu[k] * inv_mass;
-    r2 += dx[k] * dx[k];
-  }
-  ir = 1.0f / sqrtf(r2);
-
-  /* Compute the acceleration from the monopole */
-  w = const_G * ir * ir * ir * d->mass;
-  for (k = 0; k < 3; k++) a[k] -= w * dx[k];
-
-  /* Compute some helpful terms */
-  float w1 = const_G * ir * ir * ir * ir * ir; 
-  float w2 = -2.5f * const_G * ir * ir * ir * ir * ir * ir * ir;
-  float xi = 0.;
-
-  xi += dx[0] * dx[0] * Q_00;
-  xi += dx[1] * dx[1] * Q_11;
-  xi += dx[2] * dx[2] * Q_22;
-  xi += 2.f * dx[0] * dx[1] * Q_01;
-  xi += 2.f * dx[0] * dx[2] * Q_02;
-  xi += 2.f * dx[1] * dx[2] * Q_12;
-
-  /* Compute the acceleration from the quadrupole */    
-  a[0] += w2 * xi * dx[0] + w1 * (dx[0]*Q_00 + dx[1]*Q_01 + dx[2]*Q_02);
-  a[1] += w2 * xi * dx[1] + w1 * (dx[0]*Q_01 + dx[1]*Q_11 + dx[2]*Q_12);
-  a[2] += w2 * xi * dx[2] + w1 * (dx[0]*Q_02 + dx[1]*Q_12 + dx[2]*Q_22);
-
-}
-
-
-
-
-
-
 #ifdef COUNTERS
 int count_direct_unsorted;
 int count_direct_sorted_pp;
@@ -227,311 +121,6 @@ ticks task_timers[task_type_count];
 /** Global variable for the pool of allocated cells. */
 struct cell *cell_pool = NULL;
 
-/** Constants for the sorting axes. */
-const float axis_shift[13 * 3] = {
-  5.773502691896258e-01, 5.773502691896258e-01, 5.773502691896258e-01,
-  7.071067811865475e-01, 7.071067811865475e-01, 0.0,
-  5.773502691896258e-01, 5.773502691896258e-01, -5.773502691896258e-01,
-  7.071067811865475e-01, 0.0, 7.071067811865475e-01,
-  1.0, 0.0, 0.0,
-  7.071067811865475e-01, 0.0, -7.071067811865475e-01,
-  5.773502691896258e-01, -5.773502691896258e-01, 5.773502691896258e-01,
-  7.071067811865475e-01, -7.071067811865475e-01, 0.0,
-  5.773502691896258e-01, -5.773502691896258e-01, -5.773502691896258e-01,
-  0.0,   7.071067811865475e-01, 7.071067811865475e-01,
-  0.0, 1.0, 0.0,
-  0.0, 7.071067811865475e-01, -7.071067811865475e-01,
-  0.0, 0.0, 1.0,
-};
-
-const char axis_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
-                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-/* Map shift vector to sortlist. */
-const int axis_sid[27] = {
-  /* ( -1 , -1 , -1 ) */ 0,
-  /* ( -1 , -1 ,  0 ) */ 1,
-  /* ( -1 , -1 ,  1 ) */ 2,
-  /* ( -1 ,  0 , -1 ) */ 3,
-  /* ( -1 ,  0 ,  0 ) */ 4,
-  /* ( -1 ,  0 ,  1 ) */ 5,
-  /* ( -1 ,  1 , -1 ) */ 6,
-  /* ( -1 ,  1 ,  0 ) */ 7,
-  /* ( -1 ,  1 ,  1 ) */ 8,
-  /* (  0 , -1 , -1 ) */ 9,
-  /* (  0 , -1 ,  0 ) */ 10,
-  /* (  0 , -1 ,  1 ) */ 11,
-  /* (  0 ,  0 , -1 ) */ 12,
-  /* (  0 ,  0 ,  0 ) */ 0,
-  /* (  0 ,  0 ,  1 ) */ 12,
-  /* (  0 ,  1 , -1 ) */ 11,
-  /* (  0 ,  1 ,  0 ) */ 10,
-  /* (  0 ,  1 ,  1 ) */ 9,
-  /* (  1 , -1 , -1 ) */ 8,
-  /* (  1 , -1 ,  0 ) */ 7,
-  /* (  1 , -1 ,  1 ) */ 6,
-  /* (  1 ,  0 , -1 ) */ 5,
-  /* (  1 ,  0 ,  0 ) */ 4,
-  /* (  1 ,  0 ,  1 ) */ 3,
-  /* (  1 ,  1 , -1 ) */ 2,
-  /* (  1 ,  1 ,  0 ) */ 1,
-  /* (  1 ,  1 ,  1 ) */ 0
-};
-
-/* Some forward declarations. */
-void comp_com(struct cell *c);
-
-/**
- * @brief Sort the entries in ascending order using QuickSort.
- *
- * @param sort The indices
- * @param N The number of entries.
- */
-void indices_sort(struct index *sort, int N) {
-
-  struct {
-    short int lo, hi;
-  } qstack[10];
-  int qpos, i, j, lo, hi, imin;
-  struct index temp;
-  float pivot;
-
-  /* Sort parts in cell_i in decreasing order with quicksort */
-  qstack[0].lo = 0;
-  qstack[0].hi = N - 1;
-  qpos = 0;
-  while (qpos >= 0) {
-    lo = qstack[qpos].lo;
-    hi = qstack[qpos].hi;
-    qpos -= 1;
-    if (hi - lo < 15) {
-      for (i = lo; i < hi; i++) {
-        imin = i;
-        for (j = i + 1; j <= hi; j++)
-          if (sort[j].d < sort[imin].d) imin = j;
-        if (imin != i) {
-          temp = sort[imin];
-          sort[imin] = sort[i];
-          sort[i] = temp;
-        }
-      }
-    } else {
-      pivot = sort[(lo + hi) / 2].d;
-      i = lo;
-      j = hi;
-      while (i <= j) {
-        while (sort[i].d < pivot) i++;
-        while (sort[j].d > pivot) j--;
-        if (i <= j) {
-          if (i < j) {
-            temp = sort[i];
-            sort[i] = sort[j];
-            sort[j] = temp;
-          }
-          i += 1;
-          j -= 1;
-        }
-      }
-      if (j > (lo + hi) / 2) {
-        if (lo < j) {
-          qpos += 1;
-          qstack[qpos].lo = lo;
-          qstack[qpos].hi = j;
-        }
-        if (i < hi) {
-          qpos += 1;
-          qstack[qpos].lo = i;
-          qstack[qpos].hi = hi;
-        }
-      } else {
-        if (i < hi) {
-          qpos += 1;
-          qstack[qpos].lo = i;
-          qstack[qpos].hi = hi;
-        }
-        if (lo < j) {
-          qpos += 1;
-          qstack[qpos].lo = lo;
-          qstack[qpos].hi = j;
-        }
-      }
-    }
-  }
-}
-
-/**
- * @brief Sort the particles in the given cell along the given axis.
- *
- * @param c The #cell.
- * @param axis The normalized axis along which to sort.
- * @param aid The axis ID at which to store the indices.
- */
-void cell_sort(struct cell *c, const float *axis, int aid) {
-
-  /* Has the indices array even been allocated? */
-  if (c->indices == NULL) {
-    if ((c->indices = (struct index *)malloc((c->count + 1) * 13 *
-                                             sizeof(struct index))) ==
-        NULL)
-      error("Failed to allocate cell sorting indices.");
-  } else if (c->sorted & (1 << aid)) {
-    return;
-  }
-
-  /* If this cell has been split, merge from the progeny. */
-  if (c->split) {
-
-    /* Heap of pointers to the progeny's indices. */
-    struct {
-      struct index *index;
-      int offset;
-    } temp, pindex[8];
-    int k = 0;
-
-    /* First, make sure all the progeny have been sorted, and get the pointers
-       to their first entries. */
-    for (struct cell *cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
-      if (!(cp->sorted & (1 << aid))) cell_sort(cp, axis, aid);
-      pindex[k].index = &cp->indices[(cp->count + 1) * aid];
-      pindex[k].offset = cp->parts - c->parts;
-      k++;
-    }
-
-    /* Fill any remaining gaps with the sentinel. */
-    c->indices[(c->count + 1) * aid + c->count].d = FLT_MAX;
-    for (; k < 8; k++)
-      pindex[k].index = &c->indices[(c->count + 1) * aid + c->count];
-
-    /* Heapify the pindices. */
-    for (k = 3; k >= 0; k--) {
-      int j = k;
-      while (j < 4) {
-        int jj = 2 * j + 1;
-        if (jj + 1 < 8 && pindex[jj + 1].index->d < pindex[jj].index->d)
-          jj = jj + 1;
-        if (pindex[jj].index->d < pindex[j].index->d) {
-          temp = pindex[jj];
-          pindex[jj] = pindex[j];
-          pindex[j] = temp;
-          j = jj;
-        } else
-          break;
-      }
-    }
-
-    /* Copy the indices into the local index in increasing order. */
-    struct index *dest = &c->indices[(c->count + 1) * aid];
-    for (int k = 0; k < c->count; k++) {
-
-      /* Copy the top of the heap to the destination. */
-      *dest = *pindex[0].index;
-      dest->ind += pindex[0].offset;
-
-      /* Increase the pointers. */
-      dest++;
-      pindex[0].index++;
-
-      /* Fix the heap. */
-      int j = 0;
-      while (j < 4) {
-        int jj = 2 * j + 1;
-        if (jj + 1 < 8 && pindex[jj + 1].index->d < pindex[jj].index->d)
-          jj = jj + 1;
-        if (pindex[jj].index->d < pindex[j].index->d) {
-          temp = pindex[jj];
-          pindex[jj] = pindex[j];
-          pindex[j] = temp;
-          j = jj;
-        } else
-          break;
-      }
-    }
-
-    /* Add a sentinel at the end. */
-    dest->ind = -1;
-    dest->d = FLT_MAX;
-
-    /* Otherwise, just sort the entries. */
-  } else {
-
-    /* Fill the indices. */
-    struct index *dest = &c->indices[(c->count + 1) * aid];
-    for (int k = 0; k < c->count; k++) {
-      dest[k].ind = k;
-      dest[k].d = c->parts[k].x[0] * axis[0] + c->parts[k].x[1] * axis[1] +
-                  c->parts[k].x[2] * axis[2];
-    }
-
-    /* Sort the indices. */
-    indices_sort(&c->indices[(c->count + 1) * aid], c->count);
-
-    /* Set the sentinel on the last entry. */
-    dest[c->count].ind = -1;
-    dest[c->count].d = FLT_MAX;
-  }
-
-  /* Mark this cell as sorted in the given direction. */
-  c->sorted |= (1 << aid);
-
-  /* Verify the sort. */
-  /* int offset = (c->count + 1) * aid;
-  for (int k = 0; k < c->count; k++)
-    if (c->indices[offset + k].d > c->indices[offset + k + 1].d)
-      error( "Sorting failed." ); */
-} /*  */
-
-/**
- * @brief Get all the data needed for computing a sorted pair.
- *
- * @param ci Pointer to a pointer to the first cell.
- * @param cj Pointer to a pointer to the second cell.
- * @param ind_i Sorted indices of the cell @c ci.
- * @param ind_j Sorted indices of the cell @c cj.
- */
-void get_axis(struct cell **ci, struct cell **cj, struct index **ind_i,
-              struct index **ind_j) {
-
-  float axis[3];
-  float dx[3];
-  int aid = 0;
-
-  /* Get the cell pair separation and the axis index. */
-  for (int k = 0; k < 3; k++) {
-    dx[k] = (*cj)->loc[k] - (*ci)->loc[k];
-    aid = 3 * aid + ((dx[k] < 0) ? 0 : (dx[k] > 0) ? 2 : 1);
-  }
-
-  /* Flip the cells? */
-  if (axis_flip[aid]) {
-    struct cell *temp = *ci;
-    *ci = *cj;
-    *cj = temp;
-  }
-  aid = axis_sid[aid];
-
-  /* Copy the shift vector to the output parameter. */
-  axis[0] = axis_shift[aid * 3 + 0];
-  axis[1] = axis_shift[aid * 3 + 1];
-  axis[2] = axis_shift[aid * 3 + 2];
-
-  /* Make sure the cells are sorted. */
-  if (!((*ci)->sorted & (1 << aid))) cell_sort(*ci, axis, aid);
-  if (!((*cj)->sorted & (1 << aid))) cell_sort(*cj, axis, aid);
-
-  /* Set the indices. */
-  *ind_i = &(*ci)->indices[aid * ((*ci)->count + 1)];
-  *ind_j = &(*cj)->indices[aid * ((*cj)->count + 1)];
-
-
-  /* Make sure the sorts are ok. */
-  /* for (int k = 1; k < (*ci)->count; k++)
-    if ((*ind_i)[k].d < (*ind_i)[k-1].d)
-      error("Sorting failed.");
-  for (int k = 1; k < (*cj)->count; k++)
-    if ((*ind_j)[k].d < (*ind_j)[k-1].d)
-      error("Sorting failed."); */
-}
-
 /**
  * @brief Get a #cell from the pool.
  */
@@ -566,6 +155,56 @@ struct cell *cell_get() {
   return res;
 }
 
+/**
+ * @brief Compute the center of mass of a given cell.
+ *
+ * @param c The #cell.
+ */
+void comp_com(struct cell *c) {
+
+  int k, count = c->count;
+  struct cell *cp;
+  struct part *parts = c->parts;
+  double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
+
+  if (c->split) {
+
+    /* Loop over the projenitors and collect the multipole information. */
+    for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
+      float cp_mass = cp->new.mass;
+      com[0] += cp->new.com[0] * cp_mass;
+      com[1] += cp->new.com[1] * cp_mass;
+      com[2] += cp->new.com[2] * cp_mass;
+      mass += cp_mass;
+    }
+
+    /* Otherwise, collect the multipole from the particles. */
+  } else {
+
+    for (k = 0; k < count; k++) {
+      float p_mass = parts[k].mass;
+      com[0] += parts[k].x[0] * p_mass;
+      com[1] += parts[k].x[1] * p_mass;
+      com[2] += parts[k].x[2] * p_mass;
+      mass += p_mass;
+    }
+  }
+
+  /* Store the COM data, if any was collected. */
+  if (mass > 0.0) {
+    float imass = 1.0f / mass;
+    c->new.com[0] = com[0] * imass;
+    c->new.com[1] = com[1] * imass;
+    c->new.com[2] = com[2] * imass;
+    c->new.mass = mass;
+  } else {
+    c->new.com[0] = 0.0;
+    c->new.com[1] = 0.0;
+    c->new.com[2] = 0.0;
+    c->new.mass = 0.0;
+  }
+}
+
 /**
  * @brief Sort the parts into eight bins along the given pivots and
  *        fill the multipoles. Also adds the hierarchical resources
@@ -724,8 +363,15 @@ void cell_split(struct cell *c, struct qsched *s) {
 #endif
 
     /* Otherwise, we're at a leaf, so create the cell's particle-cell task. */
-  }
-
+  } else {
+    struct cell *data[2] = {root, c};
+    int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
+                             2 * sizeof(struct cell *), 1);
+    qsched_addlock(s, tid, c->res);
+#ifdef COM_AS_TASK
+    qsched_addunlock(s, root->com_tid, tid);
+#endif
+  } /* does the cell need to be split? */
 
 /* Compute the cell's center of mass. */
 #ifndef COM_AS_TASK
@@ -741,63 +387,14 @@ void cell_split(struct cell *c, struct qsched *s) {
 /* New tree walk */
 /* -------------------------------------------------------------------------- */
 
-/**
- * @brief Compute the center of mass of a given cell.
- *
- * @param c The #cell.
- */
-void comp_com(struct cell *c) {
-
-  int k, count = c->count;
-  struct cell *cp;
-  struct part *parts = c->parts;
-  double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
-
-  if (c->split) {
-
-    /* Loop over the projenitors and collect the multipole information. */
-    for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
-      float cp_mass = cp->new.mass;
-      com[0] += cp->new.com[0] * cp_mass;
-      com[1] += cp->new.com[1] * cp_mass;
-      com[2] += cp->new.com[2] * cp_mass;
-      mass += cp_mass;
-    }
-
-    /* Otherwise, collect the multipole from the particles. */
-  } else {
-
-    for (k = 0; k < count; k++) {
-      float p_mass = parts[k].mass;
-      com[0] += parts[k].x[0] * p_mass;
-      com[1] += parts[k].x[1] * p_mass;
-      com[2] += parts[k].x[2] * p_mass;
-      mass += p_mass;
-    }
-  }
-
-  /* Store the COM data, if any was collected. */
-  if (mass > 0.0) {
-    float imass = 1.0f / mass;
-    c->new.com[0] = com[0] * imass;
-    c->new.com[1] = com[1] * imass;
-    c->new.com[2] = com[2] * imass;
-    c->new.mass = mass;
-  } else {
-    c->new.com[0] = 0.0;
-    c->new.com[1] = 0.0;
-    c->new.com[2] = 0.0;
-    c->new.mass = 0.0;
-  }
-}
-
 /**
  * @brief Interacts all particles in ci with the monopole in cj
  */
-static inline void make_interact_pc(struct cell *leaf, struct cell *cj) {
+static inline void make_interact_pc(struct part_local *parts, int count,
+                                    double *loc, struct cell *cj) {
 
   int j, k;
-  double com[3] = {0.0, 0.0, 0.0};
+  float com[3] = {0.0, 0.0, 0.0};
   float mcom, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
 
 #ifdef SANITY_CHECKS
@@ -819,35 +416,22 @@ static inline void make_interact_pc(struct cell *leaf, struct cell *cj) {
 #endif
 
   /* Init the com's data. */
-  for (k = 0; k < 3; k++) com[k] = cj->new.com[k];
+  for (k = 0; k < 3; k++) com[k] = cj->new.com[k] - loc[k];
   mcom = cj->new.mass;
 
   /* Loop over every particle in leaf. */
-  for (j = 0; j < leaf->count; j++) {
+  for (j = 0; j < count; j++) {
 
     /* Compute the pairwise distance. */
     for (r2 = 0.0, k = 0; k < 3; k++) {
-      dx[k] = com[k] - leaf->parts[j].x[k];
+      dx[k] = com[k] - parts[j].x[k];
       r2 += dx[k] * dx[k];
     }
 
     /* Apply the gravitational acceleration. */
     ir = 1.0f / sqrtf(r2);
     w = mcom * const_G * ir * ir * ir;
-    for (k = 0; k < 3; k++) leaf->parts[j].a[k] += w * dx[k];
-
-#if ICHECK >= 0
-    if (leaf->parts[j].id == ICHECK)
-      printf("[DEBUG] cell [%f,%f,%f] interacting with cj->loc=[%f,%f,%f] "
-             "m=%f h=%f\n",
-             leaf->loc[0], leaf->loc[1], leaf->loc[2], cj->loc[0], cj->loc[1],
-             cj->loc[2], mcom, cj->h);
-
-    if (leaf->parts[j].id == ICHECK)
-      printf("[NEW] Interaction with monopole a=( %e %e %e ) h= %f Nj= %d m_j= "
-             "%f\n",
-             w * dx[0], w * dx[1], w * dx[2], cj->h, cj->count, mcom);
-#endif
+    for (k = 0; k < 3; k++) parts[j].a[k] += w * dx[k];
 
   } /* loop over every particle. */
 }
@@ -865,21 +449,19 @@ static inline int is_inside(struct cell *leaf, struct cell *c) {
 static inline int are_neighbours_different_size(struct cell *ci,
                                                 struct cell *cj) {
 
-  int k;
-  float dx[3];
   double cih = ci->h, cjh = cj->h;
 
   /* Maximum allowed distance */
   float min_dist = 0.5 * (cih + cjh);
 
   /* (Manhattan) Distance between the cells */
-  for (k = 0; k < 3; k++) {
+  for (int k = 0; k < 3; k++) {
     float center_i = ci->loc[k] + 0.5 * cih;
     float center_j = cj->loc[k] + 0.5 * cjh;
-    dx[k] = fabs(center_i - center_j);
+    if (fabsf(center_i - center_j) > min_dist) return 0;
   }
 
-  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+  return 1;
 }
 
 /**
@@ -888,9 +470,6 @@ static inline int are_neighbours_different_size(struct cell *ci,
  */
 static inline int are_neighbours(struct cell *ci, struct cell *cj) {
 
-  int k;
-  float dx[3];
-
 #ifdef SANITY_CHECKS
   if (ci->h != cj->h)
     error(" Cells of different size in distance calculation.");
@@ -900,138 +479,155 @@ static inline int are_neighbours(struct cell *ci, struct cell *cj) {
   float min_dist = ci->h;
 
   /* (Manhattan) Distance between the cells */
-  for (k = 0; k < 3; k++) {
+  for (int k = 0; k < 3; k++) {
     float center_i = ci->loc[k];
     float center_j = cj->loc[k];
-    dx[k] = fabs(center_i - center_j);
+    if (fabsf(center_i - center_j) > min_dist) return 0;
   }
 
-  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+  return 1;
 }
 
-/* /\** */
-/*  * @brief Compute the interactions between all particles in a cell leaf */
-/*  *        and the center of mass of all the cells in a part of the tree */
-/* * described by ci and cj */
-/*  * */
-/*  * @param ci The #cell containing the particle */
-/*  * @param cj The #cell containing the center of mass. */
-/*  *\/ */
-/* static inline void iact_pair_pc(struct cell *ci, struct cell *cj, */
-/*                                 struct cell *leaf) { */
-
-/*   struct cell *cp, *cps; */
+/**
+ * @brief Compute the interactions between all particles in a cell leaf
+ *        and the center of mass of all the cells in a part of the tree
+ * described by ci and cj
+ *
+ * @param ci The #cell containing the particle
+ * @param cj The #cell containing the center of mass.
+ */
+static inline void iact_pair_pc(struct cell *ci, struct cell *cj,
+                                struct cell *leaf, struct part_local *parts,
+                                int count, double *loc) {
 
-/* #ifdef SANITY_CHECKS */
+  struct cell *cp, *cps;
 
-/*   /\* Early abort? *\/ */
-/*   if (ci->count == 0 || cj->count == 0) error("Empty cell !"); */
+#ifdef SANITY_CHECKS
 
-/*   /\* Sanity check *\/ */
-/*   if (ci == cj) */
-/*     error("The impossible has happened: pair interaction between a cell and " */
-/*           "itself."); */
+  /* Early abort? */
+  if (ci->count == 0 || cj->count == 0) error("Empty cell !");
 
-/*   /\* Sanity check *\/ */
-/*   if (!is_inside(leaf, ci)) */
-/*     error("The impossible has happened: The leaf is not within ci"); */
+  /* Sanity check */
+  if (ci == cj)
+    error("The impossible has happened: pair interaction between a cell and "
+          "itself.");
 
-/*   /\* Are the cells direct neighbours? *\/ */
-/*   if (!are_neighbours(ci, cj)) error("Cells are not neighours"); */
+  /* Sanity check */
+  if (!is_inside(leaf, ci))
+    error("The impossible has happened: The leaf is not within ci");
 
-/*   /\* Are both cells split ? *\/ */
-/*   if (!ci->split || !cj->split) error("One of the cells is not split !"); */
-/* #endif */
+  /* Are the cells direct neighbours? */
+  if (!are_neighbours(ci, cj)) error("Cells are not neighours");
 
-/*   /\* Let's find in which subcell of ci the leaf is *\/ */
-/*   for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { */
+  /* Are both cells split ? */
+  if (!ci->split || !cj->split) error("One of the cells is not split !");
+#endif
 
-/*     if (is_inside(leaf, cp)) break; */
-/*   } */
+  /* Let's find in which subcell of ci the leaf is */
+  for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
 
-/*   if (are_neighbours_different_size(cp, cj)) { */
+    if (is_inside(leaf, cp)) break;
+  }
 
-/*     /\* Now interact this subcell with all subcells of cj *\/ */
-/*     for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { */
+  if (are_neighbours_different_size(cp, cj)) {
 
-/*       /\* Check whether we have to recurse or can directly jump to the multipole */
-/*        * calculation *\/ */
-/*       if (are_neighbours(cp, cps)) { */
+    /* Now interact this subcell with all subcells of cj */
+    for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
 
-/*         /\* We only recurse if the children are split *\/ */
-/*         if (cp->split && cps->split) { */
-/*           iact_pair_pc(cp, cps, leaf); */
-/*         } */
+      /* Check whether we have to recurse or can directly jump to the multipole
+       * calculation */
+      if (are_neighbours(cp, cps)) {
 
-/*       } else { */
-/*         make_interact_pc(leaf, cps); */
-/*       } */
-/*     } */
-/*   } else { */
+        /* We only recurse if the children are split */
+        if (cp->split && cps->split) {
+          iact_pair_pc(cp, cps, leaf, parts, count, loc);
+        }
 
-/*     /\* If cp is not a neoghbour of cj, we can directly interact with the */
-/*      * multipoles *\/ */
-/*     for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { */
+      } else {
+        make_interact_pc(parts, count, loc, cps);
+      }
+    }
+  } else {
 
-/*       make_interact_pc(leaf, cps); */
-/*     } */
-/*   } */
-/* } */
+    /* If cp is not a neoghbour of cj, we can directly interact with the
+     * multipoles */
+    for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
 
-/* /\** */
-/*  * @brief Compute the interactions between all particles in a leaf and */
-/*  *        and all the monopoles in the cell c */
-/*  * */
-/*  * @param c The #cell containing the monopoles */
-/*  * @param leaf The #cell containing the particles */
-/*  *\/ */
-/* static inline void iact_self_pc(struct cell *c, struct cell *leaf) { */
+      make_interact_pc(parts, count, loc, cps);
+    }
+  }
+}
 
-/*   struct cell *cp, *cps; */
+/**
+ * @brief Compute the interactions between all particles in a leaf and
+ *        and all the monopoles in the cell c
+ *
+ * @param c The #cell containing the monopoles
+ * @param leaf The #cell containing the particles
+ */
+static inline void iact_self_pc(struct cell *c, struct cell *leaf,
+                                struct part_local *parts) {
 
-/* #ifdef SANITY_CHECKS */
+  struct cell *cp, *cps;
+  int collect_part_data = 0;
 
-/*   /\* Early abort? *\/ */
-/*   if (c->count == 0) error("Empty cell !"); */
+#ifdef SANITY_CHECKS
 
-/*   if (!c->split) error("Cell is not split !"); */
+  /* Early abort? */
+  if (c->count == 0) error("Empty cell !");
 
-/* #endif */
+  if (!c->split) error("Cell is not split !");
 
-/*   /\* Find in which subcell of c the leaf is *\/ */
-/*   for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { */
+#endif
 
-/*     /\* Only recurse if the leaf is in this part of the tree *\/ */
-/*     if (is_inside(leaf, cp)) break; */
-/*   } */
+  /* Get local copies of the particle data. */
+  if (parts == NULL) {
+    int count = leaf->count;
+    if ((parts =
+             (struct part_local *)malloc(sizeof(struct part_local) * count)) ==
+        NULL)
+      error("Failed to allocate local parts.");
+    for (int k = 0; k < count; k++) {
+      for (int j = 0; j < 3; j++) {
+        parts[k].x[j] = leaf->parts[k].x[j] - leaf->loc[j];
+        parts[k].a[j] = 0.0f;
+      }
+      parts[k].mass = leaf->parts[k].mass;
+    }
+    collect_part_data = 1;
+  }
 
-/*   if (cp->split) { */
+  /* Find in which subcell of c the leaf is */
+  for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
 
-/*     /\* Recurse if the cell can be split *\/ */
-/*     iact_self_pc(cp, leaf); */
+    /* Only recurse if the leaf is in this part of the tree */
+    if (is_inside(leaf, cp)) break;
+  }
 
-/*     /\* Now, interact with every other subcell *\/ */
-/*     for (cps = c->firstchild; cps != c->sibling; cps = cps->sibling) { */
+  if (cp->split) {
 
-/*       /\* Since cp and cps will be direct neighbours it is only worth recursing */
-/*        *\/ */
-/*       /\* if the cells can both be split *\/ */
-/*       if (cp != cps && cps->split) iact_pair_pc(cp, cps, leaf); */
-/*     } */
-/*   } */
-/* } */
+    /* Recurse if the cell can be split */
+    iact_self_pc(cp, leaf, parts);
 
-/* /\** */
-/*  * @brief Starts the recursive tree walk of a given leaf */
-/*  *\/ */
-/* void init_multipole_walk(struct cell *root, struct cell *leaf) { */
+    /* Now, interact with every other subcell */
+    for (cps = c->firstchild; cps != c->sibling; cps = cps->sibling) {
 
-/*   /\* Pre-fetch the leaf's particles *\/ */
-/*   __builtin_prefetch(leaf->parts, 1, 3); */
+      /* Since cp and cps will be direct neighbours it is only worth recursing
+       */
+      /* if the cells can both be split */
+      if (cp != cps && cps->split)
+        iact_pair_pc(cp, cps, leaf, parts, leaf->count, leaf->loc);
+    }
+  }
 
-/*   /\* Start the recursion *\/ */
-/*   iact_self_pc(root, leaf); */
-/* } */
+  /* Clean up local parts? */
+  if (collect_part_data) {
+    for (int k = 0; k < leaf->count; k++) {
+      for (int j = 0; j < 3; j++) leaf->parts[k].a[j] += parts[k].a[j];
+    }
+    free(parts);
+  }
+}
 
 /**
  * @brief Compute the interactions between all particles in a cell
@@ -1040,13 +636,13 @@ static inline int are_neighbours(struct cell *ci, struct cell *cj) {
  * @param ci The #cell containing the particles.
  * @param cj The #cell containing the other particles
  */
-static inline void iact_pair_direct_unsorted(struct cell *ci, struct cell *cj) {
+static inline void iact_pair_direct_dp(struct cell *ci, struct cell *cj) {
 
   int i, j, k;
   int count_i = ci->count, count_j = cj->count;
-  struct part *parts_i = ci->parts, *parts_j = cj->parts;
   double xi[3];
   float dx[3], ai[3], mi, mj, r2, w, ir;
+  struct part *parts_i = ci->parts, *parts_j = cj->parts;
 
 #ifdef SANITY_CHECKS
 
@@ -1109,26 +705,15 @@ static inline void iact_pair_direct_unsorted(struct cell *ci, struct cell *cj) {
   } /* loop over all particles. */
 }
 
-
-
-
-
-
-static inline void iact_pair_direct_sorted_multipole(struct cell *ci,
-						     struct cell *cj) {
-
-  struct part_local {
-    float x[3];
-    float a[3];
-    float mass, d;
-  };
+static inline void iact_pair_direct(struct cell *ci, struct cell *cj) {
 
   int i, j, k;
-  int count_i, count_j;
-  struct part_local *parts_i, *parts_j;
-  float cjh = cj->h;
+  int count_i = ci->count, count_j = cj->count;
   float xi[3];
   float dx[3], ai[3], mi, mj, r2, w, ir;
+  double com[3];
+  struct part_local parts_j[count_j];
+  struct part *parts_i = ci->parts;
 
 #ifdef SANITY_CHECKS
 
@@ -1138,75 +723,32 @@ static inline void iact_pair_direct_sorted_multipole(struct cell *ci,
 
 #endif
 
-  /* Get the sorted indices and stuff. */
-  struct index *ind_i, *ind_j;
-  struct multipole multi;
-  get_axis(&ci, &cj, &ind_i, &ind_j);
-  multipole_reset(&multi);
-  cjh = cj->h;
-
-  /* Allocate and fill-in the local parts. */
-  count_i = ci->count;
-  count_j = cj->count;
-  if ((parts_i =
-           (struct part_local *)alloca(sizeof(struct part_local) *count_i)) ==
-          NULL ||
-      (parts_j =
-           (struct part_local *)alloca(sizeof(struct part_local) * count_j)) ==
-          NULL)
-    error("Failed to allocate local part arrays.");
-  for (i = 0; i < count_i; i++) {
-    int pid = ind_i[i].ind;
-    parts_i[i].d = ind_i[i].d;
-    for (k = 0; k < 3; k++) {
-      parts_i[i].x[k] = ci->parts[pid].x[k] - cj->loc[k];
-      parts_i[i].a[k] = 0.0f;
-    }
-    parts_i[i].mass = ci->parts[pid].mass;
+  /* Find the center point of the interaction. */
+  for (k = 0; k < 3; k++) {
+    com[k] = 0.5 * (ci->loc[k] + cj->loc[k]);
   }
-  for (j = 0; j < count_j; j++) {
-    int pjd = ind_j[j].ind;
-    parts_j[j].d = ind_j[j].d;
-    for (k = 0; k < 3; k++) {
-      parts_j[j].x[k] = cj->parts[pjd].x[k] - cj->loc[k];
-      parts_j[j].a[k] = 0.0f;
+
+  /* Init the local copies of the particles. */
+  for (k = 0; k < count_j; k++) {
+    for (j = 0; j < 3; j++) {
+      parts_j[k].x[j] = cj->parts[k].x[j] - com[j];
+      parts_j[k].a[j] = 0.0f;
     }
-    parts_j[j].mass = cj->parts[pjd].mass;
+    parts_j[k].mass = cj->parts[k].mass;
   }
 
-#if ICHECK >= 0
-  for (k = 0; k < count_i; k++)
-    if (parts_i[k].id == ICHECK)
-      message("[DEBUG] interacting cells loc=[%f,%f,%f], h=%f and "
-              "loc=[%f,%f,%f], h=%f.",
-              ci->loc[0], ci->loc[1], ci->loc[2], ci->h, cj->loc[0], cj->loc[1],
-              cj->loc[2], cj->h);
-  for (k = 0; k < count_j; k++)
-    if (parts_j[k].id == ICHECK)
-      message("[DEBUG] interacting cells loc=[%f,%f,%f], h=%f and "
-              "loc=[%f,%f,%f], h=%f.",
-              cj->loc[0], cj->loc[1], cj->loc[2], cj->h, ci->loc[0], ci->loc[1],
-              ci->loc[2], ci->h);
-#endif
-
-  /* Distance along the axis as of which we will use a multipole. */
-  float d_max = dist_cutoff_ratio * cjh;
-
   /* Loop over all particles in ci... */
-  for (i = count_i - 1; i >= 0; i--) {
-
-    /* Get a local copy of the distance along the axis. */
-    float di = parts_i[i].d;
+  for (i = 0; i < count_i; i++) {
 
     /* Init the ith particle's data. */
     for (k = 0; k < 3; k++) {
-      xi[k] = parts_i[i].x[k];
-      ai[k] = 0.0;
+      xi[k] = parts_i[i].x[k] - com[k];
+      ai[k] = 0.0f;
     }
     mi = parts_i[i].mass;
 
-    /* Loop over every following particle within d_max along the axis. */
-    for (j = 0; j < count_j && (parts_j[j].d - di) < d_max; j++) {
+    /* Loop over every particle in the other cell. */
+    for (j = 0; j < count_j; j++) {
 
       /* Compute the pairwise distance. */
       for (r2 = 0.0, k = 0; k < 3; k++) {
@@ -1224,103 +766,38 @@ static inline void iact_pair_direct_sorted_multipole(struct cell *ci,
         ai[k] -= wdx * mj;
       }
 
-#if ICHECK >= 0
-      if (parts_i[i].id == ICHECK)
-        message("Interaction with part %d - d= %f", parts_j[j].id, sqrt(r2));
-#endif
-
 #ifdef COUNTERS
-      ++count_direct_sorted_pp;
+      ++count_direct_unsorted;
 #endif
 
 #if ICHECK >= 0 && 0
       if (parts_i[i].id == ICHECK)
-        printf("[NEW] Interaction with particle id= %d (pair i)\n",
-               parts_j[pjd].id);
+        printf(
+            "[NEW] Interaction with particle id= %d (pair i) a=( %e %e %e )\n",
+            parts_j[j].id, -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]);
 
       if (parts_j[j].id == ICHECK)
-        printf("[NEW] Interaction with particle id= %d (pair j) h_i= %f h_j= "
-               "%f ci= %p cj= %p count_i= %d count_j= %d d_i= %d d_j= %d\n",
-               parts_i[pid].id, ci->h, cj->h, ci, cj, count_i, count_j, ci->res,
-               cj->res);
+        printf(
+            "[NEW] Interaction with particle id= %d (pair j) a=( %e %e %e )\n",
+            parts_i[i].id, mj * w * dx[0], mj * w * dx[1], mj * w * dx[2]);
 #endif
 
     } /* loop over every other particle. */
 
-    /* Add any remaining particles to the COM. */
-    for (int jj = j; jj < count_j; jj++) {
-      multipole_add(&multi, parts_j[jj].x, parts_j[jj].mass);
-    }
-
-    /* Shrink count_j to the latest valid particle. */
-    count_j = j;
-
-    /* Interact part_i with the center of mass. */
-    multipole_iact(&multi, xi, ai);
-
-#if ICHECK >= 0
-      if (parts_i[i].id == ICHECK)
-        message("Interaction with multipole");  //, parts_j[j].id );
-#endif
-
-#ifdef COUNTERS
-      ++count_direct_sorted_pm_i;
-#endif
-
-
     /* Store the accumulated acceleration on the ith part. */
     for (k = 0; k < 3; k++) parts_i[i].a[k] += ai[k];
 
-  } /* loop over all particles in ci. */
-
-
-
-  /* Re-init some values. */
-  count_j = cj->count;
-  int last_i = 0;
-  multipole_reset(&multi);
-
-  /* Loop over the particles in cj, catch the COM interactions. */
-  for (j = 0; j < count_j; j++) {
-
-    /* Get the sorted index. */
-    float dj = parts_j[j].d;
-
-    /* Fill the COM with any new particles. */
-    for (i = last_i; i < count_i && (dj - parts_i[i].d) > d_max; i++) {
-      multipole_add(&multi, parts_i[i].x, parts_i[i].mass);
-    }
-
-    /* Set the new last_i to the last particle checked. */
-    last_i = i;
-
-    /* Interact part_j with the COM. */
-    multipole_iact(&multi, parts_j[j].x, parts_j[j].a);
-
-#ifdef COUNTERS
-      ++count_direct_sorted_pm_j;
-#endif
-    
-  }
+  } /* loop over all particles. */
 
-  /* Copy the accelerations back to the original particles. */
-  for (i = 0; i < count_i; i++) {
-    int pid = ind_i[i].ind;
-    for (k = 0; k < 3; k++) ci->parts[pid].a[k] += parts_i[i].a[k];
-  }
-  for (j = 0; j < count_j; j++) {
-    int pjd = ind_j[j].ind;
-    for (k = 0; k < 3; k++) cj->parts[pjd].a[k] += parts_j[j].a[k];
+  /* Copy the local particle data back. */
+  for (k = 0; k < count_j; k++) {
+    for (j = 0; j < 3; j++) cj->parts[k].a[j] = parts_j[k].a[j];
   }
 }
 
-
-
-
-
 /**
  * @brief Decides whether two cells use the direct summation interaction or the
-* multipole interactions
+ * multipole interactions
  *
  * @param ci The #cell.
  * @param cj The other #cell.
@@ -1372,12 +849,12 @@ void iact_pair(struct cell *ci, struct cell *cj) {
  *
  * @param c The #cell.
  */
-void iact_self_direct(struct cell *c) {
+void iact_self_direct_dp(struct cell *c) {
   int i, j, k, count = c->count;
   double xi[3] = {0.0, 0.0, 0.0};
   float ai[3] = {0.0, 0.0, 0.0}, mi, mj, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
-  struct part *parts = c->parts;
   struct cell *cp, *cps;
+  struct part *parts = c->parts;
 
 #ifdef SANITY_CHECKS
 
@@ -1390,7 +867,7 @@ void iact_self_direct(struct cell *c) {
      each of its siblings. */
   if (c->split) {
     for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
-      iact_self_direct(cp);
+      iact_self_direct_dp(cp);
       for (cps = cp->sibling; cps != c->sibling; cps = cps->sibling)
         iact_pair(cp, cps);
     }
@@ -1447,6 +924,98 @@ void iact_self_direct(struct cell *c) {
   } /* otherwise, compute interactions directly. */
 }
 
+void iact_self_direct(struct cell *c) {
+  int i, j, k, count = c->count;
+  float xi[3] = {0.0, 0.0, 0.0};
+  float ai[3] = {0.0, 0.0, 0.0}, mi, mj, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
+  struct cell *cp, *cps;
+
+#ifdef SANITY_CHECKS
+
+  /* Early abort? */
+  if (count == 0) error("Empty cell !");
+
+#endif
+
+  /* If the cell is split, interact each progeny with itself, and with
+     each of its siblings. */
+  if (c->split) {
+    for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
+      iact_self_direct(cp);
+      for (cps = cp->sibling; cps != c->sibling; cps = cps->sibling)
+        iact_pair(cp, cps);
+    }
+
+    /* Otherwise, compute the interactions directly. */
+  } else {
+
+    /* Init the local copies of the particles. */
+    double loc[3];
+    struct part_local parts[count];
+    loc[0] = c->loc[0];
+    loc[1] = c->loc[1];
+    loc[2] = c->loc[2];
+    for (k = 0; k < count; k++) {
+      for (j = 0; j < 3; j++) {
+        parts[k].x[j] = c->parts[k].x[j] - loc[j];
+        parts[k].a[j] = 0.0f;
+      }
+      parts[k].mass = c->parts[k].mass;
+    }
+
+    /* Loop over all particles... */
+    for (i = 0; i < count; i++) {
+
+      /* Init the ith particle's data. */
+      for (k = 0; k < 3; k++) {
+        xi[k] = parts[i].x[k];
+        ai[k] = 0.0;
+      }
+      mi = parts[i].mass;
+
+      /* Loop over every following particle. */
+      for (j = i + 1; j < count; j++) {
+
+        /* Compute the pairwise distance. */
+        for (r2 = 0.0, k = 0; k < 3; k++) {
+          dx[k] = xi[k] - parts[j].x[k];
+          r2 += dx[k] * dx[k];
+        }
+
+        /* Apply the gravitational acceleration. */
+        ir = 1.0f / sqrtf(r2);
+        w = const_G * ir * ir * ir;
+        mj = parts[j].mass;
+        for (k = 0; k < 3; k++) {
+          float wdx = w * dx[k];
+          parts[j].a[k] += wdx * mi;
+          ai[k] -= wdx * mj;
+        }
+
+#if ICHECK >= 0
+        if (c->parts[i].id == ICHECK)
+          message("[NEW] Interaction with particle id= %d (self i)",
+                  c->parts[j].id);
+
+        if (c->parts[j].id == ICHECK)
+          message("[NEW] Interaction with particle id= %d (self j)",
+                  c->parts[i].id);
+#endif
+
+      } /* loop over every other particle. */
+
+      /* Store the accumulated acceleration on the ith part. */
+      for (k = 0; k < 3; k++) parts[i].a[k] += ai[k];
+
+    } /* loop over all particles. */
+
+    /* Copy the local particle data back. */
+    for (k = 0; k < count; k++) {
+      for (j = 0; j < 3; j++) c->parts[k].a[j] += parts[k].a[j];
+    }
+  } /* otherwise, compute interactions directly. */
+}
+
 /**
  * @brief Create the tasks for the cell pair/self.
  *
@@ -1543,8 +1112,6 @@ void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
   }   /* Otherwise it's a pair */
 }
 
-
-
 /* -------------------------------------------------------------------------- */
 /* Legacy tree walk */
 /* -------------------------------------------------------------------------- */
@@ -1827,12 +1394,9 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
       case task_type_pair:
         iact_pair(d[0], d[1]);
         break;
-      case task_type_pair_direct:
-        iact_pair_direct(d[0], d[1]);
+      case task_type_self_pc:
+        iact_self_pc(d[0], d[1], NULL);
         break;
-      /* case task_type_self_pc: */
-      /*   init_multipole_walk(d[0], d[1]); */
-      /*   break; */
       case task_type_com:
         comp_com(d[0]);
         break;
@@ -1847,7 +1411,7 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
   for (k = 0; k < task_type_count; k++) task_timers[k] = 0;
 
   /* Initialize the scheduler. */
-  qsched_init(&s, nr_threads, qsched_flag_noreown);
+  qsched_init(&s, nr_threads, qsched_flag_noreown | qsched_flag_pthread);
 
   /* Init and fill the particle array. */
   if ((parts = (struct part *)malloc(sizeof(struct part) * N)) == NULL)
@@ -1966,8 +1530,8 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
 
 #if ICHECK >= 0
   message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK,
-          root->parts[ICHECK].a[0], root->parts[ICHECK].a[1],
-          root->parts[ICHECK].a[2]);
+          root->parts[ICHECK].a_legacy[0], root->parts[ICHECK].a_legacy[1],
+          root->parts[ICHECK].a_legacy[2]);
 #endif
   printf("task counts: [ %8s %8s %8s %8s %8s ]\n", "self", "pair", "m-poles",
          "direct", "CoMs");
@@ -1986,15 +1550,6 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
       parts[i].a[2] = 0.0;
     }
 
-    struct cell *finger = root;
-    while (finger != NULL) {
-      finger->sorted = 0;
-      if (finger->split)
-        finger = finger->firstchild;
-      else
-        finger = finger->sibling;
-    }
-
     /* Execute the tasks. */
     tic = getticks();
     qsched_run(&s, nr_threads, runner);
@@ -2052,390 +1607,19 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
   free(parts);
 }
 
-/* All 26 configurations for the cell tests */
-const float cell_shift[27 * 3] = {1.0, 1.0, 1.0,    /* 0 */
-                                  1.0, 1.0, 0.0,    /* 1 */
-                                  1.0, 1.0, -1.0,   /* 2 */
-                                  1.0, 0.0, 1.0,    /* 3 */
-                                  1.0, 0.0, 0.0,    /* 4 */
-                                  1.0, 0.0, -1.0,   /* 5 */
-                                  1.0, -1.0, 1.0,   /* 6 */
-                                  1.0, -1.0, 0.0,   /* 7 */
-                                  1.0, -1.0, -1.0,  /* 8 */
-                                  0.0, 1.0, 1.0,    /* 9 */
-                                  0.0, 1.0, 0.0,    /* 10 */
-                                  0.0, 1.0, -1.0,   /* 11 */
-                                  0.0, 0.0, 1.0,    /* 12 */
-                                  -1.0, -1.0, -1.0, /* 13 */
-                                  -1.0, -1.0, 0.0,  /* 14 */
-                                  -1.0, -1.0, 1.0,  /* 15 */
-                                  -1.0, 0.0, -1.0,  /* 16 */
-                                  -1.0, 0.0, 0.0,   /* 17 */
-                                  -1.0, 0.0, 1.0,   /* 18 */
-                                  -1.0, 1.0, -1.0,  /* 19 */
-                                  -1.0, 1.0, 0.0,   /* 20 */
-                                  -1.0, 1.0, 1.0,   /* 21 */
-                                  0.0, -1.0, -1.0,  /* 22 */
-                                  0.0, -1.0, 0.0,   /* 23 */
-                                  0.0, -1.0, 1.0,   /* 24 */
-                                  0.0, 0.0, -1.0,   /* 25 */
-                                  0.0, 0.0,
-                                  0.0 /* 26 */ /* <-- The cell itself */
-};
-
-/**
- * @brief Creates two neighbouring cells wiht N_parts per celland makes them
- * interact using both the
- * sorted and unsortde interactions. Outputs then the tow sets of accelerations
- * for accuracy tests.
- * @param N_parts Number of particles in each cell
- * @param orientation Orientation of the cells ( 0 <= orientation < 26 )
- */
-void test_direct_neighbour(int N_parts, int orientation) {
-
-  int k;
-  struct part *parts;
-  struct cell left, right;
-
-  if (orientation >= 26) error("Wrong orientation !");
-
-  /* Select configuration */
-  float shift[3];
-  for (k = 0; k < 3; ++k) shift[k] = cell_shift[3 * orientation + k];
-
-  /* Init and fill the particle array. */
-  if ((parts = (struct part *)malloc(sizeof(struct part) * N_parts * 2)) ==
-      NULL)
-    error("Failed to allocate particle buffer.");
-
-  /* Create random set of particles in both cells */
-  for (k = 0; k < 2 * N_parts; k++) {
-    parts[k].id = k;
-
-    parts[k].x[0] = ((double)rand()) / RAND_MAX;
-    parts[k].x[1] = ((double)rand()) / RAND_MAX;
-    parts[k].x[2] = ((double)rand()) / RAND_MAX;
-
-    /* Shift the second cell */
-    if (k >= N_parts) {
-      parts[k].x[0] += shift[0];
-      parts[k].x[1] += shift[1];
-      parts[k].x[2] += shift[2];
-    }
-
-    parts[k].mass = ((double)rand()) / RAND_MAX;
-    parts[k].a[0] = 0.0;
-    parts[k].a[1] = 0.0;
-    parts[k].a[2] = 0.0;
-  }
-
-  /* Get the cell geometry right */
-  left.loc[0] = 0.;
-  left.loc[1] = 0.;
-  left.loc[2] = 0.;
-  left.h = 1.;
-
-  right.loc[0] = shift[0];
-  right.loc[1] = shift[1];
-  right.loc[2] = shift[2];
-  right.h = 1.;
-
-  /* Put the particles in the cell */
-  left.parts = parts;
-  left.count = N_parts;
-  right.parts = parts + N_parts;
-  right.count = N_parts;
-
-  /* Get the linked list right (functions should not recurse but hey...) */
-  left.firstchild = NULL;
-  left.sibling = NULL;
-  left.split = 0;
-  right.firstchild = NULL;
-  right.sibling = NULL;
-  right.split = 0;
-
-  /* Get the multipoles right (should also be useless) */
-  comp_com(&left);
-  comp_com(&right);
-
-  /* Nothing has been sorted yet */
-  left.indices = NULL;
-  left.sorted = 0;
-  right.indices = NULL;
-  right.sorted = 0;
-
-#ifdef COUNTERS
-  count_direct_unsorted = 0;
-  count_direct_sorted_pp = 0;
-  count_direct_sorted_pm_i = 0;
-  count_direct_sorted_pm_j = 0;
-#endif
-
-  /* Do the interactions without sorting */
-  iact_pair_direct_unsorted(&left, &right);
-
-  message("Unsorted interactions done ");
-
-  /* Store accelerations */
-  for (k = 0; k < 2 * N_parts; k++) {
-    parts[k].a_exact[0] = parts[k].a[0];
-    parts[k].a_exact[1] = parts[k].a[1];
-    parts[k].a_exact[2] = parts[k].a[2];
-    parts[k].a[0] = 0.0;
-    parts[k].a[1] = 0.0;
-    parts[k].a[2] = 0.0;
-  }
-
-  /* Do the interactions with sorting */
-  iact_pair_direct(&left, &right);
-
-  message("Sorted interactions done ");
-
-  for (k = 0; k < 2 * N_parts; ++k) {
-    if (parts[k].id == ICHECK) {
-
-      message("Accel exact : [ %e %e %e ]", parts[k].a_exact[0],
-              parts[k].a_exact[1], parts[k].a_exact[2]);
-      message("Accel sorted: [ %e %e %e ]", parts[k].a[0], parts[k].a[1],
-              parts[k].a[2]);
-    }
-  }
-
-  /* Position along the axis */
-  double *position, *ortho_dist;
-  if ((position = (double *)malloc(sizeof(double) * N_parts * 2)) == NULL)
-    error("Failed to allocate position buffer.");
-  if ((ortho_dist = (double *)malloc(sizeof(double) * N_parts * 2)) == NULL)
-    error("Failed to allocate orthogonal distance buffer.");
-
-  float w = 1.0f / sqrtf(shift[0]*shift[0] + shift[1]*shift[1] + shift[2]*shift[2]);
-  shift[0] *= w;
-  shift[1] *= w;
-  shift[2] *= w;
-
-  for (k = 0; k < 2 * N_parts; ++k) {
-    float dx = parts[k].x[0] * shift[0];
-    float dy = parts[k].x[1] * shift[1];
-    float dz = parts[k].x[2] * shift[2];
-
-    position[k] =  dx + dy + dz;
-
-    dx = (parts[k].x[1] - 0.5f)*shift[2] - (parts[k].x[2] - 0.5f)*shift[1];
-    dy = (parts[k].x[2] - 0.5f)*shift[0] - (parts[k].x[0] - 0.5f)*shift[2];
-    dz = (parts[k].x[0] - 0.5f)*shift[1] - (parts[k].x[1] - 0.5f)*shift[0];
-
-    ortho_dist[k] = sqrtf(dx*dx + dy*dy + dz*dz);
-  }
-
-  /* Now, output everything */
-  char fileName[100];
-  sprintf(fileName, "interaction_dump_%d.dat", orientation);
-  message("Writing file '%s'", fileName);
-  FILE *file = fopen(fileName, "w");
-  fprintf(file,
-          "# ID m r x y z a_u.x   a_u.y    a_u.z    a_s.x    a_s.y    a_s.z    ortho\n");
-  for (k = 0; k < 2 * N_parts; k++) {
-    fprintf(file, "%d %e %e %e %e %e %e %e %e %e %e %e %e\n", parts[k].id,
-            parts[k].mass, position[k], parts[k].x[0], parts[k].x[1],
-            parts[k].x[2], parts[k].a_exact[0], parts[k].a_exact[1],
-            parts[k].a_exact[2], parts[k].a[0], parts[k].a[1], parts[k].a[2],
-	    ortho_dist[k]);
-  }
-  fclose(file);
-
-#ifdef COUNTERS
-  message("Unsorted intereactions:           %d", count_direct_unsorted);
-  message("Sorted intereactions PP:          %d", count_direct_sorted_pp);
-  message("Sorted intereactions PM (part i): %d", count_direct_sorted_pm_i);
-  message("Sorted intereactions PM (part j): %d", count_direct_sorted_pm_j);
-  message("Sorted intereactions total:       %d",
-          count_direct_sorted_pm_j + count_direct_sorted_pm_i +
-              count_direct_sorted_pp);
-// message( "%f %d %d %d %d\n", dist_cutoff_ratio, count_direct_unsorted,
-// count_direct_sorted_pp, count_direct_sorted_pm_i, count_direct_sorted_pm_j );
-#endif
-
-  /* Clean up */
-  free(parts);
-}
-
-/**
- * @brief Creates 27 cells and makes the particles in the central one
- * interact with the other cells
- * using both the sorted and unsorted interactions
- * Outputs then the two sets of accelerations for accuracy tests.
- * @param N_parts Number of particles in each cell
- * @param N_test Number of times the test has to be run
- */
-void test_all_direct_neighbours(int N_parts, int N_test) {
-
-  int i, k, j;
-  struct part *parts;
-  struct cell cells[27];
-
-  /* Init and fill the particle array. */
-  if ((parts = (struct part *)malloc(sizeof(struct part) * N_parts * 27)) ==
-      NULL)
-    error("Failed to allocate particle buffer.");
-
-#ifdef COUNTERS
-  count_direct_unsorted = 0;
-  count_direct_sorted_pp = 0;
-  count_direct_sorted_pm_i = 0;
-  count_direct_sorted_pm_j = 0;
-#endif
-  int countMultipoles = 0;
-  int countPairs = 0;
-  int countCoMs = 0;
-
-  /* Loop over the number of tests */
-  for (i = 0; i < N_test; ++i) {
-
-    /* Loop over the 27 cells */
-    for (j = 0; j < 27; ++j) {
-
-      float shift[3];
-      for (k = 0; k < 3; ++k) shift[k] = cell_shift[3 * j + k];
-
-      /* Create random set of particles in all cells */
-      for (k = 0; k < N_parts; k++) {
-        parts[j * N_parts + k].id = j * N_parts + k;
-
-        parts[j * N_parts + k].x[0] = ((double)rand()) / RAND_MAX + shift[0];
-        parts[j * N_parts + k].x[1] = ((double)rand()) / RAND_MAX + shift[1];
-        parts[j * N_parts + k].x[2] = ((double)rand()) / RAND_MAX + shift[2];
-
-        parts[j * N_parts + k].mass = ((double)rand()) / RAND_MAX;
-        parts[j * N_parts + k].a[0] = 0.0;
-        parts[j * N_parts + k].a[1] = 0.0;
-        parts[j * N_parts + k].a[2] = 0.0;
-        parts[j * N_parts + k].a_exact[0] = 0.0;
-        parts[j * N_parts + k].a_exact[1] = 0.0;
-        parts[j * N_parts + k].a_exact[2] = 0.0;
-        parts[j * N_parts + k].a_legacy[0] = 0.0;
-        parts[j * N_parts + k].a_legacy[1] = 0.0;
-        parts[j * N_parts + k].a_legacy[2] = 0.0;
-      }
-
-      /* Get the cell geometry right */
-      cells[j].loc[0] = shift[0];
-      cells[j].loc[1] = shift[1];
-      cells[j].loc[2] = shift[2];
-      cells[j].h = 1.;
-
-      /* Put the particles in the cell */
-      cells[j].parts = parts + N_parts * j;
-      cells[j].count = N_parts;
-
-      /* Get the linked list right (functions should not recurse but hey...) */
-      cells[j].firstchild = NULL;
-      cells[j].sibling = NULL;
-      cells[j].split = 0;
-
-      /* Get the multipoles right (should also be useless) */
-      comp_com(&cells[j]);
-
-      /* Nothing has been sorted yet */
-      cells[j].indices = NULL;
-      cells[j].sorted = 0;
-    }
-
-    /* Do the interactions without sorting */
-    for (j = 0; j < 26; ++j) iact_pair_direct_unsorted(&cells[26], &cells[j]);
-    iact_self_direct(&cells[26]);
-
-    // message("Unsorted interactions done ");
-
-    /* Store accelerations */
-    for (k = 0; k < 27 * N_parts; k++) {
-      parts[k].a_exact[0] = parts[k].a[0];
-      parts[k].a_exact[1] = parts[k].a[1];
-      parts[k].a_exact[2] = parts[k].a[2];
-      parts[k].a[0] = 0.0;
-      parts[k].a[1] = 0.0;
-      parts[k].a[2] = 0.0;
-    }
-
-    /* Do the interactions with sorting */
-    for (j = 0; j < 26; ++j) iact_pair_direct(&cells[26], &cells[j]);
-    iact_self_direct(&cells[26]);
-
-    // message("Sorted interactions done ");
-
-    /* Now, do a B-H calculation on the same set of particles */
-    struct cell *root = cell_get();
-    root->loc[0] = -1.0;
-    root->loc[1] = -1.0;
-    root->loc[2] = -1.0;
-    root->h = 3.0;
-    root->count = 27 * N_parts;
-    root->parts = parts;
-    struct qsched s;
-    qsched_init(&s, 1, qsched_flag_noreown);
-    cell_split(root, &s);
-    legacy_tree_walk(27 * N_parts, parts, root, ICHECK, &countMultipoles,
-                     &countPairs, &countCoMs);
-
-    // free(root);
-    //++countCoMs;
-
-    /* Now, output everything */
-    char fileName[100];
-    sprintf(fileName, "interaction_dump_all.dat");
-    // message("Writing file '%s'", fileName);
-    FILE *file = fopen(fileName, (i == 0 ? "w" : "a"));
-    if (i == 0)
-      fprintf(file,
-              "# ID m x y z a_u.x   a_u.y    a_u.z    a_s.x    a_s.y    a_s.z"
-              "   a_bh.x   a_bh.y   a_bh.z\n");
-    for (k = 0; k < 27 * N_parts; k++) {
-      if (parts[k].id >= 26 * N_parts)
-        fprintf(file, "%d %e %e %e %e %e %e %e %e %e %e %e %e %e\n",
-                parts[k].id, parts[k].mass, parts[k].x[0], parts[k].x[1],
-                parts[k].x[2], parts[k].a_exact[0], parts[k].a_exact[1],
-                parts[k].a_exact[2], parts[k].a[0], parts[k].a[1],
-                parts[k].a[2], parts[k].a_legacy[0], parts[k].a_legacy[1],
-                parts[k].a_legacy[2]);
-    }
-    fclose(file);
-  }
-
-#ifdef COUNTERS
-  message("Unsorted intereactions:           %d", count_direct_unsorted);
-  message("B-H intereactions:   PP           %d", countPairs);
-  message("B-H intereactions:   PM           %d", countMultipoles);
-  message("B-H intereactions:   total        %d", countPairs + countMultipoles);
-  message("Sorted intereactions PP:          %d", count_direct_sorted_pp);
-  message("Sorted intereactions PM (part i): %d", count_direct_sorted_pm_i);
-  message("Sorted intereactions PM (part j): %d", count_direct_sorted_pm_j);
-  message("Sorted intereactions total:       %d",
-          count_direct_sorted_pm_j + count_direct_sorted_pm_i +
-              count_direct_sorted_pp);
-// message( "%f %d %d %d %d\n", dist_cutoff_ratio, count_direct_unsorted,
-// count_direct_sorted_pp, count_direct_sorted_pm_i, count_direct_sorted_pm_j );
-#endif
-
-  /* Clean up */
-  free(parts);
-}
-
 /**
  * @brief Main function.
  */
 
 int main(int argc, char *argv[]) {
 
-  int c, k, nr_threads;
+  int c, nr_threads;
   int N = 1000, runs = 1;
   char fileName[100] = {0};
-  int N_parts = 0;
-  int N_iterations = 0;
 
   /* Die on FP-exceptions. */
   feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
-  srand(0);
-
 /* Get the number of threads. */
 #pragma omp parallel shared(nr_threads)
   {
@@ -2461,16 +1645,6 @@ int main(int argc, char *argv[]) {
         if (sscanf(optarg, "%s", &fileName[0]) != 1)
           error("Error parsing file name.");
         break;
-      case 'c':
-        if (sscanf(optarg, "%d", &N_parts) != 1)
-          error(
-              "Error parsing number of particles in neighbouring cells test.");
-        break;
-      case 'i':
-        if (sscanf(optarg, "%d", &N_iterations) != 1)
-          error(
-              "Error parsing number of particles in neighbouring cells test.");
-        break;
       case '?':
         fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] "
                         "[-c Nparts] [-i Niterations] \n",
@@ -2490,40 +1664,19 @@ int main(int argc, char *argv[]) {
   /* Part information */
   printf("Size of part: %zu bytes.\n", sizeof(struct part));
 
-  /* Run the neighbour direct integration test */
-  if (N_parts > 0) {
-
-    /* Dump arguments */
-    message("Interacting 2 neighbouring cells with %d particles per cell",
-            N_parts);
-
-    /* Run the test */
-    for (k = 0; k < 26; ++k) test_direct_neighbour(N_parts, k);
-
-    /* Dump arguments */
-    message("Interacting one cell with %d particles with its 27 neighbours"
-            " %d times to get the statistics.",
-            N_parts, N_iterations);
-
-    test_all_direct_neighbours(N_parts, N_iterations);
-
+  /* Dump arguments. */
+  if (fileName[0] == 0) {
+    message("Computing the N-body problem over %i random particles using %i "
+            "threads (%i runs).",
+            N, nr_threads, runs);
   } else {
-
-    /* Dump arguments. */
-    if (fileName[0] == 0) {
-      message("Computing the N-body problem over %i random particles using %i "
-              "threads (%i runs).",
-              N, nr_threads, runs);
-    } else {
-      message("Computing the N-body problem over %i particles read from '%s' "
-              "using %i threads (%i runs).",
-              N, fileName, nr_threads, runs);
-    }
-
-    /* Run the BH test. */
-    test_bh(N, nr_threads, runs, fileName);
+    message("Computing the N-body problem over %i particles read from '%s' "
+            "using %i threads (%i runs).",
+            N, fileName, nr_threads, runs);
   }
 
+  /* Run the BH test. */
+  test_bh(N, nr_threads, runs, fileName);
+
   return 0;
 }
-