diff --git a/examples/test_bh.c b/examples/test_bh.c
index 70897a1689c621663985bc3efe8207511b4596a8..46d50d86dc0c5cc17a9fda5bc15a4e81d4a155d2 100644
--- a/examples/test_bh.c
+++ b/examples/test_bh.c
@@ -51,14 +51,20 @@
 /** Data structure for the particles. */
 struct part {
   double x[3];
-  // union {
-  float a[3];
-  float a_legacy[3];
-  float a_exact[3];
-  // };
+  union {
+    float a[3];
+    float a_legacy[3];
+    float a_exact[3];
+  };
   float mass;
   int id;
-};  // __attribute__((aligned(64)));
+};  // __attribute__((aligned(32)));
+
+struct part_local {
+  float x[3];
+  float a[3];
+  float mass;
+} __attribute__((aligned(32)));
 
 struct multipole {
   double com[3];
@@ -384,12 +390,12 @@ void cell_split(struct cell *c, struct qsched *s) {
 /**
  * @brief Interacts all particles in ci with the monopole in cj
  */
-static inline void make_interact_pc(struct cell *leaf, struct cell *cj) {
+static inline void make_interact_pc(struct part_local *parts, int count,
+                                    double *loc, struct cell *cj) {
 
-  int j, k, count = leaf->count;
-  double com[3] = {0.0, 0.0, 0.0};
+  int j, k;
+  float com[3] = {0.0, 0.0, 0.0};
   float mcom, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
-  struct part *parts = leaf->parts;
 
 #ifdef SANITY_CHECKS
 
@@ -410,7 +416,7 @@ static inline void make_interact_pc(struct cell *leaf, struct cell *cj) {
 #endif
 
   /* Init the com's data. */
-  for (k = 0; k < 3; k++) com[k] = cj->new.com[k];
+  for (k = 0; k < 3; k++) com[k] = cj->new.com[k] - loc[k];
   mcom = cj->new.mass;
 
   /* Loop over every particle in leaf. */
@@ -456,21 +462,19 @@ static inline int is_inside(struct cell *leaf, struct cell *c) {
 static inline int are_neighbours_different_size(struct cell *ci,
                                                 struct cell *cj) {
 
-  int k;
-  float dx[3];
   double cih = ci->h, cjh = cj->h;
 
   /* Maximum allowed distance */
   float min_dist = 0.5 * (cih + cjh);
 
   /* (Manhattan) Distance between the cells */
-  for (k = 0; k < 3; k++) {
+  for (int k = 0; k < 3; k++) {
     float center_i = ci->loc[k] + 0.5 * cih;
     float center_j = cj->loc[k] + 0.5 * cjh;
-    dx[k] = fabs(center_i - center_j);
+    if (fabsf(center_i - center_j) > min_dist) return 0;
   }
 
-  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+  return 1;
 }
 
 /**
@@ -479,9 +483,6 @@ static inline int are_neighbours_different_size(struct cell *ci,
  */
 static inline int are_neighbours(struct cell *ci, struct cell *cj) {
 
-  int k;
-  float dx[3];
-
 #ifdef SANITY_CHECKS
   if (ci->h != cj->h)
     error(" Cells of different size in distance calculation.");
@@ -491,13 +492,13 @@ static inline int are_neighbours(struct cell *ci, struct cell *cj) {
   float min_dist = ci->h;
 
   /* (Manhattan) Distance between the cells */
-  for (k = 0; k < 3; k++) {
+  for (int k = 0; k < 3; k++) {
     float center_i = ci->loc[k];
     float center_j = cj->loc[k];
-    dx[k] = fabs(center_i - center_j);
+    if (fabsf(center_i - center_j) > min_dist) return 0;
   }
 
-  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+  return 1;
 }
 
 /**
@@ -509,7 +510,8 @@ static inline int are_neighbours(struct cell *ci, struct cell *cj) {
  * @param cj The #cell containing the center of mass.
  */
 static inline void iact_pair_pc(struct cell *ci, struct cell *cj,
-                                struct cell *leaf) {
+                                struct cell *leaf, struct part_local *parts,
+                                int count, double *loc) {
 
   struct cell *cp, *cps;
 
@@ -551,11 +553,11 @@ static inline void iact_pair_pc(struct cell *ci, struct cell *cj,
 
         /* We only recurse if the children are split */
         if (cp->split && cps->split) {
-          iact_pair_pc(cp, cps, leaf);
+          iact_pair_pc(cp, cps, leaf, parts, count, loc);
         }
 
       } else {
-        make_interact_pc(leaf, cps);
+        make_interact_pc(parts, count, loc, cps);
       }
     }
   } else {
@@ -564,7 +566,7 @@ static inline void iact_pair_pc(struct cell *ci, struct cell *cj,
      * multipoles */
     for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
 
-      make_interact_pc(leaf, cps);
+      make_interact_pc(parts, count, loc, cps);
     }
   }
 }
@@ -576,9 +578,11 @@ static inline void iact_pair_pc(struct cell *ci, struct cell *cj,
  * @param c The #cell containing the monopoles
  * @param leaf The #cell containing the particles
  */
-static inline void iact_self_pc(struct cell *c, struct cell *leaf) {
+static inline void iact_self_pc(struct cell *c, struct cell *leaf,
+                                struct part_local *parts) {
 
   struct cell *cp, *cps;
+  int collect_part_data = 0;
 
 #ifdef SANITY_CHECKS
 
@@ -589,6 +593,23 @@ static inline void iact_self_pc(struct cell *c, struct cell *leaf) {
 
 #endif
 
+  /* Get local copies of the particle data. */
+  if (parts == NULL) {
+    int count = leaf->count;
+    if ((parts =
+             (struct part_local *)malloc(sizeof(struct part_local) * count)) ==
+        NULL)
+      error("Failed to allocate local parts.");
+    for (int k = 0; k < count; k++) {
+      for (int j = 0; j < 3; j++) {
+        parts[k].x[j] = leaf->parts[k].x[j] - leaf->loc[j];
+        parts[k].a[j] = 0.0f;
+      }
+      parts[k].mass = leaf->parts[k].mass;
+    }
+    collect_part_data = 1;
+  }
+
   /* Find in which subcell of c the leaf is */
   for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
 
@@ -599,7 +620,7 @@ static inline void iact_self_pc(struct cell *c, struct cell *leaf) {
   if (cp->split) {
 
     /* Recurse if the cell can be split */
-    iact_self_pc(cp, leaf);
+    iact_self_pc(cp, leaf, parts);
 
     /* Now, interact with every other subcell */
     for (cps = c->firstchild; cps != c->sibling; cps = cps->sibling) {
@@ -607,8 +628,17 @@ static inline void iact_self_pc(struct cell *c, struct cell *leaf) {
       /* Since cp and cps will be direct neighbours it is only worth recursing
        */
       /* if the cells can both be split */
-      if (cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
+      if (cp != cps && cps->split)
+        iact_pair_pc(cp, cps, leaf, parts, leaf->count, leaf->loc);
+    }
+  }
+
+  /* Clean up local parts? */
+  if (collect_part_data) {
+    for (int k = 0; k < leaf->count; k++) {
+      for (int j = 0; j < 3; j++) leaf->parts[k].a[j] += parts[k].a[j];
     }
+    free(parts);
   }
 }
 
@@ -695,12 +725,8 @@ static inline void iact_pair_direct(struct cell *ci, struct cell *cj) {
   float xi[3];
   float dx[3], ai[3], mi, mj, r2, w, ir;
   double com[3];
-  struct part_local {
-    float x[3];
-    float a[3];
-    float mass;
-  } parts_i[count_i], parts_j[count_j] __attribute__((aligned(64)));;
-
+  struct part_local parts_j[count_j];
+  struct part *parts_i = ci->parts;
 
 #ifdef SANITY_CHECKS
 
@@ -716,13 +742,6 @@ static inline void iact_pair_direct(struct cell *ci, struct cell *cj) {
   }
 
   /* Init the local copies of the particles. */
-  for (k = 0; k < count_i; k++) {
-    for (j = 0; j < 3; j++) {
-      parts_i[k].x[j] = ci->parts[k].x[j] - com[j];
-      parts_i[k].a[j] = 0.0f;
-    }
-    parts_i[k].mass = ci->parts[k].mass;
-  }
   for (k = 0; k < count_j; k++) {
     for (j = 0; j < 3; j++) {
       parts_j[k].x[j] = cj->parts[k].x[j] - com[j];
@@ -730,13 +749,13 @@ static inline void iact_pair_direct(struct cell *ci, struct cell *cj) {
     }
     parts_j[k].mass = ci->parts[k].mass;
   }
-  
+
   /* Loop over all particles in ci... */
   for (i = 0; i < count_i; i++) {
 
     /* Init the ith particle's data. */
     for (k = 0; k < 3; k++) {
-      xi[k] = parts_i[i].x[k];
+      xi[k] = parts_i[i].x[k] - com[k];
       ai[k] = 0.0f;
     }
     mi = parts_i[i].mass;
@@ -782,15 +801,10 @@ static inline void iact_pair_direct(struct cell *ci, struct cell *cj) {
     for (k = 0; k < 3; k++) parts_i[i].a[k] += ai[k];
 
   } /* loop over all particles. */
-  
+
   /* Copy the local particle data back. */
-  for (k = 0; k < count_i; k++) {
-    for (j = 0; j < 3; j++)
-      ci->parts[k].a[j] = parts_i[k].a[j];
-  }
   for (k = 0; k < count_j; k++) {
-    for (j = 0; j < 3; j++)
-      cj->parts[k].a[j] = parts_j[k].a[j];
+    for (j = 0; j < 3; j++) cj->parts[k].a[j] = parts_j[k].a[j];
   }
 }
 
@@ -928,12 +942,6 @@ void iact_self_direct(struct cell *c) {
   float xi[3] = {0.0, 0.0, 0.0};
   float ai[3] = {0.0, 0.0, 0.0}, mi, mj, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
   struct cell *cp, *cps;
-  struct part_local {
-    float x[3];
-    float a[3];
-    float mass;
-  } parts[count] __attribute__((aligned(64)));;
-  double loc[3];
 
 #ifdef SANITY_CHECKS
 
@@ -955,7 +963,11 @@ void iact_self_direct(struct cell *c) {
   } else {
 
     /* Init the local copies of the particles. */
-    loc[0] = c->loc[0]; loc[1] = c->loc[1]; loc[2] = c->loc[2];
+    double loc[3];
+    struct part_local parts[count];
+    loc[0] = c->loc[0];
+    loc[1] = c->loc[1];
+    loc[2] = c->loc[2];
     for (k = 0; k < count; k++) {
       for (j = 0; j < 3; j++) {
         parts[k].x[j] = c->parts[k].x[j] - loc[j];
@@ -1012,8 +1024,7 @@ void iact_self_direct(struct cell *c) {
 
     /* Copy the local particle data back. */
     for (k = 0; k < count; k++) {
-      for (j = 0; j < 3; j++)
-        c->parts[k].a[j] = parts[k].a[j];
+      for (j = 0; j < 3; j++) c->parts[k].a[j] = parts[k].a[j];
     }
   } /* otherwise, compute interactions directly. */
 }
@@ -1397,7 +1408,7 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
         iact_pair(d[0], d[1]);
         break;
       case task_type_self_pc:
-        iact_self_pc(d[0], d[1]);
+        iact_self_pc(d[0], d[1], NULL);
         break;
       case task_type_com:
         comp_com(d[0]);
@@ -1413,7 +1424,7 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
   for (k = 0; k < task_type_count; k++) task_timers[k] = 0;
 
   /* Initialize the scheduler. */
-  qsched_init(&s, nr_threads, qsched_flag_noreown);
+  qsched_init(&s, nr_threads, qsched_flag_noreown | qsched_flag_pthread);
 
   /* Init and fill the particle array. */
   if ((parts = (struct part *)malloc(sizeof(struct part) * N)) == NULL)
@@ -1551,7 +1562,7 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
       parts[i].a[1] = 0.0;
       parts[i].a[2] = 0.0;
     }
-    
+
     /* Execute the tasks. */
     tic = getticks();
     qsched_run(&s, nr_threads, runner);