diff --git a/examples/test_fmm.c b/examples/test_fmm.c
index 03ea2c5f93bb24a23915cac365bfa642cb870154..a897a8cd8c11e82579e4b73254702fff34c15cb6 100644
--- a/examples/test_fmm.c
+++ b/examples/test_fmm.c
@@ -58,7 +58,8 @@ struct part {
   //  };
   float mass;
   int id;
-};  // __attribute__((aligned(32)));
+};//  __attribute__((aligned(32)));
+
 
 struct part_local {
   float x[3];
@@ -94,10 +95,12 @@ struct cell {
     struct multipole new;
   };
 
-  int res, com_tid;
+  double a[3];
+
+  int res, com_tid, down_tid;
   struct index *indices;
 
-} __attribute__((aligned(128)));
+} __attribute__((aligned(32)));
 
 /** Task types. */
 enum task_type {
@@ -105,6 +108,7 @@ enum task_type {
   task_type_pair,
   task_type_pair_pc,
   task_type_com,
+  task_type_down,
   task_type_count
 };
 
@@ -203,8 +207,55 @@ void comp_com(struct cell *c) {
     c->new.com[2] = 0.0;
     c->new.mass = 0.0;
   }
+
+  /* Reset COM acceleration */
+  c->a[0] = 0.;
+  c->a[1] = 0.;
+  c->a[2] = 0.;
 }
 
+
+
+/**
+ * @brief Add a cell's acceleration to all its siblings
+ *
+ * @param c The #cell.
+ */
+void comp_down(struct cell *c) {
+
+  int k, count = c->count;
+  struct cell *cp;
+  struct part *parts = c->parts;
+
+  //  message("h=%f a= %f %f %f", c->h, c->a[0], c->a[1], c->a[2]);
+
+
+  if (c->split) {
+
+    //message("first= %p, sibling= %p", c->firstchild, c->sibling);
+    /* Loop over the siblings and propagate accelerations. */
+    for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
+      cp->a[0] += c->a[0];
+      cp->a[1] += c->a[1];
+      cp->a[2] += c->a[2];
+
+      /* Recurse */
+      //comp_down(cp);
+    }
+
+  } else {
+
+    /* Otherwise, propagate the accelerations to the siblings. */
+    for (k = 0; k < count; k++) {
+      parts[k].a[0] += c->a[0];
+      parts[k].a[1] += c->a[1];
+      parts[k].a[2] += c->a[2];
+    }
+  }
+
+}
+
+
 /**
  * @brief Sort the parts into eight bins along the given pivots and
  *        fill the multipoles. Also adds the hierarchical resources
@@ -234,13 +285,19 @@ void cell_split(struct cell *c, struct qsched *s) {
   if (c->res == qsched_res_none)
     c->res = qsched_addres(s, qsched_owner_none, qsched_res_none);
 
-/* Attach a center-of-mass task to the cell. */
+/* Attach a center-of-mass and downward pass task to the cell. */
+  if (count > 0) {
+
 #ifdef COM_AS_TASK
-  if (count > 0)
     c->com_tid = qsched_addtask(s, task_type_com, task_flag_none, &c,
                                 sizeof(struct cell *), 1);
 #endif
 
+    c->down_tid = qsched_addtask(s, task_type_down, task_flag_none, &c,
+                                sizeof(struct cell *), 1);
+    qsched_addlock(s, c->down_tid, c->res);
+  }
+
   /* Does this cell need to be split? */
   if (count > cell_maxparts) {
 
@@ -355,13 +412,21 @@ void cell_split(struct cell *c, struct qsched *s) {
     for (k = 0; k < 8; k++)
       if (progenitors[k]->count > 0) cell_split(progenitors[k], s);
 
-/* Link the COM tasks. */
+    /* Link the COM tasks. */
 #ifdef COM_AS_TASK
     for (k = 0; k < 8; k++)
       if (progenitors[k]->count > 0)
         qsched_addunlock(s, progenitors[k]->com_tid, c->com_tid);
 #endif
 
+    /* Link the downward tasks. */
+#ifdef COM_AS_TASK
+    for (k = 0; k < 8; k++)
+      if (progenitors[k]->count > 0)
+        qsched_addunlock(s, c->down_tid, progenitors[k]->down_tid);
+#endif
+
+
   } /* does the cell need to be split? */
 
 
@@ -408,58 +473,58 @@ static inline int are_neighbours(struct cell *ci, struct cell *cj) {
 
 
 
-/**
- * @brief Interacts all count particles in parts
- * with the monopole in cj
- *
- * @param parts An array of particles
- * @param count The number of particles in the array
- * @param cj The cell with which the particles will interact
- */
-static inline void make_interact_pc(struct part *parts, int count, struct cell *cj) {
+/* /\** */
+/*  * @brief Interacts all count particles in parts */
+/*  * with the monopole in cj */
+/*  * */
+/*  * @param parts An array of particles */
+/*  * @param count The number of particles in the array */
+/*  * @param cj The cell with which the particles will interact */
+/*  *\/ */
+/* static inline void make_interact_pc(struct part *parts, int count, struct cell *cj) { */
 
-  int j, k;
-  float com[3] = {0.0, 0.0, 0.0};
-  float mcom, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
+/*   int j, k; */
+/*   float com[3] = {0.0, 0.0, 0.0}; */
+/*   float mcom, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w; */
 
-#ifdef SANITY_CHECKS
+/* #ifdef SANITY_CHECKS */
 
-  /* Sanity checks */
-  if (count == 0) error("Empty cell!");
+/*   /\* Sanity checks *\/ */
+/*   if (count == 0) error("Empty cell!"); */
 
-  /* Sanity check. */
-  if (cj->new.mass == 0.0) {
-    message("%e %e %e %d %p %d %p", cj->new.com[0], cj->new.com[1],
-            cj->new.com[2], cj->count, cj, cj->split, cj->sibling);
+/*   /\* Sanity check. *\/ */
+/*   if (cj->new.mass == 0.0) { */
+/*     message("%e %e %e %d %p %d %p", cj->new.com[0], cj->new.com[1], */
+/*             cj->new.com[2], cj->count, cj, cj->split, cj->sibling); */
 
-    for (j = 0; j < cj->count; ++j)
-      message("part %d mass=%e id=%d", j, cj->parts[j].mass, cj->parts[j].id);
+/*     for (j = 0; j < cj->count; ++j) */
+/*       message("part %d mass=%e id=%d", j, cj->parts[j].mass, cj->parts[j].id); */
 
-    error("com does not seem to have been set.");
-  }
+/*     error("com does not seem to have been set."); */
+/*   } */
 
-#endif
+/* #endif */
 
-  /* Init the com's data. */
-  for (k = 0; k < 3; k++) com[k] = cj->new.com[k];
-  mcom = cj->new.mass;
+/*   /\* Init the com's data. *\/ */
+/*   for (k = 0; k < 3; k++) com[k] = cj->new.com[k]; */
+/*   mcom = cj->new.mass; */
 
-  /* Loop over every particle in leaf. */
-  for (j = 0; j < count; j++) {
+/*   /\* Loop over every particle in leaf. *\/ */
+/*   for (j = 0; j < count; j++) { */
 
-    /* Compute the pairwise distance. */
-    for (r2 = 0.0, k = 0; k < 3; k++) {
-      dx[k] = com[k] - parts[j].x[k];
-      r2 += dx[k] * dx[k];
-    }
+/*     /\* Compute the pairwise distance. *\/ */
+/*     for (r2 = 0.0, k = 0; k < 3; k++) { */
+/*       dx[k] = com[k] - parts[j].x[k]; */
+/*       r2 += dx[k] * dx[k]; */
+/*     } */
 
-    /* Apply the gravitational acceleration. */
-    ir = 1.0f / sqrtf(r2);
-    w = mcom * const_G * ir * ir * ir;
-    for (k = 0; k < 3; k++) parts[j].a[k] += w * dx[k];
+/*     /\* Apply the gravitational acceleration. *\/ */
+/*     ir = 1.0f / sqrtf(r2); */
+/*     w = mcom * const_G * ir * ir * ir; */
+/*     for (k = 0; k < 3; k++) parts[j].a[k] += w * dx[k]; */
 
-  } /* loop over every particle. */
-}
+/*   } /\* loop over every particle. *\/ */
+/* } */
 
 
 
@@ -474,6 +539,8 @@ static inline void make_interact_pc(struct part *parts, int count, struct cell *
 static inline void iact_pair_pc(struct cell *ci, struct cell *cj) {
 
   struct cell *cp, *cps;
+  int k;
+  float r2, w, ir, dx[3];
   
 #ifdef SANITY_CHECKS
   if (ci->h != cj->h)
@@ -489,8 +556,21 @@ static inline void iact_pair_pc(struct cell *ci, struct cell *cj) {
 
       if ( ! are_neighbours(cp, cps) ) {
 
-	make_interact_pc(cp->parts,  cp->count, cps);
-	make_interact_pc(cps->parts, cps->count, cp );
+
+
+	/* Compute the pairwise distance. */
+	for (r2 = 0.0, k = 0; k < 3; k++) {
+	  dx[k] = cps->new.com[k] - cp->new.com[k];
+	  r2 += dx[k] * dx[k];
+	}
+	ir = 1.0f / sqrtf(r2);
+
+	/* Apply the gravitational acceleration. */
+	w = cps->new.mass * const_G * ir * ir * ir;
+	for (k = 0; k < 3; k++) cp->a[k] += w * dx[k];
+
+	w = cp->new.mass * const_G * ir * ir * ir;
+	for (k = 0; k < 3; k++) cps->a[k] -= w * dx[k];
 
       }
     }
@@ -1155,6 +1235,9 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
       case task_type_com:
         comp_com(d[0]);
         break;
+      case task_type_down:
+        comp_down(d[0]);
+        break;
       default:
         error("Unknown task type.");
     }
@@ -1227,7 +1310,7 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
   }
   message("Average number of parts per leaf is %f.", ((double)N) / nr_leaves);
 
-#if 1//ICHECK > 0
+#if 1 //ICHECK > 0
   printf("----------------------------------------------------------\n");
 
   /* Do a N^2 interactions calculation */
@@ -1283,12 +1366,12 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
 
   // fclose(fileTime);
 
-  printf("task counts: [ %10s %10s %10s %10s %10s ]\n", "self", "pair", "m-poles",
-         "direct", "CoMs");
-  printf("task counts: [ %10i %10i %10i %10i %10i ] (legacy).\n", 0, 0,
-         countMultipoles, countPairs, countCoMs);
-  printf("task counts: [ %10i %10i %10i %10i %10i ] (legacy).\n", counts[0],
-         counts[1], counts[2], 0,  counts[3]);
+  printf("task counts: [ %10s %10s %10s %10s %10s %10s ]\n", "self", "pair", "m-poles",
+         "direct", "CoMs", "Downpass");
+  printf("task counts: [ %10i %10i %10i %10i %10i %10i ] (legacy).\n", 0, 0,
+         countMultipoles, countPairs, countCoMs, 0);
+  printf("task counts: [ %10i %10i %10i %10i %10i %10i ] (new).\n", counts[0],
+         counts[1], counts[2], 0,  counts[3], counts[4]);
 
   /* Loop over the number of runs. */
   for (k = 0; k < runs; k++) {
@@ -1302,6 +1385,7 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
     /* Execute the tasks. */
     tic = getticks();
     qsched_run(&s, nr_threads, runner);
+    comp_down(root);
     toc_run = getticks();
     message("%ith run took %lli (= %e) ticks...", k, toc_run - tic,
             (float)(toc_run - tic));