diff --git a/.gitignore b/.gitignore
index 4e2f82309eb0dbf56f8eab50d2df14b65865d858..7d6d9021f12ebfcb837d19c443362f1ecbc4077f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,6 +91,7 @@ theory/SPH/*.pdf
 theory/paper_pasc/pasc_paper.pdf
 theory/Multipoles/fmm.pdf
 theory/Multipoles/fmm_standalone.pdf
+theory/Multipoles/potential.pdf
 
 m4/libtool.m4
 m4/ltoptions.m4
diff --git a/README b/README
index 6e2b2cb2304aaa1790646de5316b7c08362779bf..2dedb32a04a7cf143c3e65560c45a68c0e5d1c2a 100644
--- a/README
+++ b/README
@@ -27,6 +27,7 @@ Valid options are:
   -f    {int} Overwrite the CPU frequency (Hz) to be used for time measurements.
   -g          Run with an external gravitational potential.
   -G          Run with self-gravity.
+  -M          Reconstruct the multipoles every time-step.
   -n    {int} Execute a fixed number of time steps. When unset use the time_end parameter to stop. 
   -s          Run with hydrodynamics.
   -S          Run with stars.
diff --git a/configure.ac b/configure.ac
index 6c0a06005778c499c53ae8e596a0685a78886542..8a2d0f30ae297993b34153bc9a4c04085f4748f5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -189,6 +189,18 @@ if test "$enable_task_debugging" = "yes"; then
    AC_DEFINE([SWIFT_DEBUG_TASKS],1,[Enable task debugging])
 fi
 
+# Check if the general timers are switched on.
+AC_ARG_ENABLE([timers],
+   [AS_HELP_STRING([--enable-timers],
+     [Activate the basic timers @<:@yes/no@:>@]
+   )],
+   [enable_timers="$enableval"],
+   [enable_timers="no"]
+)
+if test "$enable_timers" = "yes"; then
+   AC_DEFINE([SWIFT_USE_TIMERS],1,[Enable individual timers])
+fi
+
 # Check if expensive debugging is on.
 AC_ARG_ENABLE([debugging-checks],
    [AS_HELP_STRING([--enable-debugging-checks],
@@ -215,6 +227,21 @@ elif test "$gravity_force_checks" != "no"; then
    AC_DEFINE_UNQUOTED([SWIFT_GRAVITY_FORCE_CHECKS], [$enableval] ,[Enable gravity brute-force checks])
 fi
 
+# Check if we want to zero the gravity forces for all particles below some ID.
+AC_ARG_ENABLE([no-gravity-below-id],
+   [AS_HELP_STRING([--enable-no-gravity-below-id],
+     [Zeros the gravitational acceleration of all particles with an ID smaller than @<:@N@:>@]
+   )],
+   [no_gravity_below_id="$enableval"],
+   [no_gravity_below_id="no"]
+)
+if test "$no_gravity_below_id" == "yes"; then
+   AC_MSG_ERROR(Need to specify the ID below which particles get zero forces when using --enable-no-gravity-below-id!)
+elif test "$no_gravity_below_id" != "no"; then
+   AC_DEFINE_UNQUOTED([SWIFT_NO_GRAVITY_BELOW_ID], [$enableval] ,[Particles with smaller ID than this will have zero gravity forces])
+fi
+
+
 # Define HAVE_POSIX_MEMALIGN if it works.
 AX_FUNC_POSIX_MEMALIGN
 
@@ -854,12 +881,15 @@ AC_MSG_RESULT([
    Adiabatic index    : $with_gamma
    Riemann solver     : $with_riemann
    Cooling function   : $with_cooling
-   External potential : $with_potential
-   Multipole order    : $with_multipole_order
-
-   Task debugging     : $enable_task_debugging
-   Debugging checks   : $enable_debugging_checks
-   Gravity checks     : $gravity_force_checks
+   
+   External potential  : $with_potential
+   Multipole order     : $with_multipole_order
+   No gravity below ID : $no_gravity_below_id
+
+   Individual timers : $enable_timers
+   Task debugging    : $enable_task_debugging
+   Debugging checks  : $enable_debugging_checks
+   Gravity checks    : $gravity_force_checks
 ])
 
 # Make sure the latest git revision string gets included
diff --git a/examples/Makefile.am b/examples/Makefile.am
index dd13fb7eb4b82fbbfbb1ae450e20d01b13f2a455..1dd240fb6015fe5fdd2465cccb1bb221706efeed 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -16,7 +16,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 # Common flags
-MYFLAGS = -DTIMER
+MYFLAGS = 
 
 # Add the source directory and debug to CFLAGS
 AM_CFLAGS = -I$(top_srcdir)/src $(HDF5_CPPFLAGS)
diff --git a/examples/main.c b/examples/main.c
index 5f42f3e06ea46d0c3b73363956f470016e5d1498..631117148addd3ab7ad49ed2760855b793757870 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -82,6 +82,8 @@ void print_help_message() {
          "Run with an external gravitational potential.");
   printf("  %2s %8s %s\n", "-F", "", "Run with feedback.");
   printf("  %2s %8s %s\n", "-G", "", "Run with self-gravity.");
+  printf("  %2s %8s %s\n", "-M", "",
+         "Reconstruct the multipoles every time-step.");
   printf("  %2s %8s %s\n", "-n", "{int}",
          "Execute a fixed number of time steps. When unset use the time_end "
          "parameter to stop.");
@@ -164,6 +166,7 @@ int main(int argc, char *argv[]) {
   int with_stars = 0;
   int with_fp_exceptions = 0;
   int with_drift_all = 0;
+  int with_mpole_reconstruction = 0;
   int verbose = 0;
   int nr_threads = 1;
   int with_verbose_timers = 0;
@@ -172,7 +175,8 @@ int main(int argc, char *argv[]) {
 
   /* Parse the parameters */
   int c;
-  while ((c = getopt(argc, argv, "acCdDef:FgGhn:sSt:Tv:y:")) != -1) switch (c) {
+  while ((c = getopt(argc, argv, "acCdDef:FgGhMn:sSt:Tv:y:")) != -1)
+    switch (c) {
       case 'a':
         with_aff = 1;
         break;
@@ -210,6 +214,9 @@ int main(int argc, char *argv[]) {
       case 'h':
         if (myrank == 0) print_help_message();
         return 0;
+      case 'M':
+        with_mpole_reconstruction = 1;
+        break;
       case 'n':
         if (sscanf(optarg, "%d", &nsteps) != 1) {
           if (myrank == 0) printf("Error parsing fixed number of steps.\n");
@@ -521,6 +528,8 @@ int main(int argc, char *argv[]) {
   /* Construct the engine policy */
   int engine_policies = ENGINE_POLICY | engine_policy_steal;
   if (with_drift_all) engine_policies |= engine_policy_drift_all;
+  if (with_mpole_reconstruction)
+    engine_policies |= engine_policy_reconstruct_mpoles;
   if (with_hydro) engine_policies |= engine_policy_hydro;
   if (with_self_gravity) engine_policies |= engine_policy_self_gravity;
   if (with_external_gravity) engine_policies |= engine_policy_external_gravity;
@@ -601,35 +610,25 @@ int main(int argc, char *argv[]) {
   engine_dump_snapshot(&e);
 
   /* Legend */
-  if (myrank == 0) {
+  if (myrank == 0)
     printf("# %6s %14s %14s %10s %10s %10s %16s [%s]\n", "Step", "Time",
            "Time-step", "Updates", "g-Updates", "s-Updates", "Wall-clock time",
            clocks_getunit());
 
-    if (with_verbose_timers) {
-      printf("timers: ");
-      for (int k = 0; k < timer_count; k++) printf("%s\t", timers_names[k]);
-      printf("\n");
-    }
-  }
+  /* File for the timers */
+  if (with_verbose_timers) timers_open_file(myrank);
 
   /* Main simulation loop */
   for (int j = 0; !engine_is_done(&e) && e.step - 1 != nsteps; j++) {
 
     /* Reset timers */
-    timers_reset(timers_mask_all);
+    timers_reset_all();
 
     /* Take a step. */
     engine_step(&e);
 
     /* Print the timers. */
-    if (with_verbose_timers) {
-      printf("timers: ");
-      for (int k = 0; k < timer_count; k++)
-        printf("%.3f\t", clocks_from_ticks(timers[k]));
-      printf("\n");
-      timers_reset(0xFFFFFFFFllu);
-    }
+    if (with_verbose_timers) timers_print(e.step);
 
 #ifdef SWIFT_DEBUG_TASKS
     /* Dump the task data using the given frequency. */
@@ -735,6 +734,7 @@ int main(int argc, char *argv[]) {
 #endif
 
   /* Clean everything */
+  if (with_verbose_timers) timers_close_file();
   engine_clean(&e);
   free(params);
 
diff --git a/src/Makefile.am b/src/Makefile.am
index 1da7a0d955e488c0b96ee209080b5438356a36bc..7bec5327f4759fcf7d3e1af9d041677ffbc7ab55 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -16,7 +16,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 # Add the debug flag to the whole thing
-AM_CFLAGS = -DTIMER $(HDF5_CPPFLAGS)
+AM_CFLAGS = $(HDF5_CPPFLAGS)
 
 # Assign a "safe" version number
 AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0
@@ -46,7 +46,7 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
     hydro_properties.h riemann.h threadpool.h cooling.h cooling_struct.h sourceterms.h \
     sourceterms_struct.h statistics.h memswap.h cache.h runner_doiact_vec.h profiler.h \
     dump.h logger.h active.h timeline.h xmf.h gravity_properties.h gravity_derivatives.h \
-    vector_power.h collectgroup.h hydro_space.h sort_part.h
+    gravity_softened_derivatives.h vector_power.h collectgroup.h hydro_space.h sort_part.h
 
 # Common source files
 AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
diff --git a/src/atomic.h b/src/atomic.h
index be24f96e5a9d2e955132f0d6d34bdfa58bc1649c..4f180c643a304a801a9d70940fd12f9f193941e1 100644
--- a/src/atomic.h
+++ b/src/atomic.h
@@ -19,6 +19,9 @@
 #ifndef SWIFT_ATOMIC_H
 #define SWIFT_ATOMIC_H
 
+/* Config parameters. */
+#include "../config.h"
+
 /* Includes. */
 #include "inline.h"
 
diff --git a/src/cache.h b/src/cache.h
index 3ae092f269a8584ea34f15edbfb76bd92e698e30..638127b19a6a7063a474510e8b1471a45e8997d3 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -34,6 +34,7 @@
 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)
 #define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE
 
+#ifdef WITH_VECTORIZATION
 /* Cache struct to hold a local copy of a cells' particle
  * properties required for density/force calculations.*/
 struct cache {
@@ -178,10 +179,10 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
 
 #if defined(GADGET2_SPH)
 
-  /* Shift the particles positions to a local frame so single precision can be
-   * used instead of double precision. */
+/* Shift the particles positions to a local frame so single precision can be
+ * used instead of double precision. */
 #if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
+#pragma vector aligned
 #endif
   for (int i = 0; i < ci->count; i++) {
     ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
@@ -380,7 +381,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
  * used instead of double precision. Also shift the cell ci, particles positions
  * due to BCs but leave cell cj. */
 #if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
+#pragma vector aligned
 #endif
   for (int i = first_pi_align; i < ci->count; i++) {
     /* Make sure ci_cache is filled from the first element. */
@@ -397,14 +398,24 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
     ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
   }
 
-  /* Pad cache with fake particles that exist outside the cell so will not interact.*/
-  float fake_pix = 2.0f * ci_cache->x[ci->count - 1];
+  /* Pad cache with fake particles that exist outside the cell so will not
+   * interact.*/
+  float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0];
   for (int i = ci->count - first_pi_align;
-       i < ci->count - first_pi_align + VEC_SIZE; i++)
+       i < ci->count - first_pi_align + VEC_SIZE; i++) {
     ci_cache->x[i] = fake_pix;
+    ci_cache->y[i] = 1.f;
+    ci_cache->z[i] = 1.f;
+    ci_cache->h[i] = 1.f;
+
+    ci_cache->m[i] = 1.f;
+    ci_cache->vx[i] = 1.f;
+    ci_cache->vy[i] = 1.f;
+    ci_cache->vz[i] = 1.f;
+  }
 
 #if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
+#pragma vector aligned
 #endif
   for (int i = 0; i <= last_pj_align; i++) {
     idx = sort_j[i].i;
@@ -419,10 +430,20 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
     cj_cache->vz[i] = cj->parts[idx].v[2];
   }
 
-  /* Pad cache with fake particles that exist outside the cell so will not interact.*/
-  float fake_pjx = 2.0f * cj_cache->x[last_pj_align];
-  for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++)
+  /* Pad cache with fake particles that exist outside the cell so will not
+   * interact.*/
+  float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
+  for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
     cj_cache->x[i] = fake_pjx;
+    cj_cache->y[i] = 1.f;
+    cj_cache->z[i] = 1.f;
+    cj_cache->h[i] = 1.f;
+
+    cj_cache->m[i] = 1.f;
+    cj_cache->vx[i] = 1.f;
+    cj_cache->vy[i] = 1.f;
+    cj_cache->vz[i] = 1.f;
+  }
 }
 
 /* @brief Clean the memory allocated by a #cache object.
@@ -443,4 +464,6 @@ static INLINE void cache_clean(struct cache *c) {
   }
 }
 
+#endif /* WITH_VECTORIZATION */
+
 #endif /* SWIFT_CACHE_H */
diff --git a/src/cell.c b/src/cell.c
index 97ad8e051f0d484c5fc9d12de02898ac0c0cfe2f..aa5f3687ae02f1b88ffc7d351e42d4dfc1ff1468 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -129,6 +129,7 @@ int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) {
       temp->depth = c->depth + 1;
       temp->split = 0;
       temp->dx_max = 0.f;
+      temp->dx_max_sort = 0.f;
       temp->nodeID = c->nodeID;
       temp->parent = c;
       c->progeny[k] = temp;
@@ -1103,33 +1104,93 @@ void cell_reset_task_counters(struct cell *c) {
 }
 
 /**
- * @brief Checks whether the cells are direct neighbours ot not. Both cells have
- * to be of the same size
+ * @brief Recursively construct all the multipoles in a cell hierarchy.
  *
- * @param ci First #cell.
- * @param cj Second #cell.
- *
- * @todo Deal with periodicity.
+ * @param c The #cell.
+ * @param ti_current The current integer time.
  */
-int cell_are_neighbours(const struct cell *restrict ci,
-                        const struct cell *restrict cj) {
+void cell_make_multipoles(struct cell *c, integertime_t ti_current) {
 
-#ifdef SWIFT_DEBUG_CHECKS
-  if (ci->width[0] != cj->width[0]) error("Cells of different size !");
-#endif
+  /* Reset everything */
+  gravity_reset(c->multipole);
 
-  /* Maximum allowed distance */
-  const double min_dist =
-      1.2 * ci->width[0]; /* 1.2 accounts for rounding errors */
+  if (c->split) {
 
-  /* (Manhattan) Distance between the cells */
-  for (int k = 0; k < 3; k++) {
-    const double center_i = ci->loc[k];
-    const double center_j = cj->loc[k];
-    if (fabs(center_i - center_j) > min_dist) return 0;
+    /* Compute CoM of all progenies */
+    double CoM[3] = {0., 0., 0.};
+    double mass = 0.;
+
+    for (int k = 0; k < 8; ++k) {
+      if (c->progeny[k] != NULL) {
+        const struct gravity_tensors *m = c->progeny[k]->multipole;
+        CoM[0] += m->CoM[0] * m->m_pole.M_000;
+        CoM[1] += m->CoM[1] * m->m_pole.M_000;
+        CoM[2] += m->CoM[2] * m->m_pole.M_000;
+        mass += m->m_pole.M_000;
+      }
+    }
+    c->multipole->CoM[0] = CoM[0] / mass;
+    c->multipole->CoM[1] = CoM[1] / mass;
+    c->multipole->CoM[2] = CoM[2] / mass;
+
+    /* Now shift progeny multipoles and add them up */
+    struct multipole temp;
+    double r_max = 0.;
+    for (int k = 0; k < 8; ++k) {
+      if (c->progeny[k] != NULL) {
+        const struct cell *cp = c->progeny[k];
+        const struct multipole *m = &cp->multipole->m_pole;
+
+        /* Contribution to multipole */
+        gravity_M2M(&temp, m, c->multipole->CoM, cp->multipole->CoM);
+        gravity_multipole_add(&c->multipole->m_pole, &temp);
+
+        /* Upper limit of max CoM<->gpart distance */
+        const double dx = c->multipole->CoM[0] - cp->multipole->CoM[0];
+        const double dy = c->multipole->CoM[1] - cp->multipole->CoM[1];
+        const double dz = c->multipole->CoM[2] - cp->multipole->CoM[2];
+        const double r2 = dx * dx + dy * dy + dz * dz;
+        r_max = max(r_max, cp->multipole->r_max + sqrt(r2));
+      }
+    }
+    /* Alternative upper limit of max CoM<->gpart distance */
+    const double dx = c->multipole->CoM[0] > c->loc[0] + c->width[0] / 2.
+                          ? c->multipole->CoM[0] - c->loc[0]
+                          : c->loc[0] + c->width[0] - c->multipole->CoM[0];
+    const double dy = c->multipole->CoM[1] > c->loc[1] + c->width[1] / 2.
+                          ? c->multipole->CoM[1] - c->loc[1]
+                          : c->loc[1] + c->width[1] - c->multipole->CoM[1];
+    const double dz = c->multipole->CoM[2] > c->loc[2] + c->width[2] / 2.
+                          ? c->multipole->CoM[2] - c->loc[2]
+                          : c->loc[2] + c->width[2] - c->multipole->CoM[2];
+
+    /* Take minimum of both limits */
+    c->multipole->r_max = min(r_max, sqrt(dx * dx + dy * dy + dz * dz));
+
+  } else {
+
+    if (c->gcount > 0) {
+      gravity_P2M(c->multipole, c->gparts, c->gcount);
+      const double dx = c->multipole->CoM[0] > c->loc[0] + c->width[0] / 2.
+                            ? c->multipole->CoM[0] - c->loc[0]
+                            : c->loc[0] + c->width[0] - c->multipole->CoM[0];
+      const double dy = c->multipole->CoM[1] > c->loc[1] + c->width[1] / 2.
+                            ? c->multipole->CoM[1] - c->loc[1]
+                            : c->loc[1] + c->width[1] - c->multipole->CoM[1];
+      const double dz = c->multipole->CoM[2] > c->loc[2] + c->width[2] / 2.
+                            ? c->multipole->CoM[2] - c->loc[2]
+                            : c->loc[2] + c->width[2] - c->multipole->CoM[2];
+      c->multipole->r_max = sqrt(dx * dx + dy * dy + dz * dz);
+    } else {
+      gravity_multipole_init(&c->multipole->m_pole);
+      c->multipole->CoM[0] = c->loc[0] + c->width[0] / 2.;
+      c->multipole->CoM[1] = c->loc[1] + c->width[1] / 2.;
+      c->multipole->CoM[2] = c->loc[2] + c->width[2] / 2.;
+      c->multipole->r_max = 0.;
+    }
   }
 
-  return 1;
+  c->ti_old_multipole = ti_current;
 }
 
 /**
@@ -1145,6 +1206,8 @@ void cell_check_multipole(struct cell *c, void *data) {
   struct gravity_tensors ma;
   const double tolerance = 1e-3; /* Relative */
 
+  return;
+
   /* First recurse */
   if (c->split)
     for (int k = 0; k < 8; k++)
@@ -1244,28 +1307,44 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
   /* Un-skip the density tasks involved with this cell. */
   for (struct link *l = c->density; l != NULL; l = l->next) {
     struct task *t = l->t;
-    const struct cell *ci = t->ci;
-    const struct cell *cj = t->cj;
+    struct cell *ci = t->ci;
+    struct cell *cj = t->cj;
     scheduler_activate(s, t);
 
     /* Set the correct sorting flags */
     if (t->type == task_type_pair) {
+      if (ci->dx_max_sort > space_maxreldx * ci->dmin) {
+        for (struct cell *finger = ci; finger != NULL; finger = finger->parent)
+          finger->sorted = 0;
+      }
+      if (cj->dx_max_sort > space_maxreldx * cj->dmin) {
+        for (struct cell *finger = cj; finger != NULL; finger = finger->parent)
+          finger->sorted = 0;
+      }
       if (!(ci->sorted & (1 << t->flags))) {
-        atomic_or(&ci->sorts->flags, (1 << t->flags));
+#ifdef SWIFT_DEBUG_CHECKS
+        if (!(ci->sorts->flags & (1 << t->flags)))
+          error("bad flags in sort task.");
+#endif
         scheduler_activate(s, ci->sorts);
+        if (ci->nodeID == engine_rank) scheduler_activate(s, ci->drift);
       }
       if (!(cj->sorted & (1 << t->flags))) {
-        atomic_or(&cj->sorts->flags, (1 << t->flags));
+#ifdef SWIFT_DEBUG_CHECKS
+        if (!(cj->sorts->flags & (1 << t->flags)))
+          error("bad flags in sort task.");
+#endif
         scheduler_activate(s, cj->sorts);
+        if (cj->nodeID == engine_rank) scheduler_activate(s, cj->drift);
       }
     }
 
-    /* Check whether there was too much particle motion */
+    /* Only interested in pair interactions as of here. */
     if (t->type == task_type_pair || t->type == task_type_sub_pair) {
-      if (t->tight &&
-          (max(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin ||
-           ci->dx_max > space_maxreldx * ci->h_max ||
-           cj->dx_max > space_maxreldx * cj->h_max))
+
+      /* Check whether there was too much particle motion, i.e. the
+         cell neighbour conditions were violated. */
+      if (max(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin)
         rebuild = 1;
 
 #ifdef WITH_MPI
@@ -1287,10 +1366,12 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
         if (l == NULL) error("Missing link to send_xv task.");
         scheduler_activate(s, l->t);
 
-        if (cj->super->drift)
-          scheduler_activate(s, cj->super->drift);
+        /* Drift both cells, the foreign one at the level which it is sent. */
+        if (l->t->ci->drift)
+          scheduler_activate(s, l->t->ci->drift);
         else
           error("Drift task missing !");
+        if (t->type == task_type_pair) scheduler_activate(s, cj->drift);
 
         if (cell_is_active(cj, e)) {
           for (l = cj->send_rho; l != NULL && l->t->cj->nodeID != ci->nodeID;
@@ -1323,10 +1404,12 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
         if (l == NULL) error("Missing link to send_xv task.");
         scheduler_activate(s, l->t);
 
-        if (ci->super->drift)
-          scheduler_activate(s, ci->super->drift);
+        /* Drift both cells, the foreign one at the level which it is sent. */
+        if (l->t->ci->drift)
+          scheduler_activate(s, l->t->ci->drift);
         else
           error("Drift task missing !");
+        if (t->type == task_type_pair) scheduler_activate(s, ci->drift);
 
         if (cell_is_active(ci, e)) {
           for (l = ci->send_rho; l != NULL && l->t->cj->nodeID != cj->nodeID;
@@ -1341,6 +1424,14 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
           if (l == NULL) error("Missing link to send_ti task.");
           scheduler_activate(s, l->t);
         }
+      } else if (t->type == task_type_pair) {
+        scheduler_activate(s, ci->drift);
+        scheduler_activate(s, cj->drift);
+      }
+#else
+      if (t->type == task_type_pair) {
+        scheduler_activate(s, ci->drift);
+        scheduler_activate(s, cj->drift);
       }
 #endif
     }
@@ -1355,7 +1446,6 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
     scheduler_activate(s, l->t);
   if (c->extra_ghost != NULL) scheduler_activate(s, c->extra_ghost);
   if (c->ghost != NULL) scheduler_activate(s, c->ghost);
-  if (c->init != NULL) scheduler_activate(s, c->init);
   if (c->init_grav != NULL) scheduler_activate(s, c->init_grav);
   if (c->drift != NULL) scheduler_activate(s, c->drift);
   if (c->kick1 != NULL) scheduler_activate(s, c->kick1);
@@ -1409,7 +1499,9 @@ void cell_drift_particles(struct cell *c, const struct engine *e) {
 
   /* Drift from the last time the cell was drifted to the current time */
   const double dt = (ti_current - ti_old) * timeBase;
-  float dx_max = 0.f, dx2_max = 0.f, cell_h_max = 0.f;
+  float dx_max = 0.f, dx2_max = 0.f;
+  float dx_max_sort = 0.0f, dx2_max_sort = 0.f;
+  float cell_h_max = 0.f;
 
   /* Check that we are actually going to move forward. */
   if (ti_current < ti_old) error("Attempt to drift to the past");
@@ -1421,8 +1513,13 @@ void cell_drift_particles(struct cell *c, const struct engine *e) {
     for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL) {
         struct cell *cp = c->progeny[k];
+
+        /* Collect */
         cell_drift_particles(cp, e);
+
+        /* Update */
         dx_max = max(dx_max, cp->dx_max);
+        dx_max_sort = max(dx_max_sort, cp->dx_max_sort);
         cell_h_max = max(cell_h_max, cp->h_max);
       }
 
@@ -1443,6 +1540,11 @@ void cell_drift_particles(struct cell *c, const struct engine *e) {
                         gp->x_diff[1] * gp->x_diff[1] +
                         gp->x_diff[2] * gp->x_diff[2];
       dx2_max = max(dx2_max, dx2);
+
+      /* Init gravity force fields. */
+      if (gpart_is_active(gp, e)) {
+        gravity_init_gpart(gp);
+      }
     }
 
     /* Loop over all the gas particles in the cell */
@@ -1464,9 +1566,18 @@ void cell_drift_particles(struct cell *c, const struct engine *e) {
                         xp->x_diff[1] * xp->x_diff[1] +
                         xp->x_diff[2] * xp->x_diff[2];
       dx2_max = max(dx2_max, dx2);
+      const float dx2_sort = xp->x_diff_sort[0] * xp->x_diff_sort[0] +
+                             xp->x_diff_sort[1] * xp->x_diff_sort[1] +
+                             xp->x_diff_sort[2] * xp->x_diff_sort[2];
+      dx2_max_sort = max(dx2_max_sort, dx2_sort);
 
       /* Maximal smoothing length */
       cell_h_max = max(cell_h_max, p->h);
+
+      /* Get ready for a density calculation */
+      if (part_is_active(p, e)) {
+        hydro_init_part(p, &e->s->hs);
+      }
     }
 
     /* Loop over all the star particles in the cell */
@@ -1484,16 +1595,19 @@ void cell_drift_particles(struct cell *c, const struct engine *e) {
 
     /* Now, get the maximal particle motion from its square */
     dx_max = sqrtf(dx2_max);
+    dx_max_sort = sqrtf(dx2_max_sort);
 
   } else {
 
     cell_h_max = c->h_max;
     dx_max = c->dx_max;
+    dx_max_sort = c->dx_max_sort;
   }
 
   /* Store the values */
   c->h_max = cell_h_max;
   c->dx_max = dx_max;
+  c->dx_max_sort = dx_max_sort;
 
   /* Update the time of the last drift */
   c->ti_old = ti_current;
diff --git a/src/cell.h b/src/cell.h
index b3bc7363f59ba6f9b117bfa690088ab9de40a28e..c89e70960e84bb027f5e99a3cb362f2e0722b9bd 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -148,9 +148,6 @@ struct cell {
   /*! Linked list of the tasks computing this cell's gravity forces. */
   struct link *grav;
 
-  /*! The particle initialistation task */
-  struct task *init;
-
   /*! The multipole initialistation task */
   struct task *init_grav;
 
@@ -239,6 +236,9 @@ struct cell {
   /*! Last (integer) time the cell's particle was drifted forward in time. */
   integertime_t ti_old;
 
+  /*! Last (integer) time the cell's sort arrays were updated. */
+  integertime_t ti_sort;
+
   /*! Last (integer) time the cell's multipole was drifted forward in time. */
   integertime_t ti_old_multipole;
 
@@ -248,6 +248,9 @@ struct cell {
   /*! Maximum particle movement in this cell since last construction. */
   float dx_max;
 
+  /*! Maximum particle movement in this cell since the last sort. */
+  float dx_max_sort;
+
   /*! Nr of #part in this cell. */
   int count;
 
@@ -351,8 +354,7 @@ int cell_link_gparts(struct cell *c, struct gpart *gparts);
 int cell_link_sparts(struct cell *c, struct spart *sparts);
 void cell_convert_hydro(struct cell *c, void *data);
 void cell_clean_links(struct cell *c, void *data);
-int cell_are_neighbours(const struct cell *restrict ci,
-                        const struct cell *restrict cj);
+void cell_make_multipoles(struct cell *c, integertime_t ti_current);
 void cell_check_multipole(struct cell *c, void *data);
 void cell_clean(struct cell *c);
 void cell_check_particle_drift_point(struct cell *c, void *data);
diff --git a/src/drift.h b/src/drift.h
index 9957d829fd28e033df9db325186195a3b396738c..d9b79f7f0549d85b6f05e8ce4a394aaa5b2a4d8d 100644
--- a/src/drift.h
+++ b/src/drift.h
@@ -101,10 +101,12 @@ __attribute__((always_inline)) INLINE static void drift_part(
   /* Predict the values of the extra fields */
   hydro_predict_extra(p, xp, dt);
 
-  /* Compute offset since last cell construction */
-  xp->x_diff[0] -= xp->v_full[0] * dt;
-  xp->x_diff[1] -= xp->v_full[1] * dt;
-  xp->x_diff[2] -= xp->v_full[2] * dt;
+  /* Compute offsets since last cell construction */
+  for (int k = 0; k < 3; k++) {
+    const float dx = xp->v_full[k] * dt;
+    xp->x_diff[k] -= dx;
+    xp->x_diff_sort[k] -= dx;
+  }
 }
 
 /**
diff --git a/src/engine.c b/src/engine.c
index 0fa2844e26e1e574fdda2d8d7292c685db4b4cf6..ee4838399924573a765466290b2fb7b1d37bcf87 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -144,47 +144,37 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
     /* Local tasks only... */
     if (c->nodeID == e->nodeID) {
 
-      /* Add the init task. */
-      c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0, c,
-                                  NULL, 0);
-
       /* Add the two half kicks */
       c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0, 0,
-                                   c, NULL, 0);
+                                   c, NULL);
 
       c->kick2 = scheduler_addtask(s, task_type_kick2, task_subtype_none, 0, 0,
-                                   c, NULL, 0);
+                                   c, NULL);
 
       /* Add the time-step calculation task and its dependency */
       c->timestep = scheduler_addtask(s, task_type_timestep, task_subtype_none,
-                                      0, 0, c, NULL, 0);
+                                      0, 0, c, NULL);
 
       scheduler_addunlock(s, c->kick2, c->timestep);
       scheduler_addunlock(s, c->timestep, c->kick1);
 
-      /* Add the drift task and its dependencies. */
-      c->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0, 0,
-                                   c, NULL, 0);
-
-      scheduler_addunlock(s, c->drift, c->init);
-
       if (is_self_gravity) {
 
         /* Initialisation of the multipoles */
         c->init_grav = scheduler_addtask(s, task_type_init_grav,
-                                         task_subtype_none, 0, 0, c, NULL, 0);
+                                         task_subtype_none, 0, 0, c, NULL);
 
         /* Gravity non-neighbouring pm calculations */
         c->grav_long_range = scheduler_addtask(
-            s, task_type_grav_long_range, task_subtype_none, 0, 0, c, NULL, 0);
+            s, task_type_grav_long_range, task_subtype_none, 0, 0, c, NULL);
 
         /* Gravity top-level periodic calculation */
-        c->grav_top_level = scheduler_addtask(
-            s, task_type_grav_top_level, task_subtype_none, 0, 0, c, NULL, 0);
+        c->grav_top_level = scheduler_addtask(s, task_type_grav_top_level,
+                                              task_subtype_none, 0, 0, c, NULL);
 
         /* Gravity recursive down-pass */
         c->grav_down = scheduler_addtask(s, task_type_grav_down,
-                                         task_subtype_none, 0, 0, c, NULL, 0);
+                                         task_subtype_none, 0, 0, c, NULL);
 
         scheduler_addunlock(s, c->init_grav, c->grav_long_range);
         scheduler_addunlock(s, c->init_grav, c->grav_top_level);
@@ -196,19 +186,19 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
       /* Generate the ghost task. */
       if (is_hydro)
         c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
-                                     0, c, NULL, 0);
+                                     0, c, NULL);
 
 #ifdef EXTRA_HYDRO_LOOP
       /* Generate the extra ghost task. */
       if (is_hydro)
         c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost,
-                                           task_subtype_none, 0, 0, c, NULL, 0);
+                                           task_subtype_none, 0, 0, c, NULL);
 #endif
 
       /* Cooling task */
       if (is_with_cooling) {
         c->cooling = scheduler_addtask(s, task_type_cooling, task_subtype_none,
-                                       0, 0, c, NULL, 0);
+                                       0, 0, c, NULL);
 
         scheduler_addunlock(s, c->cooling, c->kick2);
       }
@@ -216,7 +206,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
       /* add source terms */
       if (is_with_sourceterms) {
         c->sourceterms = scheduler_addtask(s, task_type_sourceterms,
-                                           task_subtype_none, 0, 0, c, NULL, 0);
+                                           task_subtype_none, 0, 0, c, NULL);
       }
     }
 
@@ -1020,19 +1010,15 @@ void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj,
     /* Create the tasks and their dependencies? */
     if (t_xv == NULL) {
 
-      if (ci->super->drift == NULL)
-        ci->super->drift = scheduler_addtask(
-            s, task_type_drift, task_subtype_none, 0, 0, ci->super, NULL, 0);
-
       t_xv = scheduler_addtask(s, task_type_send, task_subtype_xv, 4 * ci->tag,
-                               0, ci, cj, 0);
+                               0, ci, cj);
       t_rho = scheduler_addtask(s, task_type_send, task_subtype_rho,
-                                4 * ci->tag + 1, 0, ci, cj, 0);
+                                4 * ci->tag + 1, 0, ci, cj);
       t_ti = scheduler_addtask(s, task_type_send, task_subtype_tend,
-                               4 * ci->tag + 2, 0, ci, cj, 0);
+                               4 * ci->tag + 2, 0, ci, cj);
 #ifdef EXTRA_HYDRO_LOOP
       t_gradient = scheduler_addtask(s, task_type_send, task_subtype_gradient,
-                                     4 * ci->tag + 3, 0, ci, cj, 0);
+                                     4 * ci->tag + 3, 0, ci, cj);
 #endif
 
 #ifdef EXTRA_HYDRO_LOOP
@@ -1063,7 +1049,10 @@ void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj,
 #endif
 
       /* Drift before you send */
-      scheduler_addunlock(s, ci->super->drift, t_xv);
+      if (ci->drift == NULL)
+        ci->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0,
+                                      0, ci, NULL);
+      scheduler_addunlock(s, ci->drift, t_xv);
 
       /* The super-cell's timestep task should unlock the send_ti task. */
       scheduler_addunlock(s, ci->super->timestep, t_ti);
@@ -1114,14 +1103,14 @@ void engine_addtasks_recv(struct engine *e, struct cell *c, struct task *t_xv,
 
     /* Create the tasks. */
     t_xv = scheduler_addtask(s, task_type_recv, task_subtype_xv, 4 * c->tag, 0,
-                             c, NULL, 0);
+                             c, NULL);
     t_rho = scheduler_addtask(s, task_type_recv, task_subtype_rho,
-                              4 * c->tag + 1, 0, c, NULL, 0);
+                              4 * c->tag + 1, 0, c, NULL);
     t_ti = scheduler_addtask(s, task_type_recv, task_subtype_tend,
-                             4 * c->tag + 2, 0, c, NULL, 0);
+                             4 * c->tag + 2, 0, c, NULL);
 #ifdef EXTRA_HYDRO_LOOP
     t_gradient = scheduler_addtask(s, task_type_recv, task_subtype_gradient,
-                                   4 * c->tag + 3, 0, c, NULL, 0);
+                                   4 * c->tag + 3, 0, c, NULL);
 #endif
   }
   c->recv_xv = t_xv;
@@ -1677,8 +1666,7 @@ void engine_make_self_gravity_tasks(struct engine *e) {
     if (ci->nodeID != nodeID) continue;
 
     /* If the cells is local build a self-interaction */
-    scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL,
-                      0);
+    scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL);
 
     /* Loop over every other cell */
     for (int cjd = cid + 1; cjd < nr_cells; ++cjd) {
@@ -1693,9 +1681,9 @@ void engine_make_self_gravity_tasks(struct engine *e) {
 
       /* Are the cells to close for a MM interaction ? */
       if (!gravity_multipole_accept(ci->multipole, cj->multipole,
-                                    theta_crit_inv))
+                                    theta_crit_inv, 1))
         scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0, 0, ci,
-                          cj, 1);
+                          cj);
     }
   }
 }
@@ -1720,7 +1708,7 @@ void engine_make_external_gravity_tasks(struct engine *e) {
 
     /* If the cell is local, build a self-interaction */
     scheduler_addtask(sched, task_type_self, task_subtype_external_grav, 0, 0,
-                      ci, NULL, 0);
+                      ci, NULL);
   }
 }
 
@@ -1758,7 +1746,7 @@ void engine_make_hydroloop_tasks(struct engine *e) {
         /* If the cells is local build a self-interaction */
         if (ci->nodeID == nodeID)
           scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0,
-                            ci, NULL, 0);
+                            ci, NULL);
 
         /* Now loop over all the neighbours of this cell */
         for (int ii = -1; ii < 2; ii++) {
@@ -1787,7 +1775,7 @@ void engine_make_hydroloop_tasks(struct engine *e) {
               const int sid =
                   sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))];
               scheduler_addtask(sched, task_type_pair, task_subtype_density,
-                                sid, 0, ci, cj, 1);
+                                sid, 0, ci, cj);
             }
           }
         }
@@ -1816,15 +1804,22 @@ void engine_count_and_link_tasks(struct engine *e) {
     struct cell *const ci = t->ci;
     struct cell *const cj = t->cj;
 
-    /* Link sort tasks together. */
-    if (t->type == task_type_sort && ci->split)
-      for (int j = 0; j < 8; j++)
-        if (ci->progeny[j] != NULL && ci->progeny[j]->sorts != NULL) {
-          scheduler_addunlock(sched, ci->progeny[j]->sorts, t);
-        }
+    /* Link sort tasks to the next-higher sort task. */
+    if (t->type == task_type_sort) {
+      struct cell *finger = t->ci->parent;
+      while (finger != NULL && finger->sorts == NULL) finger = finger->parent;
+      if (finger != NULL) scheduler_addunlock(sched, t, finger->sorts);
+    }
+
+    /* Link drift tasks to the next-higher drift task. */
+    else if (t->type == task_type_drift) {
+      struct cell *finger = ci->parent;
+      while (finger != NULL && finger->drift == NULL) finger = finger->parent;
+      if (finger != NULL) scheduler_addunlock(sched, t, finger->drift);
+    }
 
     /* Link self tasks to cells. */
-    if (t->type == task_type_self) {
+    else if (t->type == task_type_self) {
       atomic_inc(&ci->nr_tasks);
       if (t->subtype == task_subtype_density) {
         engine_addlink(e, &ci->density, t);
@@ -1895,7 +1890,6 @@ static inline void engine_make_self_gravity_dependencies(
     struct scheduler *sched, struct task *gravity, struct cell *c) {
 
   /* init --> gravity --> grav_down --> kick */
-  scheduler_addunlock(sched, c->super->init, gravity);
   scheduler_addunlock(sched, c->super->init_grav, gravity);
   scheduler_addunlock(sched, gravity, c->super->grav_down);
 }
@@ -1912,7 +1906,7 @@ static inline void engine_make_external_gravity_dependencies(
     struct scheduler *sched, struct task *gravity, struct cell *c) {
 
   /* init --> external gravity --> kick */
-  scheduler_addunlock(sched, c->super->init, gravity);
+  scheduler_addunlock(sched, c->drift, gravity);
   scheduler_addunlock(sched, gravity, c->super->kick2);
 }
 
@@ -2008,9 +2002,8 @@ static inline void engine_make_hydro_loops_dependencies(
     struct scheduler *sched, struct task *density, struct task *gradient,
     struct task *force, struct cell *c, int with_cooling) {
 
-  /* init --> density loop --> ghost --> gradient loop --> extra_ghost */
+  /* density loop --> ghost --> gradient loop --> extra_ghost */
   /* extra_ghost --> force loop  */
-  scheduler_addunlock(sched, c->super->init, density);
   scheduler_addunlock(sched, density, c->super->ghost);
   scheduler_addunlock(sched, c->super->ghost, gradient);
   scheduler_addunlock(sched, gradient, c->super->extra_ghost);
@@ -2041,8 +2034,7 @@ static inline void engine_make_hydro_loops_dependencies(struct scheduler *sched,
                                                         struct task *force,
                                                         struct cell *c,
                                                         int with_cooling) {
-  /* init --> density loop --> ghost --> force loop */
-  scheduler_addunlock(sched, c->super->init, density);
+  /* density loop --> ghost --> force loop */
   scheduler_addunlock(sched, density, c->super->ghost);
   scheduler_addunlock(sched, c->super->ghost, force);
 
@@ -2078,15 +2070,23 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
   for (int ind = 0; ind < nr_tasks; ind++) {
     struct task *t = &sched->tasks[ind];
 
+    /* Sort tasks depend on the drift of the cell. */
+    if (t->type == task_type_sort && t->ci->nodeID == engine_rank) {
+      scheduler_addunlock(sched, t->ci->drift, t);
+    }
+
     /* Self-interaction? */
-    if (t->type == task_type_self && t->subtype == task_subtype_density) {
+    else if (t->type == task_type_self && t->subtype == task_subtype_density) {
+
+      /* Make all density tasks depend on the drift. */
+      scheduler_addunlock(sched, t->ci->drift, t);
 
 #ifdef EXTRA_HYDRO_LOOP
       /* Start by constructing the task for the second  and third hydro loop */
       struct task *t2 = scheduler_addtask(
-          sched, task_type_self, task_subtype_gradient, 0, 0, t->ci, NULL, 0);
+          sched, task_type_self, task_subtype_gradient, 0, 0, t->ci, NULL);
       struct task *t3 = scheduler_addtask(
-          sched, task_type_self, task_subtype_force, 0, 0, t->ci, NULL, 0);
+          sched, task_type_self, task_subtype_force, 0, 0, t->ci, NULL);
 
       /* Add the link between the new loops and the cell */
       engine_addlink(e, &t->ci->gradient, t2);
@@ -2100,7 +2100,7 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
 
       /* Start by constructing the task for the second hydro loop */
       struct task *t2 = scheduler_addtask(
-          sched, task_type_self, task_subtype_force, 0, 0, t->ci, NULL, 0);
+          sched, task_type_self, task_subtype_force, 0, 0, t->ci, NULL);
 
       /* Add the link between the new loop and the cell */
       engine_addlink(e, &t->ci->force, t2);
@@ -2113,12 +2113,18 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
     /* Otherwise, pair interaction? */
     else if (t->type == task_type_pair && t->subtype == task_subtype_density) {
 
+      /* Make all density tasks depend on the drift. */
+      if (t->ci->nodeID == engine_rank)
+        scheduler_addunlock(sched, t->ci->drift, t);
+      if (t->cj->nodeID == engine_rank)
+        scheduler_addunlock(sched, t->cj->drift, t);
+
 #ifdef EXTRA_HYDRO_LOOP
       /* Start by constructing the task for the second and third hydro loop */
       struct task *t2 = scheduler_addtask(
-          sched, task_type_pair, task_subtype_gradient, 0, 0, t->ci, t->cj, 0);
+          sched, task_type_pair, task_subtype_gradient, 0, 0, t->ci, t->cj);
       struct task *t3 = scheduler_addtask(
-          sched, task_type_pair, task_subtype_force, 0, 0, t->ci, t->cj, 0);
+          sched, task_type_pair, task_subtype_force, 0, 0, t->ci, t->cj);
 
       /* Add the link between the new loop and both cells */
       engine_addlink(e, &t->ci->gradient, t2);
@@ -2141,7 +2147,7 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
 
       /* Start by constructing the task for the second hydro loop */
       struct task *t2 = scheduler_addtask(
-          sched, task_type_pair, task_subtype_force, 0, 0, t->ci, t->cj, 0);
+          sched, task_type_pair, task_subtype_force, 0, 0, t->ci, t->cj);
 
       /* Add the link between the new loop and both cells */
       engine_addlink(e, &t->ci->force, t2);
@@ -2169,10 +2175,10 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
       /* Start by constructing the task for the second and third hydro loop */
       struct task *t2 =
           scheduler_addtask(sched, task_type_sub_self, task_subtype_gradient,
-                            t->flags, 0, t->ci, t->cj, 0);
+                            t->flags, 0, t->ci, t->cj);
       struct task *t3 =
           scheduler_addtask(sched, task_type_sub_self, task_subtype_force,
-                            t->flags, 0, t->ci, t->cj, 0);
+                            t->flags, 0, t->ci, t->cj);
 
       /* Add the link between the new loop and the cell */
       engine_addlink(e, &t->ci->gradient, t2);
@@ -2189,7 +2195,7 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
       /* Start by constructing the task for the second hydro loop */
       struct task *t2 =
           scheduler_addtask(sched, task_type_sub_self, task_subtype_force,
-                            t->flags, 0, t->ci, t->cj, 0);
+                            t->flags, 0, t->ci, t->cj);
 
       /* Add the link between the new loop and the cell */
       engine_addlink(e, &t->ci->force, t2);
@@ -2211,10 +2217,10 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
       /* Start by constructing the task for the second and third hydro loop */
       struct task *t2 =
           scheduler_addtask(sched, task_type_sub_pair, task_subtype_gradient,
-                            t->flags, 0, t->ci, t->cj, 0);
+                            t->flags, 0, t->ci, t->cj);
       struct task *t3 =
           scheduler_addtask(sched, task_type_sub_pair, task_subtype_force,
-                            t->flags, 0, t->ci, t->cj, 0);
+                            t->flags, 0, t->ci, t->cj);
 
       /* Add the link between the new loop and both cells */
       engine_addlink(e, &t->ci->gradient, t2);
@@ -2237,7 +2243,7 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
       /* Start by constructing the task for the second hydro loop */
       struct task *t2 =
           scheduler_addtask(sched, task_type_sub_pair, task_subtype_force,
-                            t->flags, 0, t->ci, t->cj, 0);
+                            t->flags, 0, t->ci, t->cj);
 
       /* Add the link between the new loop and both cells */
       engine_addlink(e, &t->ci->force, t2);
@@ -2282,7 +2288,7 @@ void engine_make_gravityrecursive_tasks(struct engine *e) {
   /*     struct task *down = NULL; */
   /*         /\* scheduler_addtask(sched, task_type_grav_down,
    * task_subtype_none, 0, 0, *\/ */
-  /*         /\*                   &cells[k], NULL, 0); *\/ */
+  /*         /\*                   &cells[k], NULL); *\/ */
 
   /*     /\* Push tasks down the cell hierarchy. *\/ */
   /*     engine_addtasks_grav(e, &cells[k], up, down); */
@@ -2290,6 +2296,21 @@ void engine_make_gravityrecursive_tasks(struct engine *e) {
   /* } */
 }
 
+void engine_check_sort_tasks(struct engine *e, struct cell *c) {
+
+  /* Find the parent sort task, if any, and copy its flags. */
+  if (c->sorts != NULL) {
+    struct cell *parent = c->parent;
+    while (parent != NULL && parent->sorts == NULL) parent = parent->parent;
+    if (parent != NULL) c->sorts->flags |= parent->sorts->flags;
+  }
+
+  /* Recurse? */
+  if (c->split)
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) engine_check_sort_tasks(e, c->progeny[k]);
+}
+
 /**
  * @brief Fill the #space's task list.
  *
@@ -2350,10 +2371,13 @@ void engine_maketasks(struct engine *e) {
    * pointers. */
   for (int k = 0; k < nr_cells; k++) cell_set_super(&cells[k], NULL);
 
-  /* Append hierarchical tasks to each cells */
+  /* Append hierarchical tasks to each cell. */
   for (int k = 0; k < nr_cells; k++)
     engine_make_hierarchical_tasks(e, &cells[k]);
 
+  /* Append hierarchical tasks to each cell. */
+  for (int k = 0; k < nr_cells; k++) engine_check_sort_tasks(e, &cells[k]);
+
   /* Run through the tasks and make force tasks for each density task.
      Each force task depends on the cell ghosts and unlocks the kick task
      of its super-cell. */
@@ -2437,15 +2461,8 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
     else if (t->type == task_type_pair || t->type == task_type_sub_pair) {
 
       /* Local pointers. */
-      const struct cell *ci = t->ci;
-      const struct cell *cj = t->cj;
-
-      /* Too much particle movement? */
-      if (t->tight &&
-          (max(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin ||
-           ci->dx_max > space_maxreldx * ci->h_max ||
-           cj->dx_max > space_maxreldx * cj->h_max))
-        *rebuild_space = 1;
+      struct cell *ci = t->ci;
+      struct cell *cj = t->cj;
 
       /* Set this task's skip, otherwise nothing to do. */
       if (cell_is_active(t->ci, e) || cell_is_active(t->cj, e))
@@ -2456,20 +2473,41 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
       /* If this is not a density task, we don't have to do any of the below. */
       if (t->subtype != task_subtype_density) continue;
 
-      /* Set the sort flags. */
+      /* Too much particle movement? */
+      if (max(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin)
+        *rebuild_space = 1;
+
+      /* Set the correct sorting flags */
       if (t->type == task_type_pair) {
+        if (ci->dx_max_sort > space_maxreldx * ci->dmin) {
+          for (struct cell *finger = ci; finger != NULL;
+               finger = finger->parent)
+            finger->sorted = 0;
+        }
+        if (cj->dx_max_sort > space_maxreldx * cj->dmin) {
+          for (struct cell *finger = cj; finger != NULL;
+               finger = finger->parent)
+            finger->sorted = 0;
+        }
         if (!(ci->sorted & (1 << t->flags))) {
-          atomic_or(&ci->sorts->flags, (1 << t->flags));
+#ifdef SWIFT_DEBUG_CHECKS
+          if (!(ci->sorts->flags & (1 << t->flags)))
+            error("bad flags in sort task.");
+#endif
           scheduler_activate(s, ci->sorts);
+          if (ci->nodeID == engine_rank) scheduler_activate(s, ci->drift);
         }
         if (!(cj->sorted & (1 << t->flags))) {
-          atomic_or(&cj->sorts->flags, (1 << t->flags));
+#ifdef SWIFT_DEBUG_CHECKS
+          if (!(cj->sorts->flags & (1 << t->flags)))
+            error("bad flags in sort task.");
+#endif
           scheduler_activate(s, cj->sorts);
+          if (cj->nodeID == engine_rank) scheduler_activate(s, cj->drift);
         }
       }
 
 #ifdef WITH_MPI
-
       /* Activate the send/recv flags. */
       if (ci->nodeID != engine_rank) {
 
@@ -2488,10 +2526,12 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
         if (l == NULL) error("Missing link to send_xv task.");
         scheduler_activate(s, l->t);
 
-        if (cj->super->drift)
-          scheduler_activate(s, cj->super->drift);
+        /* Drift both cells, the foreign one at the level which it is sent. */
+        if (l->t->ci->drift)
+          scheduler_activate(s, l->t->ci->drift);
         else
           error("Drift task missing !");
+        if (t->type == task_type_pair) scheduler_activate(s, cj->drift);
 
         if (cell_is_active(cj, e)) {
           for (l = cj->send_rho; l != NULL && l->t->cj->nodeID != ci->nodeID;
@@ -2524,10 +2564,12 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
         if (l == NULL) error("Missing link to send_xv task.");
         scheduler_activate(s, l->t);
 
-        if (ci->super->drift)
-          scheduler_activate(s, ci->super->drift);
+        /* Drift both cells, the foreign one at the level which it is sent. */
+        if (l->t->ci->drift)
+          scheduler_activate(s, l->t->ci->drift);
         else
           error("Drift task missing !");
+        if (t->type == task_type_pair) scheduler_activate(s, ci->drift);
 
         if (cell_is_active(ci, e)) {
           for (l = ci->send_rho; l != NULL && l->t->cj->nodeID != cj->nodeID;
@@ -2542,15 +2584,22 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
           if (l == NULL) error("Missing link to send_ti task.");
           scheduler_activate(s, l->t);
         }
-      }
 
+      } else if (t->type == task_type_pair) {
+        scheduler_activate(s, ci->drift);
+        scheduler_activate(s, cj->drift);
+      }
+#else
+      if (t->type == task_type_pair) {
+        scheduler_activate(s, ci->drift);
+        scheduler_activate(s, cj->drift);
+      }
 #endif
     }
 
-    /* Kick/Drift/Init? */
+    /* Kick/Drift? */
     else if (t->type == task_type_kick1 || t->type == task_type_kick2 ||
-             t->type == task_type_drift || t->type == task_type_init ||
-             t->type == task_type_init_grav) {
+             t->type == task_type_drift || t->type == task_type_init_grav) {
       if (cell_is_active(t->ci, e)) scheduler_activate(s, t);
     }
 
@@ -2691,7 +2740,8 @@ void engine_rebuild(struct engine *e) {
  */
 void engine_prepare(struct engine *e) {
 
-  TIMER_TIC;
+  TIMER_TIC2;
+  const ticks tic = getticks();
 
 #ifdef SWIFT_DEBUG_CHECKS
   if (e->forcerepart || e->forcerebuild) {
@@ -2718,7 +2768,7 @@ void engine_prepare(struct engine *e) {
   }
   e->tasks_age += 1;
 
-  TIMER_TOC(timer_prepare);
+  TIMER_TOC2(timer_prepare);
 
   if (e->verbose)
     message("took %.3f %s (including unskip and reweight).",
@@ -3093,6 +3143,11 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
   /* Print the number of active tasks ? */
   if (e->verbose) engine_print_task_counts(e);
 
+  /* Init the particle hydro data (by hand). */
+  for (size_t k = 0; k < s->nr_parts; k++)
+    hydro_init_part(&s->parts[k], &e->s->hs);
+  for (size_t k = 0; k < s->nr_gparts; k++) gravity_init_gpart(&s->gparts[k]);
+
   /* Now, launch the calculation */
   TIMER_TIC;
   engine_launch(e, e->nr_threads);
@@ -3138,6 +3193,11 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
   /* No drift this time */
   engine_skip_drift(e);
 
+  /* Init the particle hydro data (by hand). */
+  for (size_t k = 0; k < s->nr_parts; k++)
+    hydro_init_part(&s->parts[k], &e->s->hs);
+  for (size_t k = 0; k < s->nr_gparts; k++) gravity_init_gpart(&s->gparts[k]);
+
   /* Print the number of active tasks ? */
   if (e->verbose) engine_print_task_counts(e);
 
@@ -3225,6 +3285,15 @@ void engine_step(struct engine *e) {
   /* Are we drifting everything (a la Gadget/GIZMO) ? */
   if (e->policy & engine_policy_drift_all) engine_drift_all(e);
 
+  /* Are we reconstructing the multipoles or drifting them ?*/
+  if (e->policy & engine_policy_self_gravity) {
+
+    if (e->policy & engine_policy_reconstruct_mpoles)
+      engine_reconstruct_multipoles(e);
+    else
+      engine_drift_top_multipoles(e);
+  }
+
   /* Print the number of active tasks ? */
   if (e->verbose) engine_print_task_counts(e);
 
@@ -3248,9 +3317,6 @@ void engine_step(struct engine *e) {
     gravity_exact_force_compute(e->s, e);
 #endif
 
-  /* Do we need to drift the top-level multipoles ? */
-  if (e->policy & engine_policy_self_gravity) engine_drift_top_multipoles(e);
-
   /* Start all the tasks. */
   TIMER_TIC;
   engine_launch(e, e->nr_threads);
@@ -3262,9 +3328,8 @@ void engine_step(struct engine *e) {
     gravity_exact_force_check(e->s, e, 1e-1);
 #endif
 
-  /* Let's trigger a rebuild every-so-often for good measure */  // MATTHIEU
-                                                                 // improve
-  if (!(e->policy & engine_policy_hydro) &&
+  /* Let's trigger a rebuild every-so-often for good measure */
+  if (!(e->policy & engine_policy_hydro) &&  // MATTHIEU improve this
       (e->policy & engine_policy_self_gravity) && e->step % 20 == 0)
     e->forcerebuild = 1;
 
@@ -3447,6 +3512,39 @@ void engine_drift_top_multipoles(struct engine *e) {
             clocks_getunit());
 }
 
+void engine_do_reconstruct_multipoles_mapper(void *map_data, int num_elements,
+                                             void *extra_data) {
+
+  struct engine *e = (struct engine *)extra_data;
+  struct cell *cells = (struct cell *)map_data;
+
+  for (int ind = 0; ind < num_elements; ind++) {
+    struct cell *c = &cells[ind];
+    if (c != NULL && c->nodeID == e->nodeID) {
+
+      /* Construct the multipoles in this cell hierarchy */
+      cell_make_multipoles(c, e->ti_current);
+    }
+  }
+}
+
+/**
+ * @brief Reconstruct all the multipoles at all the levels in the tree.
+ *
+ * @param e The #engine.
+ */
+void engine_reconstruct_multipoles(struct engine *e) {
+
+  const ticks tic = getticks();
+
+  threadpool_map(&e->threadpool, engine_do_reconstruct_multipoles_mapper,
+                 e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 10, e);
+
+  if (e->verbose)
+    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
+            clocks_getunit());
+}
+
 /**
  * @brief Create and fill the proxies.
  *
@@ -4166,11 +4264,13 @@ void engine_init(struct engine *e, struct space *s,
       e->runners[k].qid = k * nr_queues / e->nr_threads;
     }
 
+#ifdef WITH_VECTORIZATION
     /* Allocate particle caches. */
     e->runners[k].ci_cache.count = 0;
     e->runners[k].cj_cache.count = 0;
     cache_init(&e->runners[k].ci_cache, CACHE_SIZE);
     cache_init(&e->runners[k].cj_cache, CACHE_SIZE);
+#endif
 
     if (verbose) {
       if (with_aff)
@@ -4257,8 +4357,10 @@ void engine_compute_next_snapshot_time(struct engine *e) {
  */
 void engine_clean(struct engine *e) {
 
+#ifdef WITH_VECTORIZATION
   for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].ci_cache);
   for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].cj_cache);
+#endif
   free(e->runners);
   free(e->snapshotUnits);
   free(e->links);
diff --git a/src/engine.h b/src/engine.h
index 22ee122bb082895131584942bde50509952b98ff..e62b12332d3ac1b985b8f6d7181ea66824ec4f13 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -66,9 +66,10 @@ enum engine_policy {
   engine_policy_external_gravity = (1 << 9),
   engine_policy_cosmology = (1 << 10),
   engine_policy_drift_all = (1 << 11),
-  engine_policy_cooling = (1 << 12),
-  engine_policy_sourceterms = (1 << 13),
-  engine_policy_stars = (1 << 14)
+  engine_policy_reconstruct_mpoles = (1 << 12),
+  engine_policy_cooling = (1 << 13),
+  engine_policy_sourceterms = (1 << 14),
+  engine_policy_stars = (1 << 15)
 };
 
 extern const char *engine_policy_names[];
@@ -256,6 +257,7 @@ void engine_compute_next_snapshot_time(struct engine *e);
 void engine_unskip(struct engine *e);
 void engine_drift_all(struct engine *e);
 void engine_drift_top_multipoles(struct engine *e);
+void engine_reconstruct_multipoles(struct engine *e);
 void engine_dump_snapshot(struct engine *e);
 void engine_init(struct engine *e, struct space *s,
                  const struct swift_params *params, int nr_nodes, int nodeID,
diff --git a/src/gravity.c b/src/gravity.c
index 405e52cc116416c27ec236340a0358443e255384..97b2955b32e1513c3d86d1d1f4da2169130feb77 100644
--- a/src/gravity.c
+++ b/src/gravity.c
@@ -32,6 +32,13 @@
 #include "error.h"
 #include "version.h"
 
+struct exact_force_data {
+  const struct engine *e;
+  const struct space *s;
+  int counter_global;
+  double const_G;
+};
+
 /**
  * @brief Checks whether the file containing the exact accelerations for
  * the current choice of parameters already exists.
@@ -83,32 +90,23 @@ int gravity_exact_force_file_exits(const struct engine *e) {
 }
 
 /**
- * @brief Run a brute-force gravity calculation for a subset of particles.
- *
- * All gpart with ID modulo SWIFT_GRAVITY_FORCE_CHECKS will get their forces
- * computed.
- *
- * @param s The #space to use.
- * @param e The #engine (to access the current time).
+ * @brief Mapper function for the exact gravity calculation.
  */
-void gravity_exact_force_compute(struct space *s, const struct engine *e) {
-
+void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts,
+                                        void *extra_data) {
 #ifdef SWIFT_GRAVITY_FORCE_CHECKS
 
-  const ticks tic = getticks();
-  const double const_G = e->physical_constants->const_newton_G;
+  /* Unpack the data */
+  struct gpart *restrict gparts = (struct gpart *)map_data;
+  struct exact_force_data *data = (struct exact_force_data *)extra_data;
+  const struct space *s = data->s;
+  const struct engine *e = data->e;
+  const double const_G = data->const_G;
   int counter = 0;
 
-  /* Let's start by checking whether we already computed these forces */
-  if (gravity_exact_force_file_exits(e)) {
-    message("Exact accelerations already computed. Skipping calculation.");
-    return;
-  }
-
-  /* No matching file present ? Do it then */
-  for (size_t i = 0; i < s->nr_gparts; ++i) {
+  for (int i = 0; i < nr_gparts; ++i) {
 
-    struct gpart *gpi = &s->gparts[i];
+    struct gpart *gpi = &gparts[i];
 
     /* Is the particle active and part of the subset to be tested ? */
     if (gpi->id_or_neg_offset % SWIFT_GRAVITY_FORCE_CHECKS == 0 &&
@@ -118,13 +116,13 @@ void gravity_exact_force_compute(struct space *s, const struct engine *e) {
       double a_grav[3] = {0., 0., 0.};
 
       /* Interact it with all other particles in the space.*/
-      for (size_t j = 0; j < s->nr_gparts; ++j) {
-
-        /* No self interaction */
-        if (i == j) continue;
+      for (int j = 0; j < (int)s->nr_gparts; ++j) {
 
         struct gpart *gpj = &s->gparts[j];
 
+        /* No self interaction */
+        if (gpi == gpj) continue;
+
         /* Compute the pairwise distance. */
         const double dx[3] = {gpi->x[0] - gpj->x[0],   // x
                               gpi->x[1] - gpj->x[1],   // y
@@ -173,9 +171,47 @@ void gravity_exact_force_compute(struct space *s, const struct engine *e) {
       counter++;
     }
   }
+  atomic_add(&data->counter_global, counter);
 
-  message("Computed exact gravity for %d gparts (took %.3f %s). ", counter,
-          clocks_from_ticks(getticks() - tic), clocks_getunit());
+#else
+  error("Gravity checking function called without the corresponding flag.");
+#endif
+}
+
+/**
+ * @brief Run a brute-force gravity calculation for a subset of particles.
+ *
+ * All gpart with ID modulo SWIFT_GRAVITY_FORCE_CHECKS will get their forces
+ * computed.
+ *
+ * @param s The #space to use.
+ * @param e The #engine (to access the current time).
+ */
+void gravity_exact_force_compute(struct space *s, const struct engine *e) {
+
+#ifdef SWIFT_GRAVITY_FORCE_CHECKS
+
+  const ticks tic = getticks();
+
+  /* Let's start by checking whether we already computed these forces */
+  if (gravity_exact_force_file_exits(e)) {
+    message("Exact accelerations already computed. Skipping calculation.");
+    return;
+  }
+
+  /* No matching file present ? Do it then */
+  struct exact_force_data data;
+  data.e = e;
+  data.s = s;
+  data.counter_global = 0;
+  data.const_G = e->physical_constants->const_newton_G;
+
+  threadpool_map(&s->e->threadpool, gravity_exact_force_compute_mapper,
+                 s->gparts, s->nr_gparts, sizeof(struct gpart), 1000, &data);
+
+  message("Computed exact gravity for %d gparts (took %.3f %s). ",
+          data.counter_global, clocks_from_ticks(getticks() - tic),
+          clocks_getunit());
 
 #else
   error("Gravity checking function called without the corresponding flag.");
diff --git a/src/gravity/Default/gravity.h b/src/gravity/Default/gravity.h
index e3487ecd23e01b83e711dc07a6c97fd5ecf6a50d..1eee6e678288a209b669c46f7c87fbb5c399b6c7 100644
--- a/src/gravity/Default/gravity.h
+++ b/src/gravity/Default/gravity.h
@@ -24,6 +24,39 @@
 #include "gravity_properties.h"
 #include "minmax.h"
 
+/**
+ * @brief Returns the mass of a particle
+ *
+ * @param gp The particle of interest
+ */
+__attribute__((always_inline)) INLINE static float gravity_get_mass(
+    const struct gpart* restrict gp) {
+
+  return gp->mass;
+}
+
+/**
+ * @brief Returns the softening of a particle
+ *
+ * @param gp The particle of interest
+ */
+__attribute__((always_inline)) INLINE static float gravity_get_softening(
+    const struct gpart* restrict gp) {
+
+  return gp->epsilon;
+}
+
+/**
+ * @brief Returns the potential of a particle
+ *
+ * @param gp The particle of interest
+ */
+__attribute__((always_inline)) INLINE static float gravity_get_potential(
+    const struct gpart* restrict gp) {
+
+  return gp->potential;
+}
+
 /**
  * @brief Computes the gravity time-step of a given particle due to self-gravity
  *
@@ -42,8 +75,10 @@ gravity_compute_timestep_self(const struct gpart* const gp,
 
   const float ac_inv = (ac2 > 0.f) ? 1.f / sqrtf(ac2) : FLT_MAX;
 
-  /* Note that 0.714285714 = 2. (from Gadget) / 2.8 (Plummer softening) */
-  const float dt = sqrtf(0.714285714f * grav_props->eta * gp->epsilon * ac_inv);
+  const float epsilon = gravity_get_softening(gp);
+
+  /* Note that 0.66666667 = 2. (from Gadget) / 3. (Plummer softening) */
+  const float dt = sqrtf(0.66666667f * grav_props->eta * epsilon * ac_inv);
 
   return dt;
 }
@@ -63,6 +98,7 @@ __attribute__((always_inline)) INLINE static void gravity_init_gpart(
   gp->a_grav[0] = 0.f;
   gp->a_grav[1] = 0.f;
   gp->a_grav[2] = 0.f;
+  gp->potential = 0.f;
 
 #ifdef SWIFT_DEBUG_CHECKS
   gp->num_interacted = 0;
@@ -130,8 +166,8 @@ __attribute__((always_inline)) INLINE static void gravity_first_init_gpart(
 __attribute__((always_inline)) INLINE static void gravity_init_softening(
     struct gpart* gp, const struct gravity_props* grav_props) {
 
-  /* Note 2.8 is the Plummer-equivalent correction */
-  gp->epsilon = 2.8f * grav_props->epsilon;
+  /* Note 3 is the Plummer-equivalent correction */
+  gp->epsilon = grav_props->epsilon;
 }
 
 #endif /* SWIFT_DEFAULT_GRAVITY_H */
diff --git a/src/gravity/Default/gravity_part.h b/src/gravity/Default/gravity_part.h
index bb1307a1a2fc1dcd71202e3426c99f3e30c0de9a..33c444e24e4bc9681dc3138298b5fd9301d084d3 100644
--- a/src/gravity/Default/gravity_part.h
+++ b/src/gravity/Default/gravity_part.h
@@ -41,6 +41,9 @@ struct gpart {
   /* Particle mass. */
   float mass;
 
+  /* Gravitational potential */
+  float potential;
+
   /* Softening length */
   float epsilon;
 
diff --git a/src/gravity_derivatives.h b/src/gravity_derivatives.h
index 337ae67f4620677e7a3dfdd1b0c527eb016504e2..8c8379f74f5fc67d3671f0154b2aeacbc35ea9f1 100644
--- a/src/gravity_derivatives.h
+++ b/src/gravity_derivatives.h
@@ -21,14 +21,14 @@
 
 /**
  * @file gravity_derivatives.h
- * @brief Derivatives (up to 3rd order) of the gravitational potential.
+ * @brief Derivatives (up to 5th order) of the gravitational potential.
  *
  * We use the notation of Dehnen, Computational Astrophysics and Cosmology,
  * 1, 1, pp. 24 (2014), arXiv:1405.2255
  */
 
-/* Some standard headers. */
-#include <math.h>
+/* Config parameters. */
+#include "../config.h"
 
 /* Local headers. */
 #include "inline.h"
diff --git a/src/gravity_properties.c b/src/gravity_properties.c
index f52029fa1543ad1f8d0121c8c4e6d362227f4c53..7b9b8cd7c35f8fa9b21ff34ce2589b5d45ce8393 100644
--- a/src/gravity_properties.c
+++ b/src/gravity_properties.c
@@ -52,23 +52,25 @@ void gravity_props_init(struct gravity_props *p,
   p->theta_crit_inv = 1. / p->theta_crit;
 
   /* Softening lengths */
-  p->epsilon = parser_get_param_double(params, "Gravity:epsilon");
+  p->epsilon = 3. * parser_get_param_double(params, "Gravity:epsilon");
   p->epsilon2 = p->epsilon * p->epsilon;
   p->epsilon_inv = 1. / p->epsilon;
 }
 
 void gravity_props_print(const struct gravity_props *p) {
 
-  message("Self-gravity scheme: FMM-MM");
+  message("Self-gravity scheme: FMM-MM with m-poles of order %d",
+          SELF_GRAVITY_MULTIPOLE_ORDER);
 
   message("Self-gravity time integration: eta=%.4f", p->eta);
 
   message("Self-gravity opening angle:  theta=%.4f", p->theta_crit);
 
-  message("Self-gravity softening:    epsilon=%.4f", p->epsilon);
+  message("Self-gravity softening:    epsilon=%.4f (Plummer equivalent: %.4f)",
+          p->epsilon, p->epsilon / 3.);
 
   if (p->a_smooth != gravity_props_default_a_smooth)
-    message("Self-gravity smoothing-scale: a_smooth=%f", p->a_smooth);
+    message("Self-gravity MM smoothing-scale: a_smooth=%f", p->a_smooth);
 
   if (p->r_cut != gravity_props_default_r_cut)
     message("Self-gravity MM cut-off: r_cut=%f", p->r_cut);
@@ -80,7 +82,10 @@ void gravity_props_print_snapshot(hid_t h_grpgrav,
 
   io_write_attribute_f(h_grpgrav, "Time integration eta", p->eta);
   io_write_attribute_f(h_grpgrav, "Softening length", p->epsilon);
+  io_write_attribute_f(h_grpgrav, "Softening length (Plummer equivalent)",
+                       p->epsilon / 3.);
   io_write_attribute_f(h_grpgrav, "Opening angle", p->theta_crit);
+  io_write_attribute_d(h_grpgrav, "MM order", SELF_GRAVITY_MULTIPOLE_ORDER);
   io_write_attribute_f(h_grpgrav, "MM a_smooth", p->a_smooth);
   io_write_attribute_f(h_grpgrav, "MM r_cut", p->r_cut);
 }
diff --git a/src/gravity_softened_derivatives.h b/src/gravity_softened_derivatives.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f92476dab5940765b112708a867d940d4d5e6e9
--- /dev/null
+++ b/src/gravity_softened_derivatives.h
@@ -0,0 +1,443 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_GRAVITY_SOFTENED_DERIVATIVE_H
+#define SWIFT_GRAVITY_SOFTENED_DERIVATIVE_H
+
+/**
+ * @file gravity_softened_derivatives.h
+ * @brief Derivatives of the softened gravitational potential.
+ *
+ * We use the notation of Dehnen, Computational Astrophysics and Cosmology,
+ * 1, 1, pp. 24 (2014), arXiv:1405.2255
+ */
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Local headers. */
+#include "inline.h"
+#include "kernel_gravity.h"
+
+/*************************/
+/* 0th order derivatives */
+/*************************/
+
+/**
+ * @brief \f$ \phi(r_x, r_y, r_z, h) \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_000(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  return eps_inv * D_soft_0(u);
+}
+
+/*************************/
+/* 1st order derivatives */
+/*************************/
+
+/**
+ * @brief \f$ \frac{\partial\phi(r_x, r_y, r_z, h)}{\partial r_x} \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_100(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  return -r_x * eps_inv * eps_inv * eps_inv * D_soft_1(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial\phi(r_x, r_y, r_z, h)}{\partial r_x} \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_010(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  return -r_y * eps_inv * eps_inv * eps_inv * D_soft_1(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial\phi(r_x, r_y, r_z, h)}{\partial r_x} \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_001(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  return -r_z * eps_inv * eps_inv * eps_inv * D_soft_1(u);
+}
+
+/*************************/
+/* 2nd order derivatives */
+/*************************/
+
+/**
+ * @brief \f$ \frac{\partial^2\phi(r_x, r_y, r_z, h)}{\partial r_x^2} \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_200(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv3 = eps_inv * eps_inv2;
+  const double eps_inv5 = eps_inv3 * eps_inv2;
+  return r_x * r_x * eps_inv5 * D_soft_2(u) - eps_inv3 * D_soft_1(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^2\phi(r_x, r_y, r_z, h)}{\partial r_y^2} \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_020(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv3 = eps_inv * eps_inv2;
+  const double eps_inv5 = eps_inv3 * eps_inv2;
+  return r_y * r_y * eps_inv5 * D_soft_2(u) - eps_inv3 * D_soft_1(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^2\phi(r_x, r_y, r_z, h)}{\partial r_z^2} \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_002(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv3 = eps_inv * eps_inv2;
+  const double eps_inv5 = eps_inv3 * eps_inv2;
+  return r_z * r_z * eps_inv5 * D_soft_2(u) - eps_inv3 * D_soft_1(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^2\phi(r_x, r_y, r_z, h)}{\partial r_x\partial r_y}
+ * \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_110(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  return r_x * r_y * eps_inv5 * D_soft_2(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^2\phi(r_x, r_y, r_z, h)}{\partial r_x\partial r_z}
+ * \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_101(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  return r_x * r_z * eps_inv5 * D_soft_2(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^2\phi(r_x, r_y, r_z, h)}{\partial r_y\partial r_z}
+ * \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_011(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  return r_y * r_z * eps_inv5 * D_soft_2(u);
+}
+
+/*************************/
+/* 3rd order derivatives */
+/*************************/
+
+/**
+ * @brief \f$ \frac{\partial^3\phi(r_x, r_y, r_z, h)}{\partial r_x^3} \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_300(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  const double eps_inv7 = eps_inv5 * eps_inv2;
+  return -r_x * r_x * r_x * eps_inv7 * D_soft_3(u) +
+         3. * r_x * eps_inv5 * D_soft_2(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^3\phi(r_x, r_y, r_z, h)}{\partial r_y^3} \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_030(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  const double eps_inv7 = eps_inv5 * eps_inv2;
+  return -r_y * r_y * r_y * eps_inv7 * D_soft_3(u) +
+         3. * r_y * eps_inv5 * D_soft_2(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^3\phi(r_x, r_y, r_z, h)}{\partial r_z^3} \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_003(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  const double eps_inv7 = eps_inv5 * eps_inv2;
+  return -r_z * r_z * r_z * eps_inv7 * D_soft_3(u) +
+         3. * r_z * eps_inv5 * D_soft_2(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^3\phi(r_x, r_y, r_z, h)}{\partial r_x^2\partial
+ * r_y}
+ * \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_210(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  const double eps_inv7 = eps_inv5 * eps_inv2;
+  return -r_x * r_x * r_y * eps_inv7 * D_soft_3(u) +
+         r_y * eps_inv5 * D_soft_2(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^3\phi(r_x, r_y, r_z, h)}{\partial r_x^2\partial
+ * r_z}
+ * \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_201(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  const double eps_inv7 = eps_inv5 * eps_inv2;
+  return -r_x * r_x * r_z * eps_inv7 * D_soft_3(u) +
+         r_z * eps_inv5 * D_soft_2(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^3\phi(r_x, r_y, r_z, h)}{\partial r_x\partial
+ * r_y^2}
+ * \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_120(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  const double eps_inv7 = eps_inv5 * eps_inv2;
+  return -r_x * r_y * r_y * eps_inv7 * D_soft_3(u) +
+         r_x * eps_inv5 * D_soft_2(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^3\phi(r_x, r_y, r_z, h)}{\partial r_y^2\partial
+ * r_z}
+ * \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_021(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  const double eps_inv7 = eps_inv5 * eps_inv2;
+  return -r_y * r_y * r_z * eps_inv7 * D_soft_3(u) +
+         r_z * eps_inv5 * D_soft_2(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^3\phi(r_x, r_y, r_z, h)}{\partial r_x\partial
+ * r_z^2}
+ * \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_102(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  const double eps_inv7 = eps_inv5 * eps_inv2;
+  return -r_x * r_z * r_z * eps_inv7 * D_soft_3(u) +
+         r_x * eps_inv5 * D_soft_2(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^3\phi(r_x, r_y, r_z, h)}{\partial r_y\partial
+ * r_z^2}
+ * \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_012(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv5 = eps_inv2 * eps_inv2 * eps_inv;
+  const double eps_inv7 = eps_inv5 * eps_inv2;
+  return -r_y * r_z * r_z * eps_inv7 * D_soft_3(u) +
+         r_y * eps_inv5 * D_soft_2(u);
+}
+
+/**
+ * @brief \f$ \frac{\partial^3\phi(r_x, r_y, r_z, h)}{\partial r_z\partial
+ * r_y\partial r_z} \f$.
+ *
+ * @param r_x x-coordinate of the distance vector (\f$ r_x \f$).
+ * @param r_y y-coordinate of the distance vector (\f$ r_y \f$).
+ * @param r_z z-coordinate of the distance vector (\f$ r_z \f$).
+ * @param r Norm of the distance vector (\f$ |r| \f$).
+ * @param eps_inv Inverse of the softening length (\f$ 1/h \f$).
+ */
+__attribute__((always_inline)) INLINE static double D_soft_111(
+    double r_x, double r_y, double r_z, double r, double eps_inv) {
+
+  const double u = r * eps_inv;
+  const double eps_inv2 = eps_inv * eps_inv;
+  const double eps_inv4 = eps_inv2 * eps_inv2;
+  const double eps_inv7 = eps_inv4 * eps_inv2 * eps_inv;
+  return -r_x * r_y * r_z * eps_inv7 * D_soft_3(u);
+}
+
+#endif /* SWIFT_GRAVITY_SOFTENED_DERIVATIVE_H */
diff --git a/src/hydro/Default/hydro_part.h b/src/hydro/Default/hydro_part.h
index 332eecb27fb65a6b4da48cbb595450a432c44615..8b14c45614e4c09c48056c9398ed4bfe3ed90e38 100644
--- a/src/hydro/Default/hydro_part.h
+++ b/src/hydro/Default/hydro_part.h
@@ -27,6 +27,9 @@ struct xpart {
   /* Offset between current position and position at last tree rebuild. */
   float x_diff[3];
 
+  /*! Offset between the current position and position at the last sort. */
+  float x_diff_sort[3];
+
   /* Velocity at the last full step. */
   float v_full[3];
 
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index 8a2571e6535ff988577d7d2c6e6c7baa98194aca..585b9394dd3ad70dfd9a518bdee0b134d46e5584 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -151,7 +151,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density(
   for (k = 0; k < 3; k++)
     dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
 #else
-  error("Unknown vector size.")
+  error("Unknown vector size.");
 #endif
 
   /* Get the radius and inverse radius. */
@@ -318,7 +318,7 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
   for (k = 0; k < 3; k++)
     dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
 #else
-  error("Unknown vector size.")
+  error("Unknown vector size.");
 #endif
 
   /* Get the radius and inverse radius. */
@@ -381,12 +381,14 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
  * (non-symmetric vectorized version).
  */
 __attribute__((always_inline)) INLINE static void
-runner_iact_nonsym_1_vec_density(
-    vector *r2, vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix,
-    vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
-    vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
-    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
-    vector mask, int knlMask) {
+runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
+                                 vector hi_inv, vector vix, vector viy,
+                                 vector viz, float *Vjx, float *Vjy, float *Vjz,
+                                 float *Mj, vector *rhoSum, vector *rho_dhSum,
+                                 vector *wcountSum, vector *wcount_dhSum,
+                                 vector *div_vSum, vector *curlvxSum,
+                                 vector *curlvySum, vector *curlvzSum,
+                                 vector mask, int knlMask) {
 
   vector r, ri, xi, wi, wi_dx;
   vector mj;
@@ -843,7 +845,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
               vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
                       pj[2]->force.balsara, pj[3]->force.balsara);
 #else
-  error("Unknown vector size.")
+  error("Unknown vector size.");
 #endif
 
   /* Get the radius and inverse radius. */
@@ -1122,7 +1124,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
               vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
                       pj[2]->force.balsara, pj[3]->force.balsara);
 #else
-  error("Unknown vector size.")
+  error("Unknown vector size.");
 #endif
 
   /* Get the radius and inverse radius. */
diff --git a/src/hydro/Gadget2/hydro_part.h b/src/hydro/Gadget2/hydro_part.h
index 69ae79666e1db4e4f405c653cfc533606989a73a..571aaf39ed2509d95e9f68c34ab8e6c09ded3fbb 100644
--- a/src/hydro/Gadget2/hydro_part.h
+++ b/src/hydro/Gadget2/hydro_part.h
@@ -39,6 +39,9 @@ struct xpart {
   /* Offset between current position and position at last tree rebuild. */
   float x_diff[3];
 
+  /* Offset between the current position and position at the last sort. */
+  float x_diff_sort[3];
+
   /* Velocity at the last full step. */
   float v_full[3];
 
diff --git a/src/hydro/Gizmo/hydro_part.h b/src/hydro/Gizmo/hydro_part.h
index 29cb6b00190f69e03c3d038ee485519969a8c6e9..d552a3f7e86031311098293845f1aa11270c417f 100644
--- a/src/hydro/Gizmo/hydro_part.h
+++ b/src/hydro/Gizmo/hydro_part.h
@@ -27,6 +27,9 @@ struct xpart {
   /* Offset between current position and position at last tree rebuild. */
   float x_diff[3];
 
+  /* Offset between the current position and position at the last sort. */
+  float x_diff_sort[3];
+
   /* Velocity at the last full step. */
   float v_full[3];
 
diff --git a/src/hydro/Minimal/hydro_part.h b/src/hydro/Minimal/hydro_part.h
index dabae1a546d66f61db4f9796c21b71817ca20aac..e9289c099a8a4ee698b016036e4e3d4dad481768 100644
--- a/src/hydro/Minimal/hydro_part.h
+++ b/src/hydro/Minimal/hydro_part.h
@@ -46,6 +46,9 @@ struct xpart {
   /*! Offset between current position and position at last tree rebuild. */
   float x_diff[3];
 
+  /*! Offset between the current position and position at the last sort. */
+  float x_diff_sort[3];
+
   /*! Velocity at the last full step. */
   float v_full[3];
 
diff --git a/src/hydro/PressureEntropy/hydro_part.h b/src/hydro/PressureEntropy/hydro_part.h
index b6e496918fa0e7989a8bddcfc5e8ea6b332c338e..6cb6fe87393e376dc189150286079faa9f41cb68 100644
--- a/src/hydro/PressureEntropy/hydro_part.h
+++ b/src/hydro/PressureEntropy/hydro_part.h
@@ -38,6 +38,9 @@ struct xpart {
   /*! Offset between current position and position at last tree rebuild. */
   float x_diff[3];
 
+  /*! Offset between the current position and position at the last sort. */
+  float x_diff_sort[3];
+
   /*! Velocity at the last full step. */
   float v_full[3];
 
diff --git a/src/hydro/Shadowswift/hydro_part.h b/src/hydro/Shadowswift/hydro_part.h
index 43237d9c80a5dfa5bed2fe409281b4f89b6aa172..e25400e905b893d13ffb552da42d3fbf96d71fde 100644
--- a/src/hydro/Shadowswift/hydro_part.h
+++ b/src/hydro/Shadowswift/hydro_part.h
@@ -28,6 +28,9 @@ struct xpart {
   /* Offset between current position and position at last tree rebuild. */
   float x_diff[3];
 
+  /*! Offset between the current position and position at the last sort. */
+  float x_diff_sort[3];
+
   /* Velocity at the last full step. */
   float v_full[3];
 
diff --git a/src/inline.h b/src/inline.h
index c4dd9d59becabf54895b60cf3ac7ba809ac150c5..538f6f520c9d6187dc02f1cbd5d59480aac7bdb2 100644
--- a/src/inline.h
+++ b/src/inline.h
@@ -20,6 +20,9 @@
 #ifndef SWIFT_INLINE_H
 #define SWIFT_INLINE_H
 
+/* Config parameters. */
+#include "../config.h"
+
 /**
  * @brief Defines inline
  */
diff --git a/src/kernel_gravity.h b/src/kernel_gravity.h
index 4a7588d312c381ef60fb97c43f8fadb1e03dfead..5a9e839b63422a3f18c80caf9d891dd6f8be5da6 100644
--- a/src/kernel_gravity.h
+++ b/src/kernel_gravity.h
@@ -23,113 +23,94 @@
 #include <math.h>
 
 /* Includes. */
-#include "const.h"
 #include "inline.h"
 #include "minmax.h"
-#include "vector.h"
-
-/* The gravity kernel is defined as a degree 6 polynomial in the distance
-   r. The resulting value should be post-multiplied with r^-3, resulting
-   in a polynomial with terms ranging from r^-3 to r^3, which are
-   sufficient to model both the direct potential as well as the splines
-   near the origin.
-   As in the hydro case, the 1/h^3 needs to be multiplied in afterwards */
-
-/* Coefficients for the kernel. */
-#define kernel_grav_name "Gadget-2 softening kernel"
-#define kernel_grav_degree 6 /* Degree of the polynomial */
-#define kernel_grav_ivals 2  /* Number of branches */
-static const float
-    kernel_grav_coeffs[(kernel_grav_degree + 1) * (kernel_grav_ivals + 1)]
-    __attribute__((aligned(16))) = {32.f,
-                                    -38.4f,
-                                    0.f,
-                                    10.66666667f,
-                                    0.f,
-                                    0.f,
-                                    0.f, /* 0 < u < 0.5 */
-                                    -10.66666667f,
-                                    38.4f,
-                                    -48.f,
-                                    21.3333333f,
-                                    0.f,
-                                    0.f,
-                                    -0.066666667f, /* 0.5 < u < 1 */
-                                    0.f,
-                                    0.f,
-                                    0.f,
-                                    0.f,
-                                    0.f,
-                                    0.f,
-                                    0.f}; /* 1 < u */
 
 /**
  * @brief Computes the gravity softening function.
  *
+ * This functions assumes 0 < u < 1.
+ *
  * @param u The ratio of the distance to the softening length $u = x/h$.
  * @param W (return) The value of the kernel function $W(x,h)$.
  */
 __attribute__((always_inline)) INLINE static void kernel_grav_eval(
     float u, float *const W) {
 
-  /* Pick the correct branch of the kernel */
-  const int ind = (int)min(u * (float)kernel_grav_ivals, kernel_grav_ivals);
-  const float *const coeffs =
-      &kernel_grav_coeffs[ind * (kernel_grav_degree + 1)];
-
-  /* First two terms of the polynomial ... */
-  float w = coeffs[0] * u + coeffs[1];
-
-  /* ... and the rest of them */
-  for (int k = 2; k <= kernel_grav_degree; k++) w = u * w + coeffs[k];
-
-  /* Return everything */
-  *W = w / (u * u * u);
+  /* W(u) = 21u^5 - 90u^4 + 140u^3 - 84u^2 + 14 */
+  *W = 21.f * u - 90.f;
+  *W = *W * u + 140.f;
+  *W = *W * u - 84.f;
+  *W = *W * u;
+  *W = *W * u + 14.f;
 }
 
 #ifdef SWIFT_GRAVITY_FORCE_CHECKS
 
+/**
+ * @brief Computes the gravity softening function in double precision.
+ *
+ * This functions assumes 0 < u < 1.
+ *
+ * @param u The ratio of the distance to the softening length $u = x/h$.
+ * @param W (return) The value of the kernel function $W(x,h)$.
+ */
 __attribute__((always_inline)) INLINE static void kernel_grav_eval_double(
     double u, double *const W) {
 
-  static const double kernel_grav_coeffs_double[(kernel_grav_degree + 1) *
-                                                (kernel_grav_ivals + 1)]
-      __attribute__((aligned(16))) = {32.,
-                                      -38.4,
-                                      0.,
-                                      10.66666667,
-                                      0.,
-                                      0.,
-                                      0., /* 0 < u < 0.5 */
-                                      -10.66666667,
-                                      38.4,
-                                      -48.,
-                                      21.3333333,
-                                      0.,
-                                      0.,
-                                      -0.066666667, /* 0.5 < u < 1 */
-                                      0.,
-                                      0.,
-                                      0.,
-                                      0.,
-                                      0.,
-                                      0.,
-                                      0.}; /* 1 < u */
-
-  /* Pick the correct branch of the kernel */
-  const int ind = (int)min(u * (double)kernel_grav_ivals, kernel_grav_ivals);
-  const double *const coeffs =
-      &kernel_grav_coeffs_double[ind * (kernel_grav_degree + 1)];
-
-  /* First two terms of the polynomial ... */
-  double w = coeffs[0] * u + coeffs[1];
-
-  /* ... and the rest of them */
-  for (int k = 2; k <= kernel_grav_degree; k++) w = u * w + coeffs[k];
-
-  /* Return everything */
-  *W = w / (u * u * u);
+  /* W(u) = 21u^5 - 90u^4 + 140u^3 - 84u^2 + 14 */
+  *W = 21. * u - 90.;
+  *W = *W * u + 140.;
+  *W = *W * u - 84.;
+  *W = *W * u;
+  *W = *W * u + 14.;
 }
 #endif /* SWIFT_GRAVITY_FORCE_CHECKS */
 
+/************************************************/
+/* Derivatives of softening kernel used for FMM */
+/************************************************/
+
+__attribute__((always_inline)) INLINE static double D_soft_0(double u) {
+
+  /* phi(u) = -3u^7 + 15u^6 - 28u^5 + 21u^4 - 7u^2 + 3 */
+  double phi = -3. * u + 15.;
+  phi = phi * u - 28.;
+  phi = phi * u + 21.;
+  phi = phi * u;
+  phi = phi * u - 7.;
+  phi = phi * u;
+  phi = phi * u + 3.;
+
+  return phi;
+}
+
+__attribute__((always_inline)) INLINE static double D_soft_1(double u) {
+
+  /* phi'(u)/u = 21u^5 - 90u^4 + 140u^3 - 84u^2 + 14 */
+  double phi = 21. * u - 90.;
+  phi = phi * u + 140.;
+  phi = phi * u - 84.;
+  phi = phi * u;
+  phi = phi * u + 14.;
+
+  return phi;
+}
+
+__attribute__((always_inline)) INLINE static double D_soft_2(double u) {
+
+  /* (phi'(u)/u)'/u = -105u^3 + 360u^2 - 420u + 168 */
+  double phi = -105. * u + 360.;
+  phi = phi * u - 420.;
+  phi = phi * u + 168.;
+
+  return phi;
+}
+
+__attribute__((always_inline)) INLINE static double D_soft_3(double u) {
+
+  /* ((phi'(u)/u)'/u)'/u = 315u - 720 + 420/u */
+  return 315. * u - 720. + 420. / u;
+}
+
 #endif /* SWIFT_KERNEL_GRAVITY_H */
diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h
index 585655940cf6f0587fd7dd1efae1bf546476b1da..f634a59d7ee769951e6560d46a92053c144cc766 100644
--- a/src/kernel_hydro.h
+++ b/src/kernel_hydro.h
@@ -327,7 +327,8 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
   const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
 
   /* First two terms of the polynomial ... */
-  float dw_dx = ((float)kernel_degree * coeffs[0] * x) + (float)(kernel_degree - 1) * coeffs[1];
+  float dw_dx = ((float)kernel_degree * coeffs[0] * x) +
+                (float)(kernel_degree - 1) * coeffs[1];
 
   /* ... and the rest of them */
   for (int k = 2; k < kernel_degree; k++) {
@@ -396,7 +397,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
       dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
 }
 
-/* Define constant vectors for the Wendland C2 and Cubic Spline kernel coefficients. */
+/* Define constant vectors for the Wendland C2 and Cubic Spline kernel
+ * coefficients. */
 #ifdef WENDLAND_C2_KERNEL
 static const vector wendland_const_c0 = FILL_VEC(4.f);
 static const vector wendland_const_c1 = FILL_VEC(-15.f);
@@ -469,37 +471,39 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
   vector mask_reg1, mask_reg2;
 
   /* Form a mask for each part of the kernel. */
-  mask_reg1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
-  mask_reg2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
+  mask_reg1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
+  mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
+  ;
 
-  /* Work out w for both regions of the kernel and combine the results together using masks. */
+  /* Work out w for both regions of the kernel and combine the results together
+   * using masks. */
 
   /* Init the iteration for Horner's scheme. */
   w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
   w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
   dw_dx->v = cubic_1_const_c0.v;
   dw_dx2.v = cubic_2_const_c0.v;
-  
+
   /* Calculate the polynomial interleaving vector operations. */
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
   dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
   w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
   w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
-  
+
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
   dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
   w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
   w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
-  
+
   /* Mask out unneeded values. */
-  w->v = vec_and(w->v,mask_reg1.v);
-  w2.v = vec_and(w2.v,mask_reg2.v);
-  dw_dx->v = vec_and(dw_dx->v,mask_reg1.v);
-  dw_dx2.v = vec_and(dw_dx2.v,mask_reg2.v);
+  w->v = vec_and(w->v, mask_reg1.v);
+  w2.v = vec_and(w2.v, mask_reg2.v);
+  dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
+  dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v);
 
   /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v,w2.v);
-  dw_dx->v = vec_add(dw_dx->v,dw_dx2.v);
+  w->v = vec_add(w->v, w2.v);
+  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
 #else
 #error
 #endif
@@ -556,7 +560,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
 
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
   dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
-  w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */
+  w->v = vec_mul(x.v, w->v);    /* wendland_const_c4 is zero. */
   w2->v = vec_mul(x2.v, w2->v); /* wendland_const_c4 is zero. */
 
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
@@ -579,12 +583,15 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
   vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
 
   /* Form a mask for each part of the kernel. */
-  mask_reg1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
-  mask_reg1_v2.v = vec_cmp_lt(x2.v,cond.v); /* 0 < x < 0.5 */
-  mask_reg2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
-  mask_reg2_v2.v = vec_cmp_gte(x2.v,cond.v); /* 0.5 < x < 1 */;
+  mask_reg1.v = vec_cmp_lt(x.v, cond.v);     /* 0 < x < 0.5 */
+  mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
+  mask_reg2.v = vec_cmp_gte(x.v, cond.v);    /* 0.5 < x < 1 */
+  ;
+  mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */
+  ;
 
-  /* Work out w for both regions of the kernel and combine the results together using masks. */
+  /* Work out w for both regions of the kernel and combine the results together
+   * using masks. */
 
   /* Init the iteration for Horner's scheme. */
   w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
@@ -595,13 +602,13 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
   dw_dx2->v = cubic_1_const_c0.v;
   dw_dx_2.v = cubic_2_const_c0.v;
   dw_dx2_2.v = cubic_2_const_c0.v;
-  
+
   /* Calculate the polynomial interleaving vector operations. */
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
   dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
   dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v);
   dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v);
-  w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
+  w->v = vec_mul(x.v, w->v);    /* cubic_1_const_c2 is zero. */
   w2->v = vec_mul(x2.v, w2->v); /* cubic_1_const_c2 is zero. */
   w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c2.v);
   w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c2.v);
@@ -616,20 +623,20 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
   w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v);
 
   /* Mask out unneeded values. */
-  w->v = vec_and(w->v,mask_reg1.v);
-  w2->v = vec_and(w2->v,mask_reg1_v2.v);
-  w_2.v = vec_and(w_2.v,mask_reg2.v);
-  w2_2.v = vec_and(w2_2.v,mask_reg2_v2.v);
-  dw_dx->v = vec_and(dw_dx->v,mask_reg1.v);
-  dw_dx2->v = vec_and(dw_dx2->v,mask_reg1_v2.v);
-  dw_dx_2.v = vec_and(dw_dx_2.v,mask_reg2.v);
-  dw_dx2_2.v = vec_and(dw_dx2_2.v,mask_reg2_v2.v);
+  w->v = vec_and(w->v, mask_reg1.v);
+  w2->v = vec_and(w2->v, mask_reg1_v2.v);
+  w_2.v = vec_and(w_2.v, mask_reg2.v);
+  w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v);
+  dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
+  dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v);
+  dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v);
+  dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v);
 
   /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v,w_2.v);
-  w2->v = vec_add(w2->v,w2_2.v);
-  dw_dx->v = vec_add(dw_dx->v,dw_dx_2.v);
-  dw_dx2->v = vec_add(dw_dx2->v,dw_dx2_2.v);
+  w->v = vec_add(w->v, w_2.v);
+  w2->v = vec_add(w2->v, w2_2.v);
+  dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v);
+  dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v);
 
   /* Return everything */
   w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
@@ -651,8 +658,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
  * @param u The ratio of the distance to the smoothing length $u = x/h$.
  * @param w (return) The value of the kernel function $W(x,h)$.
  */
-__attribute__((always_inline)) INLINE static void kernel_eval_W_vec(
-    vector *u, vector *w) {
+__attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
+                                                                    vector *w) {
 
   /* Go to the range [0,1[ from [0,H[ */
   vector x;
@@ -672,28 +679,30 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(
   vector mask1, mask2;
 
   /* Form a mask for each part of the kernel. */
-  mask1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
-  mask2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
+  mask1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
+  mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
+  ;
 
-  /* Work out w for both regions of the kernel and combine the results together using masks. */
+  /* Work out w for both regions of the kernel and combine the results together
+   * using masks. */
 
   /* Init the iteration for Horner's scheme. */
   w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
   w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
-  
+
   /* Calculate the polynomial interleaving vector operations. */
   w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero */
   w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
-  
+
   w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
   w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
-  
+
   /* Mask out unneeded values. */
-  w->v = vec_and(w->v,mask1.v);
-  w2.v = vec_and(w2.v,mask2.v);
+  w->v = vec_and(w->v, mask1.v);
+  w2.v = vec_and(w2.v, mask2.v);
 
   /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v,w2.v);
+  w->v = vec_add(w->v, w2.v);
 #else
 #error
 #endif
@@ -721,39 +730,41 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
 
 #ifdef WENDLAND_C2_KERNEL
   /* Init the iteration for Horner's scheme. */
-  dw_dx->v = vec_fma(wendland_dwdx_const_c0.v,x.v,wendland_dwdx_const_c1.v);
+  dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
 
   /* Calculate the polynomial interleaving vector operations */
-  dw_dx->v = vec_fma(dw_dx->v,x.v,wendland_dwdx_const_c2.v);
+  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
 
-  dw_dx->v = vec_fma(dw_dx->v,x.v,wendland_dwdx_const_c3.v);
+  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
 
-  dw_dx->v = vec_mul(dw_dx->v,x.v);
+  dw_dx->v = vec_mul(dw_dx->v, x.v);
 
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector dw_dx2;
   vector mask1, mask2;
 
   /* Form a mask for each part of the kernel. */
-  mask1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
-  mask2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
+  mask1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
+  mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
+  ;
 
-  /* Work out w for both regions of the kernel and combine the results together using masks. */
+  /* Work out w for both regions of the kernel and combine the results together
+   * using masks. */
 
   /* Init the iteration for Horner's scheme. */
-  dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v,x.v,cubic_1_dwdx_const_c1.v);
-  dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v,x.v,cubic_2_dwdx_const_c1.v);
-  
+  dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
+  dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
+
   /* Calculate the polynomial interleaving vector operations. */
-  dw_dx->v = vec_mul(dw_dx->v,x.v); /* cubic_1_dwdx_const_c2 is zero. */
-  dw_dx2.v = vec_fma(dw_dx2.v,x.v,cubic_2_dwdx_const_c2.v);
-  
+  dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
+  dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
+
   /* Mask out unneeded values. */
-  dw_dx->v = vec_and(dw_dx->v,mask1.v);
-  dw_dx2.v = vec_and(dw_dx2.v,mask2.v);
+  dw_dx->v = vec_and(dw_dx->v, mask1.v);
+  dw_dx2.v = vec_and(dw_dx2.v, mask2.v);
 
   /* Added both dwdx and dwdx2 together to form complete result. */
-  dw_dx->v = vec_add(dw_dx->v,dw_dx2.v);
+  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
 #else
 #error
 #endif
diff --git a/src/multipole.h b/src/multipole.h
index a633554490fe6226f8fe42befb7cb4eaf1e5135f..b9d49dcf0fc3b605849f7b058aef14843b73517d 100644
--- a/src/multipole.h
+++ b/src/multipole.h
@@ -34,6 +34,7 @@
 #include "error.h"
 #include "gravity_derivatives.h"
 #include "gravity_properties.h"
+#include "gravity_softened_derivatives.h"
 #include "inline.h"
 #include "kernel_gravity.h"
 #include "minmax.h"
@@ -176,9 +177,15 @@ struct gravity_tensors {
       /*! Centre of mass of the matter dsitribution */
       double CoM[3];
 
+      /*! Centre of mass of the matter dsitribution at the last rebuild */
+      double CoM_rebuild[3];
+
       /*! Upper limit of the CoM<->gpart distance */
       double r_max;
 
+      /*! Upper limit of the CoM<->gpart distance at the last rebuild */
+      double r_max_rebuild;
+
       /*! Multipole mass */
       struct multipole m_pole;
 
@@ -1232,12 +1239,10 @@ INLINE static void gravity_P2M(struct gravity_tensors *m,
  * @param m_b The #multipole to shift.
  * @param pos_a The position to which m_b will be shifted.
  * @param pos_b The current postion of the multipole to shift.
- * @param periodic Is the calculation periodic ?
  */
 INLINE static void gravity_M2M(struct multipole *m_a,
                                const struct multipole *m_b,
-                               const double pos_a[3], const double pos_b[3],
-                               int periodic) {
+                               const double pos_a[3], const double pos_b[3]) {
   /* Shift bulk velocity */
   m_a->vel[0] = m_b->vel[0];
   m_a->vel[1] = m_b->vel[1];
@@ -1501,6 +1506,9 @@ INLINE static void gravity_M2L(struct grav_tensor *l_b,
                                const struct gravity_props *props,
                                int periodic) {
 
+  /* Recover some constants */
+  const double eps2 = props->epsilon2;
+
   /* Compute distance vector */
   const double dx =
       periodic ? box_wrap(pos_b[0] - pos_a[0], 0., 1.) : pos_b[0] - pos_a[0];
@@ -1519,7 +1527,7 @@ INLINE static void gravity_M2L(struct grav_tensor *l_b,
 #endif
 
   /* Un-softened case */
-  if (r2 > props->epsilon2) {
+  if (r2 > eps2) {
 
     /*  0th order term */
     l_b->F_000 += m_a->M_000 * D_000(dx, dy, dz, r_inv);
@@ -1618,7 +1626,7 @@ INLINE static void gravity_M2L(struct grav_tensor *l_b,
                   m_a->M_010 * D_021(dx, dy, dz, r_inv) +
                   m_a->M_001 * D_012(dx, dy, dz, r_inv);
 
-    /*  3rd order multipole term (addition to rank 2)*/
+    /*  3rd order multipole term (addition to rank 3)*/
     l_b->F_300 += m_a->M_000 * D_300(dx, dy, dz, r_inv);
     l_b->F_030 += m_a->M_000 * D_030(dx, dy, dz, r_inv);
     l_b->F_003 += m_a->M_000 * D_003(dx, dy, dz, r_inv);
@@ -2041,7 +2049,120 @@ INLINE static void gravity_M2L(struct grav_tensor *l_b,
 
     /* Softened case */
   } else {
-    message("Un-supported softened M2L interaction.");
+
+    const double eps_inv = props->epsilon_inv;
+    const double r = r2 * r_inv;
+
+    /*  0th order term */
+    l_b->F_000 += m_a->M_000 * D_soft_000(dx, dy, dz, r, eps_inv);
+
+#if SELF_GRAVITY_MULTIPOLE_ORDER > 0
+
+    /*  1st order multipole term (addition to rank 0)*/
+    l_b->F_000 += m_a->M_100 * D_soft_100(dx, dy, dz, r, eps_inv) +
+                  m_a->M_010 * D_soft_010(dx, dy, dz, r, eps_inv) +
+                  m_a->M_001 * D_soft_001(dx, dy, dz, r, eps_inv);
+
+    /*  1st order multipole term (addition to rank 1)*/
+    l_b->F_100 += m_a->M_000 * D_soft_100(dx, dy, dz, r, eps_inv);
+    l_b->F_010 += m_a->M_000 * D_soft_010(dx, dy, dz, r, eps_inv);
+    l_b->F_001 += m_a->M_000 * D_soft_001(dx, dy, dz, r, eps_inv);
+#endif
+#if SELF_GRAVITY_MULTIPOLE_ORDER > 1
+
+    /*  2nd order multipole term (addition to rank 0)*/
+    l_b->F_000 += m_a->M_200 * D_soft_200(dx, dy, dz, r, eps_inv) +
+                  m_a->M_020 * D_soft_020(dx, dy, dz, r, eps_inv) +
+                  m_a->M_002 * D_soft_002(dx, dy, dz, r, eps_inv);
+    l_b->F_000 += m_a->M_110 * D_soft_110(dx, dy, dz, r, eps_inv) +
+                  m_a->M_101 * D_soft_101(dx, dy, dz, r, eps_inv) +
+                  m_a->M_011 * D_soft_011(dx, dy, dz, r, eps_inv);
+
+    /*  2nd order multipole term (addition to rank 1)*/
+    l_b->F_100 += m_a->M_100 * D_soft_200(dx, dy, dz, r, eps_inv) +
+                  m_a->M_010 * D_soft_110(dx, dy, dz, r, eps_inv) +
+                  m_a->M_001 * D_soft_101(dx, dy, dz, r, eps_inv);
+    l_b->F_010 += m_a->M_100 * D_soft_110(dx, dy, dz, r, eps_inv) +
+                  m_a->M_010 * D_soft_020(dx, dy, dz, r, eps_inv) +
+                  m_a->M_001 * D_soft_011(dx, dy, dz, r, eps_inv);
+    l_b->F_001 += m_a->M_100 * D_soft_101(dx, dy, dz, r, eps_inv) +
+                  m_a->M_010 * D_soft_011(dx, dy, dz, r, eps_inv) +
+                  m_a->M_001 * D_soft_002(dx, dy, dz, r, eps_inv);
+
+    /*  2nd order multipole term (addition to rank 2)*/
+    l_b->F_200 += m_a->M_000 * D_soft_200(dx, dy, dz, r, eps_inv);
+    l_b->F_020 += m_a->M_000 * D_soft_020(dx, dy, dz, r, eps_inv);
+    l_b->F_002 += m_a->M_000 * D_soft_002(dx, dy, dz, r, eps_inv);
+    l_b->F_110 += m_a->M_000 * D_soft_110(dx, dy, dz, r, eps_inv);
+    l_b->F_101 += m_a->M_000 * D_soft_101(dx, dy, dz, r, eps_inv);
+    l_b->F_011 += m_a->M_000 * D_soft_011(dx, dy, dz, r, eps_inv);
+#endif
+#if SELF_GRAVITY_MULTIPOLE_ORDER > 2
+
+    /*  3rd order multipole term (addition to rank 0)*/
+    l_b->F_000 += m_a->M_300 * D_soft_300(dx, dy, dz, r, eps_inv) +
+                  m_a->M_030 * D_soft_030(dx, dy, dz, r, eps_inv) +
+                  m_a->M_003 * D_soft_003(dx, dy, dz, r, eps_inv);
+    l_b->F_000 += m_a->M_210 * D_soft_210(dx, dy, dz, r, eps_inv) +
+                  m_a->M_201 * D_soft_201(dx, dy, dz, r, eps_inv) +
+                  m_a->M_120 * D_soft_120(dx, dy, dz, r, eps_inv);
+    l_b->F_000 += m_a->M_021 * D_soft_021(dx, dy, dz, r, eps_inv) +
+                  m_a->M_102 * D_soft_102(dx, dy, dz, r, eps_inv) +
+                  m_a->M_012 * D_soft_012(dx, dy, dz, r, eps_inv);
+    l_b->F_000 += m_a->M_111 * D_soft_111(dx, dy, dz, r, eps_inv);
+
+    /*  3rd order multipole term (addition to rank 1)*/
+    l_b->F_100 += m_a->M_200 * D_soft_300(dx, dy, dz, r, eps_inv) +
+                  m_a->M_020 * D_soft_120(dx, dy, dz, r, eps_inv) +
+                  m_a->M_002 * D_soft_102(dx, dy, dz, r, eps_inv);
+    l_b->F_100 += m_a->M_110 * D_soft_210(dx, dy, dz, r, eps_inv) +
+                  m_a->M_101 * D_soft_201(dx, dy, dz, r, eps_inv) +
+                  m_a->M_011 * D_soft_111(dx, dy, dz, r, eps_inv);
+    l_b->F_010 += m_a->M_200 * D_soft_210(dx, dy, dz, r, eps_inv) +
+                  m_a->M_020 * D_soft_030(dx, dy, dz, r, eps_inv) +
+                  m_a->M_002 * D_soft_012(dx, dy, dz, r, eps_inv);
+    l_b->F_010 += m_a->M_110 * D_soft_120(dx, dy, dz, r, eps_inv) +
+                  m_a->M_101 * D_soft_111(dx, dy, dz, r, eps_inv) +
+                  m_a->M_011 * D_soft_021(dx, dy, dz, r, eps_inv);
+    l_b->F_001 += m_a->M_200 * D_soft_201(dx, dy, dz, r, eps_inv) +
+                  m_a->M_020 * D_soft_021(dx, dy, dz, r, eps_inv) +
+                  m_a->M_002 * D_soft_003(dx, dy, dz, r, eps_inv);
+    l_b->F_001 += m_a->M_110 * D_soft_111(dx, dy, dz, r, eps_inv) +
+                  m_a->M_101 * D_soft_102(dx, dy, dz, r, eps_inv) +
+                  m_a->M_011 * D_soft_012(dx, dy, dz, r, eps_inv);
+
+    /*  3rd order multipole term (addition to rank 2)*/
+    l_b->F_200 += m_a->M_100 * D_soft_300(dx, dy, dz, r, eps_inv) +
+                  m_a->M_010 * D_soft_210(dx, dy, dz, r, eps_inv) +
+                  m_a->M_001 * D_soft_201(dx, dy, dz, r, eps_inv);
+    l_b->F_020 += m_a->M_100 * D_soft_120(dx, dy, dz, r, eps_inv) +
+                  m_a->M_010 * D_soft_030(dx, dy, dz, r, eps_inv) +
+                  m_a->M_001 * D_soft_021(dx, dy, dz, r, eps_inv);
+    l_b->F_002 += m_a->M_100 * D_soft_102(dx, dy, dz, r, eps_inv) +
+                  m_a->M_010 * D_soft_012(dx, dy, dz, r, eps_inv) +
+                  m_a->M_001 * D_soft_003(dx, dy, dz, r, eps_inv);
+    l_b->F_110 += m_a->M_100 * D_soft_210(dx, dy, dz, r, eps_inv) +
+                  m_a->M_010 * D_soft_120(dx, dy, dz, r, eps_inv) +
+                  m_a->M_001 * D_soft_111(dx, dy, dz, r, eps_inv);
+    l_b->F_101 += m_a->M_100 * D_soft_201(dx, dy, dz, r, eps_inv) +
+                  m_a->M_010 * D_soft_111(dx, dy, dz, r, eps_inv) +
+                  m_a->M_001 * D_soft_102(dx, dy, dz, r, eps_inv);
+    l_b->F_011 += m_a->M_100 * D_soft_111(dx, dy, dz, r, eps_inv) +
+                  m_a->M_010 * D_soft_021(dx, dy, dz, r, eps_inv) +
+                  m_a->M_001 * D_soft_012(dx, dy, dz, r, eps_inv);
+
+    /*  3rd order multipole term (addition to rank 3)*/
+    l_b->F_300 += m_a->M_000 * D_soft_300(dx, dy, dz, r, eps_inv);
+    l_b->F_030 += m_a->M_000 * D_soft_030(dx, dy, dz, r, eps_inv);
+    l_b->F_003 += m_a->M_000 * D_soft_003(dx, dy, dz, r, eps_inv);
+    l_b->F_210 += m_a->M_000 * D_soft_210(dx, dy, dz, r, eps_inv);
+    l_b->F_201 += m_a->M_000 * D_soft_201(dx, dy, dz, r, eps_inv);
+    l_b->F_120 += m_a->M_000 * D_soft_120(dx, dy, dz, r, eps_inv);
+    l_b->F_021 += m_a->M_000 * D_soft_021(dx, dy, dz, r, eps_inv);
+    l_b->F_102 += m_a->M_000 * D_soft_102(dx, dy, dz, r, eps_inv);
+    l_b->F_012 += m_a->M_000 * D_soft_012(dx, dy, dz, r, eps_inv);
+    l_b->F_111 += m_a->M_000 * D_soft_111(dx, dy, dz, r, eps_inv);
+#endif
   }
 }
 
@@ -2521,20 +2642,29 @@ INLINE static void gravity_L2P(const struct grav_tensor *lb,
  * @param ma The #multipole of the first #cell.
  * @param mb The #multipole of the second #cell.
  * @param theta_crit_inv The inverse of the critical opening angle.
+ * @param rebuild Are we using the current value of CoM or the ones from
+ * the last rebuild ?
  */
 __attribute__((always_inline)) INLINE static int gravity_multipole_accept(
     const struct gravity_tensors *ma, const struct gravity_tensors *mb,
-    double theta_crit_inv) {
+    double theta_crit_inv, int rebuild) {
 
-  const double r_crit_a = ma->r_max * theta_crit_inv;
-  const double r_crit_b = mb->r_max * theta_crit_inv;
+  const double r_crit_a =
+      (rebuild ? ma->r_max_rebuild : ma->r_max) * theta_crit_inv;
+  const double r_crit_b =
+      (rebuild ? mb->r_max_rebuild : mb->r_max) * theta_crit_inv;
 
-  const double dx = ma->CoM[0] - mb->CoM[0];
-  const double dy = ma->CoM[1] - mb->CoM[1];
-  const double dz = ma->CoM[2] - mb->CoM[2];
+  const double dx = rebuild ? ma->CoM_rebuild[0] - mb->CoM_rebuild[0]
+                            : ma->CoM[0] - mb->CoM[0];
+  const double dy = rebuild ? ma->CoM_rebuild[1] - mb->CoM_rebuild[1]
+                            : ma->CoM[1] - mb->CoM[1];
+  const double dz = rebuild ? ma->CoM_rebuild[2] - mb->CoM_rebuild[2]
+                            : ma->CoM[2] - mb->CoM[2];
 
   const double r2 = dx * dx + dy * dy + dz * dz;
 
+  // MATTHIEU: Make this mass-dependent ?
+
   /* Multipole acceptance criterion (Dehnen 2002, eq.10) */
   return (r2 > (r_crit_a + r_crit_b) * (r_crit_a + r_crit_b));
 }
diff --git a/src/partition.c b/src/partition.c
index 49dbf883e0dea3f00c93ca33ec8cb0248bbfbfaa..499efab263a9031b0116f073af8cebd5fef0c2eb 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -524,7 +524,7 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
         t->type != task_type_sub_self && t->type != task_type_sub_self &&
         t->type != task_type_ghost && t->type != task_type_kick1 &&
         t->type != task_type_kick2 && t->type != task_type_timestep &&
-        t->type != task_type_drift && t->type != task_type_init)
+        t->type != task_type_drift)
       continue;
 
     /* Get the task weight. */
diff --git a/src/profiler.c b/src/profiler.c
index ad8338eebfd130d4088f9fd9d4fcc9856c8cc731..62ba881a15a32de63dba3cce95feab79d595aa84 100644
--- a/src/profiler.c
+++ b/src/profiler.c
@@ -31,6 +31,46 @@
 #include "hydro.h"
 #include "version.h"
 
+/* Array to store the list of file names. Order must match profiler_types
+ * enumerator and profiler_func_names. */
+const char *profiler_file_names[profiler_length] = {"enginecollecttimesteps",
+                                                    "enginedrift",
+                                                    "enginerebuild",
+                                                    "schedulerreweight",
+                                                    "schedulerclearwaits",
+                                                    "schedulerrewait",
+                                                    "schedulerenqueue",
+                                                    "engineprintstats",
+                                                    "enginelaunch",
+                                                    "spacerebuild",
+                                                    "enginemaketasks",
+                                                    "enginemarktasks",
+                                                    "spaceregrid",
+                                                    "spacepartssort",
+                                                    "spacesplit",
+                                                    "spacegetcellid",
+                                                    "spacecountparts"};
+
+/* Array to store the list of function names. Order must match profiler_types
+ * enumerator and profiler_file_names. */
+const char *profiler_func_names[profiler_length] = {"engine_collect_timesteps",
+                                                    "engine_drift",
+                                                    "engine_rebuild",
+                                                    "scheduler_reweight",
+                                                    "scheduler_clear_waits",
+                                                    "scheduler_rewait",
+                                                    "scheduler_enqueue",
+                                                    "engine_print_stats",
+                                                    "engine_launch",
+                                                    "space_rebuild",
+                                                    "engine_maketasks",
+                                                    "engine_marktasks",
+                                                    "space_regrid",
+                                                    "space_parts_sort",
+                                                    "space_split",
+                                                    "space_get_cell_id",
+                                                    "space_count_parts"};
+
 /**
  * @brief Resets all timers.
  *
@@ -38,24 +78,8 @@
  * function timers.
  */
 void profiler_reset_timers(struct profiler *profiler) {
-
-  profiler->collect_timesteps_time = 0;
-  profiler->drift_time = 0;
-  profiler->rebuild_time = 0;
-  profiler->reweight_time = 0;
-  profiler->clear_waits_time = 0;
-  profiler->re_wait_time = 0;
-  profiler->enqueue_time = 0;
-  profiler->stats_time = 0;
-  profiler->launch_time = 0;
-  profiler->space_rebuild_time = 0;
-  profiler->engine_maketasks_time = 0;
-  profiler->engine_marktasks_time = 0;
-  profiler->space_regrid_time = 0;
-  profiler->space_parts_sort_time = 0;
-  profiler->space_split_time = 0;
-  profiler->space_parts_get_cell_id_time = 0;
-  profiler->space_count_parts_time = 0;
+  /* Iterate over times array and reset values. */
+  for (int i = 0; i < profiler_length; i++) profiler->times[i] = 0;
 }
 
 /**
@@ -66,8 +90,9 @@ void profiler_reset_timers(struct profiler *profiler) {
  * @param functionName name of function that is being timed.
  * @param file (return) pointer used to open output file.
  */
-void profiler_write_timing_info_header(const struct engine *e, char *fileName,
-                                       char *functionName, FILE **file) {
+void profiler_write_timing_info_header(const struct engine *e,
+                                       const char *fileName,
+                                       const char *functionName, FILE **file) {
 
   /* Create the file name in the format: "fileName_(no. of threads)" */
   char fullFileName[200] = "";
@@ -82,13 +107,14 @@ void profiler_write_timing_info_header(const struct engine *e, char *fileName,
           "Number of threads: %d\n# Number of MPI ranks: %d\n# Hydrodynamic "
           "scheme: %s\n# Hydrodynamic kernel: %s\n# No. of neighbours: %.2f "
           "+/- %.2f\n# Eta: %f\n"
-          "# %6s %14s %14s %10s %10s %16s [%s]\n",
+          "# %6s %14s %14s %10s %10s %10s %16s [%s]\n",
           hostname(), functionName, git_revision(), compiler_name(),
           compiler_version(), e->nr_threads, e->nr_nodes, SPH_IMPLEMENTATION,
           kernel_name, e->hydro_properties->target_neighbours,
           e->hydro_properties->delta_neighbours,
           e->hydro_properties->eta_neighbours, "Step", "Time", "Time-step",
-          "Updates", "g-Updates", "Wall-clock time", clocks_getunit());
+          "Updates", "g-Updates", "s-Updates", "Wall-clock time",
+          clocks_getunit());
 
   fflush(*file);
 }
@@ -103,44 +129,11 @@ void profiler_write_timing_info_header(const struct engine *e, char *fileName,
  */
 void profiler_write_all_timing_info_headers(const struct engine *e,
                                             struct profiler *profiler) {
-
-  profiler_write_timing_info_header(e, "enginecollecttimesteps",
-                                    "engine_collect_timesteps",
-                                    &profiler->file_engine_collect_timesteps);
-  profiler_write_timing_info_header(e, "enginedrift", "engine_drift",
-                                    &profiler->file_engine_drift);
-  profiler_write_timing_info_header(e, "enginerebuild", "engine_rebuild",
-                                    &profiler->file_engine_rebuild);
-  profiler_write_timing_info_header(e, "schedulerreweight",
-                                    "scheduler_reweight",
-                                    &profiler->file_scheduler_reweight);
-  profiler_write_timing_info_header(e, "schedulerclearwaits",
-                                    "scheduler_clear_waits",
-                                    &profiler->file_scheduler_clear_waits);
-  profiler_write_timing_info_header(e, "schedulerrewait", "scheduler_rewait",
-                                    &profiler->file_scheduler_re_wait);
-  profiler_write_timing_info_header(e, "schedulerenqueue", "scheduler_enqueue",
-                                    &profiler->file_scheduler_enqueue);
-  profiler_write_timing_info_header(e, "engineprintstats", "engine_print_stats",
-                                    &profiler->file_engine_stats);
-  profiler_write_timing_info_header(e, "enginelaunch", "engine_launch",
-                                    &profiler->file_engine_launch);
-  profiler_write_timing_info_header(e, "spacerebuild", "space_rebuild",
-                                    &profiler->file_space_rebuild);
-  profiler_write_timing_info_header(e, "enginemaketasks", "engine_maketasks",
-                                    &profiler->file_engine_maketasks);
-  profiler_write_timing_info_header(e, "enginemarktasks", "engine_marktasks",
-                                    &profiler->file_engine_marktasks);
-  profiler_write_timing_info_header(e, "spaceregrid", "space_regrid",
-                                    &profiler->file_space_regrid);
-  profiler_write_timing_info_header(e, "spacepartssort", "space_parts_sort",
-                                    &profiler->file_space_parts_sort);
-  profiler_write_timing_info_header(e, "spacesplit", "space_split",
-                                    &profiler->file_space_split);
-  profiler_write_timing_info_header(e, "spacegetcellid", "space_get_cell_id",
-                                    &profiler->file_space_parts_get_cell_id);
-  profiler_write_timing_info_header(e, "spacecountparts", "space_count_parts",
-                                    &profiler->file_space_count_parts);
+  /* Iterate over files array and write file headers. */
+  for (int i = 0; i < profiler_length; i++) {
+    profiler_write_timing_info_header(
+        e, profiler_file_names[i], profiler_func_names[i], &profiler->files[i]);
+  }
 }
 
 /**
@@ -153,8 +146,9 @@ void profiler_write_all_timing_info_headers(const struct engine *e,
 void profiler_write_timing_info(const struct engine *e, ticks time,
                                 FILE *file) {
 
-  fprintf(file, "  %6d %14e %14e %10zu %10zu %21.3f\n", e->step, e->time,
-          e->timeStep, e->updates, e->g_updates, clocks_from_ticks(time));
+  fprintf(file, "  %6d %14e %14e %10zu %10zu %10zu %21.3f\n", e->step, e->time,
+          e->timeStep, e->updates, e->g_updates, e->s_updates,
+          clocks_from_ticks(time));
   fflush(file);
 }
 
@@ -169,39 +163,10 @@ void profiler_write_timing_info(const struct engine *e, ticks time,
 void profiler_write_all_timing_info(const struct engine *e,
                                     struct profiler *profiler) {
 
-  profiler_write_timing_info(e, profiler->drift_time,
-                             profiler->file_engine_drift);
-  profiler_write_timing_info(e, profiler->rebuild_time,
-                             profiler->file_engine_rebuild);
-  profiler_write_timing_info(e, profiler->reweight_time,
-                             profiler->file_scheduler_reweight);
-  profiler_write_timing_info(e, profiler->clear_waits_time,
-                             profiler->file_scheduler_clear_waits);
-  profiler_write_timing_info(e, profiler->re_wait_time,
-                             profiler->file_scheduler_re_wait);
-  profiler_write_timing_info(e, profiler->enqueue_time,
-                             profiler->file_scheduler_enqueue);
-  profiler_write_timing_info(e, profiler->stats_time,
-                             profiler->file_engine_stats);
-  profiler_write_timing_info(e, profiler->launch_time,
-                             profiler->file_engine_launch);
-  profiler_write_timing_info(e, profiler->space_rebuild_time,
-                             profiler->file_space_rebuild);
-  profiler_write_timing_info(e, profiler->engine_maketasks_time,
-                             profiler->file_engine_maketasks);
-  profiler_write_timing_info(e, profiler->engine_marktasks_time,
-                             profiler->file_engine_marktasks);
-  profiler_write_timing_info(e, profiler->space_regrid_time,
-                             profiler->file_space_regrid);
-  profiler_write_timing_info(e, profiler->space_parts_sort_time,
-                             profiler->file_space_parts_sort);
-  profiler_write_timing_info(e, profiler->space_split_time,
-                             profiler->file_space_split);
-  profiler_write_timing_info(e, profiler->space_parts_get_cell_id_time,
-                             profiler->file_space_parts_get_cell_id);
-  profiler_write_timing_info(e, profiler->space_count_parts_time,
-                             profiler->file_space_count_parts);
-
+  /* Iterate over times array and print timing info to files. */
+  for (int i = 0; i < profiler_length; i++) {
+    profiler_write_timing_info(e, profiler->times[i], profiler->files[i]);
+  }
   /* Reset timers. */
   profiler_reset_timers(profiler);
 }
@@ -215,20 +180,6 @@ void profiler_write_all_timing_info(const struct engine *e,
  */
 void profiler_close_files(struct profiler *profiler) {
 
-  fclose(profiler->file_engine_drift);
-  fclose(profiler->file_engine_rebuild);
-  fclose(profiler->file_scheduler_reweight);
-  fclose(profiler->file_scheduler_clear_waits);
-  fclose(profiler->file_scheduler_re_wait);
-  fclose(profiler->file_scheduler_enqueue);
-  fclose(profiler->file_engine_stats);
-  fclose(profiler->file_engine_launch);
-  fclose(profiler->file_space_rebuild);
-  fclose(profiler->file_engine_maketasks);
-  fclose(profiler->file_engine_marktasks);
-  fclose(profiler->file_space_regrid);
-  fclose(profiler->file_space_parts_sort);
-  fclose(profiler->file_space_split);
-  fclose(profiler->file_space_parts_get_cell_id);
-  fclose(profiler->file_space_count_parts);
+  /* Iterate over files array and close files. */
+  for (int i = 0; i < profiler_length; i++) fclose(profiler->files[i]);
 }
diff --git a/src/profiler.h b/src/profiler.h
index b00bc986ece8b78282b11ce317a6746ecba5a50f..911d4747912d64dd46d2cf59369670ff26a92012 100644
--- a/src/profiler.h
+++ b/src/profiler.h
@@ -25,46 +25,33 @@
 /* Local includes */
 #include "engine.h"
 
+/* Enumerator to be used as an index into the timers and files array. To add an
+ * extra timer extend this list, before the profiler_length value.*/
+enum profiler_types {
+  profiler_engine_collect_timesteps = 0,
+  profiler_engine_drift,
+  profiler_engine_rebuild,
+  profiler_scheduler_reweight,
+  profiler_scheduler_clear_waits,
+  profiler_scheduler_re_wait,
+  profiler_scheduler_enqueue,
+  profiler_engine_stats,
+  profiler_engine_launch,
+  profiler_space_rebuild,
+  profiler_engine_maketasks,
+  profiler_engine_marktasks,
+  profiler_space_regrid,
+  profiler_space_parts_sort,
+  profiler_space_split,
+  profiler_space_parts_get_cell_id,
+  profiler_space_count_parts,
+  profiler_length
+};
+
 /* Profiler that holds file pointers and time taken in functions. */
 struct profiler {
-
-  /* File pointers for timing info. */
-  FILE *file_engine_collect_timesteps;
-  FILE *file_engine_drift;
-  FILE *file_engine_rebuild;
-  FILE *file_scheduler_reweight;
-  FILE *file_scheduler_clear_waits;
-  FILE *file_scheduler_re_wait;
-  FILE *file_scheduler_enqueue;
-  FILE *file_engine_stats;
-  FILE *file_engine_launch;
-  FILE *file_space_rebuild;
-  FILE *file_engine_maketasks;
-  FILE *file_engine_marktasks;
-  FILE *file_space_regrid;
-  FILE *file_space_parts_sort;
-  FILE *file_space_split;
-  FILE *file_space_parts_get_cell_id;
-  FILE *file_space_count_parts;
-
-  /* Time taken in functions. */
-  ticks collect_timesteps_time;
-  ticks drift_time;
-  ticks rebuild_time;
-  ticks reweight_time;
-  ticks clear_waits_time;
-  ticks re_wait_time;
-  ticks enqueue_time;
-  ticks stats_time;
-  ticks launch_time;
-  ticks space_rebuild_time;
-  ticks engine_maketasks_time;
-  ticks engine_marktasks_time;
-  ticks space_regrid_time;
-  ticks space_parts_sort_time;
-  ticks space_split_time;
-  ticks space_parts_get_cell_id_time;
-  ticks space_count_parts_time;
+  FILE *files[profiler_length];
+  ticks times[profiler_length];
 };
 
 /* Function prototypes. */
diff --git a/src/runner.c b/src/runner.c
index 99b399b044f5afdc1729e58ca9d36c7d016ccd70..b4dbe7b377255b27520b611e0ca9e24289b38ba7 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -289,6 +289,26 @@ void runner_do_sort_ascending(struct entry *sort, int N) {
   }
 }
 
+/**
+ * @brief Recursively checks that the flags are consistent in a cell hierarchy.
+ *
+ * Debugging function.
+ *
+ * @param c The #cell to check.
+ * @param flags The sorting flags to check.
+ */
+void runner_check_sorts(struct cell *c, int flags) {
+
+#ifdef SWIFT_DEBUG_CHECKS
+  if (flags & ~c->sorted) error("Inconsistent sort flags (downward)!");
+  if (c->split)
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) runner_check_sorts(c->progeny[k], c->sorted);
+#else
+  error("Calling debugging code without debugging flag activated.");
+#endif
+}
+
 /**
  * @brief Sort the particles in the given cell along all cardinal directions.
  *
@@ -303,16 +323,37 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
   struct entry *finger;
   struct entry *fingers[8];
   struct part *parts = c->parts;
+  struct xpart *xparts = c->xparts;
   struct entry *sort;
-  int j, k, count = c->count;
-  int i, ind, off[8], inds[8], temp_i, missing;
+  const int count = c->count;
   float buff[8];
-  double px[3];
 
-  TIMER_TIC
+  TIMER_TIC;
+
+  /* Check that the particles have been moved to the current time */
+  if (!cell_is_drifted(c, r->e)) error("Sorting un-drifted cell");
 
-  /* Clean-up the flags, i.e. filter out what's already been sorted. */
-  flags &= ~c->sorted;
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Make sure the sort flags are consistent (downward). */
+  runner_check_sorts(c, c->sorted);
+
+  /* Make sure the sort flags are consistent (upard). */
+  for (struct cell *finger = c->parent; finger != NULL;
+       finger = finger->parent) {
+    if (finger->sorted & ~c->sorted) error("Inconsistent sort flags (upward).");
+  }
+#endif
+
+  /* Clean-up the flags, i.e. filter out what's already been sorted, but
+     only if the sorts are recent. */
+  if (c->ti_sort == r->e->ti_current) {
+    /* Ignore dimensions that have been sorted in this timestep. */
+    // flags &= ~c->sorted;
+  } else {
+    /* Clean old (stale) sorts. */
+    flags |= c->sorted;
+    c->sorted = 0;
+  }
   if (flags == 0) return;
 
   /* start by allocating the entry arrays. */
@@ -329,27 +370,35 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
   if (c->split) {
 
     /* Fill in the gaps within the progeny. */
-    for (k = 0; k < 8; k++) {
-      if (c->progeny[k] == NULL) continue;
-      missing = flags & ~c->progeny[k]->sorted;
-      if (missing) runner_do_sort(r, c->progeny[k], missing, 0);
+    float dx_max_sort = 0.0f;
+    for (int k = 0; k < 8; k++) {
+      if (c->progeny[k] != NULL) {
+        if (flags & ~c->progeny[k]->sorted ||
+            c->progeny[k]->dx_max_sort > c->dmin * space_maxreldx)
+          runner_do_sort(r, c->progeny[k], flags, 0);
+        dx_max_sort = max(dx_max_sort, c->progeny[k]->dx_max_sort);
+      }
     }
+    c->dx_max_sort = dx_max_sort;
 
     /* Loop over the 13 different sort arrays. */
-    for (j = 0; j < 13; j++) {
+    for (int j = 0; j < 13; j++) {
 
       /* Has this sort array been flagged? */
       if (!(flags & (1 << j))) continue;
 
       /* Init the particle index offsets. */
-      for (off[0] = 0, k = 1; k < 8; k++)
+      int off[8];
+      off[0] = 0;
+      for (int k = 1; k < 8; k++)
         if (c->progeny[k - 1] != NULL)
           off[k] = off[k - 1] + c->progeny[k - 1]->count;
         else
           off[k] = off[k - 1];
 
       /* Init the entries and indices. */
-      for (k = 0; k < 8; k++) {
+      int inds[8];
+      for (int k = 0; k < 8; k++) {
         inds[k] = k;
         if (c->progeny[k] != NULL && c->progeny[k]->count > 0) {
           fingers[k] = &c->progeny[k]->sort[j * (c->progeny[k]->count + 1)];
@@ -360,17 +409,17 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
       }
 
       /* Sort the buffer. */
-      for (i = 0; i < 7; i++)
-        for (k = i + 1; k < 8; k++)
+      for (int i = 0; i < 7; i++)
+        for (int k = i + 1; k < 8; k++)
           if (buff[inds[k]] < buff[inds[i]]) {
-            temp_i = inds[i];
+            int temp_i = inds[i];
             inds[i] = inds[k];
             inds[k] = temp_i;
           }
 
       /* For each entry in the new sort list. */
       finger = &sort[j * (count + 1)];
-      for (ind = 0; ind < count; ind++) {
+      for (int ind = 0; ind < count; ind++) {
 
         /* Copy the minimum into the new sort array. */
         finger[ind].d = buff[inds[0]];
@@ -381,8 +430,8 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
         buff[inds[0]] = fingers[inds[0]]->d;
 
         /* Find the smallest entry. */
-        for (k = 1; k < 8 && buff[inds[k]] < buff[inds[k - 1]]; k++) {
-          temp_i = inds[k - 1];
+        for (int k = 1; k < 8 && buff[inds[k]] < buff[inds[k - 1]]; k++) {
+          int temp_i = inds[k - 1];
           inds[k - 1] = inds[k];
           inds[k] = temp_i;
         }
@@ -403,12 +452,19 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
   /* Otherwise, just sort. */
   else {
 
+    /* Reset the sort distance if we are in a local cell */
+    if (xparts != NULL) {
+      for (int k = 0; k < count; k++) {
+        xparts[k].x_diff_sort[0] = 0.0f;
+        xparts[k].x_diff_sort[1] = 0.0f;
+        xparts[k].x_diff_sort[2] = 0.0f;
+      }
+    }
+
     /* Fill the sort array. */
-    for (k = 0; k < count; k++) {
-      px[0] = parts[k].x[0];
-      px[1] = parts[k].x[1];
-      px[2] = parts[k].x[2];
-      for (j = 0; j < 13; j++)
+    for (int k = 0; k < count; k++) {
+      const double px[3] = {parts[k].x[0], parts[k].x[1], parts[k].x[2]};
+      for (int j = 0; j < 13; j++)
         if (flags & (1 << j)) {
           sort[j * (count + 1) + k].i = k;
           sort[j * (count + 1) + k].d = px[0] * runner_shift[j][0] +
@@ -418,26 +474,47 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
     }
 
     /* Add the sentinel and sort. */
-    for (j = 0; j < 13; j++)
+    for (int j = 0; j < 13; j++)
       if (flags & (1 << j)) {
         sort[j * (count + 1) + count].d = FLT_MAX;
         sort[j * (count + 1) + count].i = 0;
         runner_do_sort_ascending(&sort[j * (count + 1)], count);
         c->sorted |= (1 << j);
       }
+
+    /* Finally, clear the dx_max_sort field of this cell. */
+    c->dx_max_sort = 0.f;
+
+    /* If this was not just an update, invalidate the sorts above this one. */
+    if (c->ti_sort < r->e->ti_current)
+      for (struct cell *finger = c->parent; finger != NULL;
+           finger = finger->parent)
+        finger->sorted = 0;
   }
 
+  /* Update the sort timer. */
+  c->ti_sort = r->e->ti_current;
+
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify the sorting. */
-  for (j = 0; j < 13; j++) {
+  for (int j = 0; j < 13; j++) {
     if (!(flags & (1 << j))) continue;
     finger = &sort[j * (count + 1)];
-    for (k = 1; k < count; k++) {
+    for (int k = 1; k < count; k++) {
       if (finger[k].d < finger[k - 1].d)
         error("Sorting failed, ascending array.");
       if (finger[k].i >= count) error("Sorting failed, indices borked.");
     }
   }
+
+  /* Make sure the sort flags are consistent (downward). */
+  runner_check_sorts(c, flags);
+
+  /* Make sure the sort flags are consistent (upward). */
+  for (struct cell *finger = c->parent; finger != NULL;
+       finger = finger->parent) {
+    if (finger->sorted & ~c->sorted) error("Inconsistent sort flags.");
+  }
 #endif
 
   if (clock) TIMER_TOC(timer_dosort);
@@ -480,63 +557,6 @@ void runner_do_init_grav(struct runner *r, struct cell *c, int timer) {
   if (timer) TIMER_TOC(timer_init_grav);
 }
 
-/**
- * @brief Initialize the particles before the density calculation.
- *
- * @param r The runner thread.
- * @param c The cell.
- * @param timer 1 if the time is to be recorded.
- */
-void runner_do_init(struct runner *r, struct cell *c, int timer) {
-
-  struct part *restrict parts = c->parts;
-  struct gpart *restrict gparts = c->gparts;
-  const int count = c->count;
-  const int gcount = c->gcount;
-  const struct engine *e = r->e;
-  const struct space *s = e->s;
-
-  TIMER_TIC;
-
-  /* Anything to do here? */
-  if (!cell_is_active(c, e)) return;
-
-  /* Recurse? */
-  if (c->split) {
-    for (int k = 0; k < 8; k++)
-      if (c->progeny[k] != NULL) runner_do_init(r, c->progeny[k], 0);
-  } else {
-
-    /* Loop over the parts in this cell. */
-    for (int i = 0; i < count; i++) {
-
-      /* Get a direct pointer on the part. */
-      struct part *restrict p = &parts[i];
-
-      if (part_is_active(p, e)) {
-
-        /* Get ready for a density calculation */
-        hydro_init_part(p, &s->hs);
-      }
-    }
-
-    /* Loop over the gparts in this cell. */
-    for (int i = 0; i < gcount; i++) {
-
-      /* Get a direct pointer on the part. */
-      struct gpart *restrict gp = &gparts[i];
-
-      if (gpart_is_active(gp, e)) {
-
-        /* Get ready for a density calculation */
-        gravity_init_gpart(gp);
-      }
-    }
-  }
-
-  if (timer) TIMER_TOC(timer_init);
-}
-
 /**
  * @brief Intermediate task after the gradient loop that does final operations
  * on the gradient quantities and optionally slope limits the gradients
@@ -757,9 +777,7 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) {
 
 #ifdef SWIFT_DEBUG_CHECKS
     if (count) {
-      message("Smoothing length failed to converge on %i particles.", count);
-
-      error("Aborting....");
+      error("Smoothing length failed to converge on %i particles.", count);
     }
 #else
     if (count)
@@ -784,11 +802,8 @@ static void runner_do_unskip(struct cell *c, struct engine *e) {
   /* Ignore empty cells. */
   if (c->count == 0 && c->gcount == 0) return;
 
-  /* Unskip any active tasks. */
-  if (cell_is_active(c, e)) {
-    const int forcerebuild = cell_unskip_tasks(c, &e->sched);
-    if (forcerebuild) atomic_inc(&e->forcerebuild);
-  }
+  /* Skip inactive cells. */
+  if (!cell_is_active(c, e)) return;
 
   /* Recurse */
   if (c->split) {
@@ -799,6 +814,10 @@ static void runner_do_unskip(struct cell *c, struct engine *e) {
       }
     }
   }
+
+  /* Unskip any active tasks. */
+  const int forcerebuild = cell_unskip_tasks(c, &e->sched);
+  if (forcerebuild) atomic_inc(&e->forcerebuild);
 }
 
 /**
@@ -1116,6 +1135,14 @@ void runner_do_timestep(struct runner *r, struct cell *c, int timer) {
 
   TIMER_TIC;
 
+  /* Anything to do here? */
+  if (!cell_is_active(c, e)) {
+    c->updated = 0;
+    c->g_updated = 0;
+    c->s_updated = 0;
+    return;
+  }
+
   int updated = 0, g_updated = 0, s_updated = 0;
   integertime_t ti_end_min = max_nr_timesteps, ti_end_max = 0, ti_beg_max = 0;
 
@@ -1356,9 +1383,8 @@ void runner_do_end_force(struct runner *r, struct cell *c, int timer) {
 
       if (part_is_active(p, e)) {
 
-        /* First, finish the force loop */
+        /* Finish the force loop */
         hydro_end_force(p);
-        if (p->gpart != NULL) gravity_end_force(p->gpart, const_G);
       }
     }
 
@@ -1368,25 +1394,44 @@ void runner_do_end_force(struct runner *r, struct cell *c, int timer) {
       /* Get a handle on the gpart. */
       struct gpart *restrict gp = &gparts[k];
 
-      if (gp->type == swift_type_dark_matter) {
+      if (gpart_is_active(gp, e)) {
 
-        if (gpart_is_active(gp, e)) gravity_end_force(gp, const_G);
-      }
+        /* Finish the force calculation */
+        gravity_end_force(gp, const_G);
+
+#ifdef SWIFT_NO_GRAVITY_BELOW_ID
+        /* Cancel gravity forces of these particles */
+        if ((gp->type == swift_type_dark_matter &&
+             gp->id_or_neg_offset < SWIFT_NO_GRAVITY_BELOW_ID) ||
+            (gp->type == swift_type_gas &&
+             parts[-gp->id_or_neg_offset].id < SWIFT_NO_GRAVITY_BELOW_ID) ||
+            (gp->type == swift_type_star &&
+             sparts[-gp->id_or_neg_offset].id < SWIFT_NO_GRAVITY_BELOW_ID)) {
+
+          /* Don't move ! */
+          gp->a_grav[0] = 0.f;
+          gp->a_grav[1] = 0.f;
+          gp->a_grav[2] = 0.f;
+        }
+#endif
 
 #ifdef SWIFT_DEBUG_CHECKS
-      if (e->policy & engine_policy_self_gravity && gpart_is_active(gp, e)) {
+        if (e->policy & engine_policy_self_gravity) {
 
-        /* Check that this gpart has interacted with all the other particles
-         * (via direct or multipoles) in the box */
-        gp->num_interacted++;
-        if (gp->num_interacted != (long long)e->s->nr_gparts)
-          error(
-              "g-particle (id=%lld, type=%d) did not interact gravitationally "
-              "with all other gparts gp->num_interacted=%lld, total_gparts=%zd",
-              gp->id_or_neg_offset, gp->type, gp->num_interacted,
-              e->s->nr_gparts);
-      }
+          /* Check that this gpart has interacted with all the other
+           * particles (via direct or multipoles) in the box */
+          gp->num_interacted++;
+          if (gp->num_interacted != (long long)e->s->nr_gparts)
+            error(
+                "g-particle (id=%lld, type=%d) did not interact "
+                "gravitationally "
+                "with all other gparts gp->num_interacted=%lld, "
+                "total_gparts=%zd",
+                gp->id_or_neg_offset, gp->type, gp->num_interacted,
+                e->s->nr_gparts);
+        }
 #endif
+      }
     }
 
     /* Loop over the star particles in this cell. */
@@ -1396,9 +1441,8 @@ void runner_do_end_force(struct runner *r, struct cell *c, int timer) {
       struct spart *restrict sp = &sparts[k];
       if (spart_is_active(sp, e)) {
 
-        /* First, finish the force loop */
+        /* Finish the force loop */
         star_end_force(sp);
-        gravity_end_force(sp->gpart, const_G);
       }
     }
   }
@@ -1411,9 +1455,11 @@ void runner_do_end_force(struct runner *r, struct cell *c, int timer) {
  *
  * @param r The runner thread.
  * @param c The cell.
+ * @param clear_sorts Should we clear the sort flag and hence trigger a sort ?
  * @param timer Are we timing this ?
  */
-void runner_do_recv_part(struct runner *r, struct cell *c, int timer) {
+void runner_do_recv_part(struct runner *r, struct cell *c, int clear_sorts,
+                         int timer) {
 
 #ifdef WITH_MPI
 
@@ -1429,6 +1475,9 @@ void runner_do_recv_part(struct runner *r, struct cell *c, int timer) {
   timebin_t time_bin_max = 0;
   float h_max = 0.f;
 
+  /* Clear this cell's sorted mask. */
+  if (clear_sorts) c->sorted = 0;
+
   /* If this cell is a leaf, collect the particle data. */
   if (!c->split) {
 
@@ -1454,7 +1503,7 @@ void runner_do_recv_part(struct runner *r, struct cell *c, int timer) {
   else {
     for (int k = 0; k < 8; k++) {
       if (c->progeny[k] != NULL) {
-        runner_do_recv_part(r, c->progeny[k], 0);
+        runner_do_recv_part(r, c->progeny[k], clear_sorts, 0);
         ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
         ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
         h_max = max(h_max, c->progeny[k]->h_max);
@@ -1686,7 +1735,7 @@ void *runner_main(void *data) {
 
         if (!cell_is_active(ci, e) && t->type != task_type_sort &&
             t->type != task_type_send && t->type != task_type_recv &&
-            t->type != task_type_kick1)
+            t->type != task_type_kick1 && t->type != task_type_drift)
           error(
               "Task (type='%s/%s') should have been skipped ti_current=%lld "
               "c->ti_end_min=%lld",
@@ -1806,9 +1855,6 @@ void *runner_main(void *data) {
         case task_type_init_grav:
           runner_do_init_grav(r, ci, 1);
           break;
-        case task_type_init:
-          runner_do_init(r, ci, 1);
-          break;
         case task_type_ghost:
           runner_do_ghost(r, ci, 1);
           break;
@@ -1841,9 +1887,10 @@ void *runner_main(void *data) {
           if (t->subtype == task_subtype_tend) {
             cell_unpack_ti_ends(ci, t->buff);
             free(t->buff);
-          } else if (t->subtype == task_subtype_xv ||
-                     t->subtype == task_subtype_rho) {
-            runner_do_recv_part(r, ci, 1);
+          } else if (t->subtype == task_subtype_xv) {
+            runner_do_recv_part(r, ci, 1, 1);
+          } else if (t->subtype == task_subtype_rho) {
+            runner_do_recv_part(r, ci, 1, 1);
           } else if (t->subtype == task_subtype_gpart) {
             runner_do_recv_gpart(r, ci, 1);
           } else if (t->subtype == task_subtype_spart) {
diff --git a/src/runner.h b/src/runner.h
index 49f7cd88a0b345cac1b29b7fd38cf59b21268012..5cb08cc14222c074b229c9bdf96ac47a072cd34e 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -49,11 +49,13 @@ struct runner {
   /*! The engine owing this runner. */
   struct engine *e;
 
+#ifdef WITH_VECTORIZATION
   /*! The particle cache of cell ci. */
   struct cache ci_cache;
 
   /*! The particle cache of cell cj. */
   struct cache cj_cache;
+#endif
 };
 
 /* Function prototypes. */
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 37147d32a901ed97ede1825ee232c41aa4f41fc6..839e492956140bd2b633d1e7c99c2b2a43f89629 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -35,12 +35,6 @@
 #define _DOPAIR2(f) PASTE(runner_dopair2, f)
 #define DOPAIR2 _DOPAIR2(FUNCTION)
 
-#define _DOPAIR1_NOSORT(f) PASTE(runner_dopair1_nosort, f)
-#define DOPAIR1_NOSORT _DOPAIR1_NOSORT(FUNCTION)
-
-#define _DOPAIR2_NOSORT(f) PASTE(runner_dopair2_nosort, f)
-#define DOPAIR2_NOSORT _DOPAIR2_NOSORT(FUNCTION)
-
 #define _DOPAIR_SUBSET(f) PASTE(runner_dopair_subset, f)
 #define DOPAIR_SUBSET _DOPAIR_SUBSET(FUNCTION)
 
@@ -50,11 +44,14 @@
 #define _DOPAIR_SUBSET_NAIVE(f) PASTE(runner_dopair_subset_naive, f)
 #define DOPAIR_SUBSET_NAIVE _DOPAIR_SUBSET_NAIVE(FUNCTION)
 
-#define _DOPAIR_NAIVE(f) PASTE(runner_dopair_naive, f)
-#define DOPAIR_NAIVE _DOPAIR_NAIVE(FUNCTION)
+#define _DOPAIR1_NAIVE(f) PASTE(runner_dopair1_naive, f)
+#define DOPAIR1_NAIVE _DOPAIR1_NAIVE(FUNCTION)
 
-#define _DOSELF_NAIVE(f) PASTE(runner_doself_naive, f)
-#define DOSELF_NAIVE _DOSELF_NAIVE(FUNCTION)
+#define _DOPAIR2_NAIVE(f) PASTE(runner_dopair2_naive, f)
+#define DOPAIR2_NAIVE _DOPAIR2_NAIVE(FUNCTION)
+
+#define _DOSELF2_NAIVE(f) PASTE(runner_doself2_naive, f)
+#define DOSELF2_NAIVE _DOSELF2_NAIVE(FUNCTION)
 
 #define _DOSELF1(f) PASTE(runner_doself1, f)
 #define DOSELF1 _DOSELF1(FUNCTION)
@@ -110,21 +107,164 @@
 #define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset, f)
 #define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION)
 
-#include "runner_doiact_nosort.h"
+/**
+ * @brief Compute the interactions between a cell pair (non-symmetric case).
+ *
+ * Inefficient version using a brute-force algorithm.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+void DOPAIR1_NAIVE(struct runner *r, struct cell *restrict ci,
+                   struct cell *restrict cj) {
+
+  const struct engine *e = r->e;
+
+#ifndef SWIFT_DEBUG_CHECKS
+  error("Don't use in actual runs ! Slow code !");
+#endif
+
+#ifdef WITH_VECTORIZATION
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+  TIMER_TIC;
+
+  /* Anything to do here? */
+  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
+
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+
+  /* Get the relative distance between the pairs, wrapping. */
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
+    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
+      shift[k] = e->s->dim[k];
+    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
+      shift[k] = -e->s->dim[k];
+  }
+
+  /* Loop over the parts in ci. */
+  for (int pid = 0; pid < count_i; pid++) {
+
+    /* Get a hold of the ith part in ci. */
+    struct part *restrict pi = &parts_i[pid];
+    const float hi = pi->h;
+
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hig2 = hi * hi * kernel_gamma2;
+
+    /* Loop over the parts in cj. */
+    for (int pjd = 0; pjd < count_j; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      struct part *restrict pj = &parts_j[pjd];
+
+      /* Compute the pairwise distance. */
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 < hig2) {
+
+#ifndef WITH_VECTORIZATION
+
+        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = dx[0];
+        dxq[3 * icount + 1] = dx[1];
+        dxq[3 * icount + 2] = dx[2];
+        hiq[icount] = hi;
+        hjq[icount] = pj->h;
+        piq[icount] = pi;
+        pjq[icount] = pj;
+        icount += 1;
+
+        /* Flush? */
+        if (icount == VEC_SIZE) {
+          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
+
+#endif
+      }
+      if (r2 < pj->h * pj->h * kernel_gamma2) {
+
+#ifndef WITH_VECTORIZATION
+
+        for (int k = 0; k < 3; k++) dx[k] = -dx[k];
+        IACT_NONSYM(r2, dx, pj->h, hi, pj, pi);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = -dx[0];
+        dxq[3 * icount + 1] = -dx[1];
+        dxq[3 * icount + 2] = -dx[2];
+        hiq[icount] = pj->h;
+        hjq[icount] = hi;
+        piq[icount] = pj;
+        pjq[icount] = pi;
+        icount += 1;
+
+        /* Flush? */
+        if (icount == VEC_SIZE) {
+          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
+
+#endif
+      }
+
+    } /* loop over the parts in cj. */
+
+  } /* loop over the parts in ci. */
+
+#ifdef WITH_VECTORIZATION
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (int k = 0; k < icount; k++)
+      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+#endif
+
+  TIMER_TOC(TIMER_DOPAIR);
+}
 
 /**
- * @brief Compute the interactions between a cell pair.
+ * @brief Compute the interactions between a cell pair (symmetric case).
+ *
+ * Inefficient version using a brute-force algorithm.
  *
  * @param r The #runner.
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
-void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
-                  struct cell *restrict cj) {
+void DOPAIR2_NAIVE(struct runner *r, struct cell *restrict ci,
+                   struct cell *restrict cj) {
 
   const struct engine *e = r->e;
 
+#ifndef SWIFT_DEBUG_CHECKS
   error("Don't use in actual runs ! Slow code !");
+#endif
 
 #ifdef WITH_OLD_VECTORIZATION
   int icount = 0;
@@ -221,11 +361,21 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
   TIMER_TOC(TIMER_DOPAIR);
 }
 
-void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
+/**
+ * @brief Compute the interactions within a cell (symmetric case).
+ *
+ * Inefficient version using a brute-force algorithm.
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+void DOSELF2_NAIVE(struct runner *r, struct cell *restrict c) {
 
   const struct engine *e = r->e;
 
+#ifndef SWIFT_DEBUG_CHECKS
   error("Don't use in actual runs ! Slow code !");
+#endif
 
 #ifdef WITH_OLD_VECTORIZATION
   int icount = 0;
@@ -314,6 +464,8 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
  * @brief Compute the interactions between a cell pair, but only for the
  *      given indices in ci.
  *
+ * Version using a brute-force algorithm.
+ *
  * @param r The #runner.
  * @param ci The first #cell.
  * @param parts_i The #part to interact with @c cj.
@@ -325,9 +477,7 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
                          struct part *restrict parts_i, int *restrict ind,
                          int count, struct cell *restrict cj) {
 
-  struct engine *e = r->e;
-
-  error("Don't use in actual runs ! Slow code !");
+  const struct engine *e = r->e;
 
 #ifdef WITH_OLD_VECTORIZATION
   int icount = 0;
@@ -362,6 +512,12 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
     const float hi = pi->h;
     const float hig2 = hi * hi * kernel_gamma2;
 
+#ifdef SWIFT_DEBUG_CHECKS
+    if (!part_is_active(pi, e))
+      error("Trying to correct smoothing length of inactive particle !");
+
+#endif
+
     /* Loop over the parts in cj. */
     for (int pjd = 0; pjd < count_j; pjd++) {
 
@@ -376,6 +532,14 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
         r2 += dx[k] * dx[k];
       }
 
+#ifdef SWIFT_DEBUG_CHECKS
+      /* Check that particles have been drifted to the current time */
+      if (pi->ti_drift != e->ti_current)
+        error("Particle pi not drifted to current time");
+      if (pj->ti_drift != e->ti_current)
+        error("Particle pj not drifted to current time");
+#endif
+
       /* Hit or miss? */
       if (r2 < hig2) {
 
@@ -416,7 +580,7 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
       IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
-  TIMER_TOC(timer_dopair_subset);
+  TIMER_TOC(timer_dopair_subset_naive);
 }
 
 /**
@@ -436,13 +600,6 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
 
   struct engine *e = r->e;
 
-#ifdef WITH_MPI
-  if (ci->nodeID != cj->nodeID) {
-    DOPAIR_SUBSET_NOSORT(r, ci, parts_i, ind, count, cj);
-    return;
-  }
-#endif
-
 #ifdef WITH_OLD_VECTORIZATION
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -478,11 +635,15 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   sid = sortlistID[sid];
 
   /* Have the cells been sorted? */
-  if (!(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells.");
+  if (!(cj->sorted & (1 << sid)) ||
+      cj->dx_max_sort > space_maxreldx * cj->dmin) {
+    DOPAIR_SUBSET_NAIVE(r, ci, parts_i, ind, count, cj);
+    return;
+  }
 
   /* Pick-out the sorted lists. */
   const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
-  const float dxj = cj->dx_max;
+  const float dxj = cj->dx_max_sort;
 
   /* Parts are on the left? */
   if (!flipped) {
@@ -714,7 +875,7 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
       IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
-  TIMER_TOC(timer_dopair_subset);
+  TIMER_TOC(timer_doself_subset);
 }
 
 /**
@@ -728,13 +889,6 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
 
   const struct engine *restrict e = r->e;
 
-#ifdef WITH_MPI
-  if (ci->nodeID != cj->nodeID) {
-    DOPAIR1_NOSORT(r, ci, cj);
-    return;
-  }
-#endif
-
 #ifdef WITH_OLD_VECTORIZATION
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -749,16 +903,18 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
   /* Anything to do here? */
   if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
 
-  if (!cell_is_drifted(ci, e)) cell_drift_particles(ci, e);
-  if (!cell_is_drifted(cj, e)) cell_drift_particles(cj, e);
+  if (!cell_is_drifted(ci, e) || !cell_is_drifted(cj, e))
+    error("Interacting undrifted cells.");
 
   /* Get the sort ID. */
   double shift[3] = {0.0, 0.0, 0.0};
   const int sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
-    error("Trying to interact unsorted cells.");
+  if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin)
+    runner_do_sort(r, ci, (1 << sid), 1);
+  if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin)
+    runner_do_sort(r, cj, (1 << sid), 1);
 
   /* Get the cutoff shift. */
   double rshift = 0.0;
@@ -768,6 +924,29 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
   const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
   const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
 
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the dx_max_sort values in the cell are indeed an upper
+     bound on particle movement. */
+  for (int pid = 0; pid < ci->count; pid++) {
+    const struct part *p = &ci->parts[sort_i[pid].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
+        1.0e-6 * max(fabsf(d), ci->dx_max_sort))
+      error("particle shift diff exceeds dx_max_sort.");
+  }
+  for (int pjd = 0; pjd < cj->count; pjd++) {
+    const struct part *p = &cj->parts[sort_j[pjd].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
+        1.0e-6 * max(fabsf(d), cj->dx_max_sort))
+      error("particle shift diff exceeds dx_max_sort.");
+  }
+#endif /* SWIFT_DEBUG_CHECKS */
+
   /* Get some other useful values. */
   const double hi_max = ci->h_max * kernel_gamma - rshift;
   const double hj_max = cj->h_max * kernel_gamma;
@@ -777,7 +956,7 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
   struct part *restrict parts_j = cj->parts;
   const double di_max = sort_i[count_i - 1].d - rshift;
   const double dj_min = sort_j[0].d;
-  const float dx_max = (ci->dx_max + cj->dx_max);
+  const float dx_max = (ci->dx_max_sort + cj->dx_max_sort);
 
   if (cell_is_active(ci, e)) {
 
@@ -948,13 +1127,6 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 
   struct engine *restrict e = r->e;
 
-#ifdef WITH_MPI
-  if (ci->nodeID != cj->nodeID) {
-    DOPAIR2_NOSORT(r, ci, cj);
-    return;
-  }
-#endif
-
 #ifdef WITH_OLD_VECTORIZATION
   int icount1 = 0;
   float r2q1[VEC_SIZE] __attribute__((aligned(16)));
@@ -975,16 +1147,18 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   /* Anything to do here? */
   if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
 
-  if (!cell_is_drifted(ci, e)) error("Cell ci not drifted");
-  if (!cell_is_drifted(cj, e)) error("Cell cj not drifted");
+  if (!cell_is_drifted(ci, e) || !cell_is_drifted(cj, e))
+    error("Interacting undrifted cells.");
 
   /* Get the shift ID. */
   double shift[3] = {0.0, 0.0, 0.0};
   const int sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
-    error("Trying to interact unsorted cells.");
+  if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin)
+    runner_do_sort(r, ci, (1 << sid), 1);
+  if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin)
+    runner_do_sort(r, cj, (1 << sid), 1);
 
   /* Get the cutoff shift. */
   double rshift = 0.0;
@@ -994,6 +1168,29 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
   struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
 
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the dx_max_sort values in the cell are indeed an upper
+     bound on particle movement. */
+  for (int pid = 0; pid < ci->count; pid++) {
+    const struct part *p = &ci->parts[sort_i[pid].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
+        1.0e-6 * max(fabsf(d), ci->dx_max_sort))
+      error("particle shift diff exceeds dx_max_sort.");
+  }
+  for (int pjd = 0; pjd < cj->count; pjd++) {
+    const struct part *p = &cj->parts[sort_j[pjd].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
+        1.0e-6 * max(fabsf(d), cj->dx_max_sort))
+      error("particle shift diff exceeds dx_max_sort.");
+  }
+#endif /* SWIFT_DEBUG_CHECKS */
+
   /* Get some other useful values. */
   const double hi_max = ci->h_max * kernel_gamma - rshift;
   const double hj_max = cj->h_max * kernel_gamma;
@@ -1003,7 +1200,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   struct part *restrict parts_j = cj->parts;
   const double di_max = sort_i[count_i - 1].d - rshift;
   const double dj_min = sort_j[0].d;
-  const double dx_max = (ci->dx_max + cj->dx_max);
+  const double dx_max = (ci->dx_max_sort + cj->dx_max_sort);
 
   /* Collect the number of parts left and right below dt. */
   int countdt_i = 0, countdt_j = 0;
@@ -1400,7 +1597,7 @@ void DOSELF1(struct runner *r, struct cell *restrict c) {
 
   if (!cell_is_active(c, e)) return;
 
-  if (!cell_is_drifted(c, e)) cell_drift_particles(c, e);
+  if (!cell_is_drifted(c, e)) error("Interacting undrifted cell.");
 
   struct part *restrict parts = c->parts;
   const int count = c->count;
@@ -1876,7 +2073,8 @@ void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
 
   /* Recurse? */
   if (ci->split && cj->split &&
-      max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+      max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max_sort +
+              cj->dx_max_sort <
           h / 2) {
 
     /* Different types of flags. */
@@ -2080,9 +2278,17 @@ void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
   /* Otherwise, compute the pair directly. */
   else if (cell_is_active(ci, e) || cell_is_active(cj, e)) {
 
+    /* Make sure both cells are drifted to the current timestep. */
+    if (!cell_is_drifted(ci, e)) cell_drift_particles(ci, e);
+    if (!cell_is_drifted(cj, e)) cell_drift_particles(cj, e);
+
     /* Do any of the cells need to be sorted first? */
-    if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
-    if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
+    if (!(ci->sorted & (1 << sid)) ||
+        ci->dx_max_sort > ci->dmin * space_maxreldx)
+      runner_do_sort(r, ci, (1 << sid), 1);
+    if (!(cj->sorted & (1 << sid)) ||
+        cj->dx_max_sort > cj->dmin * space_maxreldx)
+      runner_do_sort(r, cj, (1 << sid), 1);
 
 /* Compute the interactions. */
 #if (DOPAIR1 == runner_dopair1_density) && defined(WITH_VECTORIZATION) && \
@@ -2110,10 +2316,6 @@ void DOSUB_SELF1(struct runner *r, struct cell *ci, int gettimer) {
   /* Should we even bother? */
   if (ci->count == 0 || !cell_is_active(ci, r->e)) return;
 
-#ifdef SWIFT_DEBUG_CHECKS
-  cell_is_drifted(ci, r->e);
-#endif
-
   /* Recurse? */
   if (ci->split) {
 
@@ -2129,6 +2331,10 @@ void DOSUB_SELF1(struct runner *r, struct cell *ci, int gettimer) {
 
   /* Otherwise, compute self-interaction. */
   else {
+
+    /* Drift the cell to the current timestep if needed. */
+    if (!cell_is_drifted(ci, r->e)) cell_drift_particles(ci, r->e);
+
 #if (DOSELF1 == runner_doself1_density) && defined(WITH_VECTORIZATION) && \
     defined(GADGET2_SPH)
     runner_doself1_density_vec(r, ci);
@@ -2174,7 +2380,8 @@ void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
 
   /* Recurse? */
   if (ci->split && cj->split &&
-      max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+      max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max_sort +
+              cj->dx_max_sort <
           h / 2) {
 
     /* Different types of flags. */
@@ -2378,9 +2585,17 @@ void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
   /* Otherwise, compute the pair directly. */
   else if (cell_is_active(ci, e) || cell_is_active(cj, e)) {
 
+    /* Make sure both cells are drifted to the current timestep. */
+    if (!cell_is_drifted(ci, e)) cell_drift_particles(ci, e);
+    if (!cell_is_drifted(cj, e)) cell_drift_particles(cj, e);
+
     /* Do any of the cells need to be sorted first? */
-    if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
-    if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
+    if (!(ci->sorted & (1 << sid)) ||
+        ci->dx_max_sort > ci->dmin * space_maxreldx)
+      runner_do_sort(r, ci, (1 << sid), 1);
+    if (!(cj->sorted & (1 << sid)) ||
+        cj->dx_max_sort > cj->dmin * space_maxreldx)
+      runner_do_sort(r, cj, (1 << sid), 1);
 
     /* Compute the interactions. */
     DOPAIR2(r, ci, cj);
@@ -2436,15 +2651,6 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
   struct cell *sub = NULL;
   for (int k = 0; k < 8; k++)
     if (ci->progeny[k] != NULL) {
-      // if ( parts[ ind[ 0 ] ].x[0] >= ci->progeny[k]->loc[0] &&
-      //      parts[ ind[ 0 ] ].x[0] <= ci->progeny[k]->loc[0] +
-      // ci->progeny[k]->width[0] &&
-      //      parts[ ind[ 0 ] ].x[1] >= ci->progeny[k]->loc[1] &&
-      //      parts[ ind[ 0 ] ].x[1] <= ci->progeny[k]->loc[1] +
-      // ci->progeny[k]->width[1] &&
-      //      parts[ ind[ 0 ] ].x[2] >= ci->progeny[k]->loc[2] &&
-      //      parts[ ind[ 0 ] ].x[2] <= ci->progeny[k]->loc[2] +
-      // ci->progeny[k]->width[2] ) {
       if (&parts[ind[0]] >= &ci->progeny[k]->parts[0] &&
           &parts[ind[0]] < &ci->progeny[k]->parts[ci->progeny[k]->count]) {
         sub = ci->progeny[k];
@@ -2480,7 +2686,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
 
     /* Recurse? */
     if (ci->split && cj->split &&
-        max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+        max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max_sort +
+                cj->dx_max_sort <
             h / 2) {
 
       /* Get the type of pair if not specified explicitly. */
@@ -3010,59 +3217,13 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
                        : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);
       new_sid = sortlistID[new_sid];
 
-      /* Do any of the cells need to be sorted first? */
-      if (!(cj->sorted & (1 << new_sid)))
-        runner_do_sort(r, cj, (1 << new_sid), 1);
+      /* Do any of the cells need to be drifted first? */
+      if (!cell_is_drifted(cj, e)) cell_drift_particles(cj, e);
 
-      /* Compute the interactions. */
       DOPAIR_SUBSET(r, ci, parts, ind, count, cj);
     }
 
   } /* otherwise, pair interaction. */
 
-  if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
-}
-
-/**
- * @brief Determine which version of DOPAIR1 needs to be called depending on MPI, vectorisation and orientation of the cells or whether DOPAIR1 needs to be called at all.
- *
- * @param r #runner
- * @param ci #cell ci
- * @param cj #cell cj
- *
- */
-void DOPAIR1_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) {
-
-  const struct engine *restrict e = r->e;
-
-#ifdef WITH_MPI
-  if (ci->nodeID != cj->nodeID) {
-    DOPAIR1_NOSORT(r, ci, cj);
-    return;
-  }
-#endif
-
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
-
-  /* Drift cells that are not drifted. */
-  if (!cell_is_drifted(ci, e)) cell_drift_particles(ci, e);
-  if (!cell_is_drifted(cj, e)) cell_drift_particles(cj, e);
-
-  /* Get the sort ID. */
-  double shift[3] = {0.0, 0.0, 0.0};
-  const int sid = space_getsid(e->s, &ci, &cj, shift);
-
-  /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
-    error("Trying to interact unsorted cells.");
-
-#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) && (DOPAIR1_BRANCH == runner_dopair1_density_branch)
- if(!sort_is_corner(sid))
-   runner_dopair1_density_vec(r, ci, cj);
- else
-  DOPAIR1(r, ci, cj);
-#else 
- DOPAIR1(r, ci, cj);
-#endif
+  if (gettimer) TIMER_TOC(timer_dosub_subset);
 }
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index 065e99dba8ee08a6a4fb91245a9d53ffdc7caccc..13a55344d773e7fba000d680eae9866dffdd88e1 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -43,6 +43,10 @@ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) {
 
   TIMER_TIC;
 
+#ifdef SWIFT_DEBUG_CHECKS
+  if (c->ti_old_multipole != e->ti_current) error("c->multipole not drifted.");
+#endif
+
   if (c->split) { /* Node case */
 
     /* Add the field-tensor to all the 8 progenitors */
@@ -53,6 +57,11 @@ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) {
       /* Do we have a progenitor with any active g-particles ? */
       if (cp != NULL && cell_is_active(cp, e)) {
 
+#ifdef SWIFT_DEBUG_CHECKS
+        if (cp->ti_old_multipole != e->ti_current)
+          error("cp->multipole not drifted.");
+#endif
+
         /* Shift the field tensor */
         gravity_L2L(&temp, &c->multipole->pot, cp->multipole->CoM,
                     c->multipole->CoM, 0 * periodic);
@@ -74,8 +83,16 @@ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) {
       struct gpart *gp = &gparts[i];
 
       /* Update if active */
-      if (gpart_is_active(gp, e))
+      if (gpart_is_active(gp, e)) {
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Check that particles have been drifted to the current time */
+        if (gp->ti_drift != e->ti_current)
+          error("gpart not drifted to current time");
+#endif
+
         gravity_L2P(&c->multipole->pot, c->multipole->CoM, gp);
+      }
     }
   }
 
@@ -102,14 +119,17 @@ void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci,
 
   TIMER_TIC;
 
+  /* Anything to do here? */
+  if (!cell_is_active(ci, e)) return;
+
 #ifdef SWIFT_DEBUG_CHECKS
   if (ci == cj) error("Interacting a cell with itself using M2L");
 
   if (multi_j->M_000 == 0.f) error("Multipole does not seem to have been set.");
-#endif
 
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e)) return;
+  if (ci->ti_old_multipole != e->ti_current)
+    error("ci->multipole not drifted.");
+#endif
 
   /* Do we need to drift the multipole ? */
   if (cj->ti_old_multipole != e->ti_current) cell_drift_multipole(cj, e);
@@ -161,6 +181,10 @@ void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) {
   /* Anything to do here? */
   if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
 
+  /* Let's start by drifting things */
+  if (!cell_is_drifted(ci, e)) cell_drift_particles(ci, e);
+  if (!cell_is_drifted(cj, e)) cell_drift_particles(cj, e);
+
 #if ICHECK > 0
   for (int pid = 0; pid < gcount_i; pid++) {
 
@@ -188,60 +212,80 @@ void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) {
   /* MATTHIEU: Should we use local DP accumulators ? */
 
   /* Loop over all particles in ci... */
-  for (int pid = 0; pid < gcount_i; pid++) {
+  if (cell_is_active(ci, e)) {
+    for (int pid = 0; pid < gcount_i; pid++) {
 
-    /* Get a hold of the ith part in ci. */
-    struct gpart *restrict gpi = &gparts_i[pid];
+      /* Get a hold of the ith part in ci. */
+      struct gpart *restrict gpi = &gparts_i[pid];
 
-    if (!gpart_is_active(gpi, e)) continue;
+      if (!gpart_is_active(gpi, e)) continue;
 
-    /* Loop over every particle in the other cell. */
-    for (int pjd = 0; pjd < gcount_j; pjd++) {
+      /* Loop over every particle in the other cell. */
+      for (int pjd = 0; pjd < gcount_j; pjd++) {
 
-      /* Get a hold of the jth part in cj. */
-      const struct gpart *restrict gpj = &gparts_j[pjd];
+        /* Get a hold of the jth part in cj. */
+        const struct gpart *restrict gpj = &gparts_j[pjd];
 
-      /* Compute the pairwise distance. */
-      const float dx[3] = {gpi->x[0] - gpj->x[0],   // x
-                           gpi->x[1] - gpj->x[1],   // y
-                           gpi->x[2] - gpj->x[2]};  // z
-      const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
+        /* Compute the pairwise distance. */
+        const float dx[3] = {gpi->x[0] - gpj->x[0],   // x
+                             gpi->x[1] - gpj->x[1],   // y
+                             gpi->x[2] - gpj->x[2]};  // z
+        const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
 
-      /* Interact ! */
-      runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpi, gpj);
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Check that particles have been drifted to the current time */
+        if (gpi->ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+        if (gpj->ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+#endif
+
+        /* Interact ! */
+        runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpi, gpj);
 
 #ifdef SWIFT_DEBUG_CHECKS
-      gpi->num_interacted++;
+        gpi->num_interacted++;
 #endif
+      }
     }
   }
 
   /* Loop over all particles in cj... */
-  for (int pjd = 0; pjd < gcount_j; pjd++) {
+  if (cell_is_active(cj, e)) {
+    for (int pjd = 0; pjd < gcount_j; pjd++) {
 
-    /* Get a hold of the ith part in ci. */
-    struct gpart *restrict gpj = &gparts_j[pjd];
+      /* Get a hold of the ith part in ci. */
+      struct gpart *restrict gpj = &gparts_j[pjd];
 
-    if (!gpart_is_active(gpj, e)) continue;
+      if (!gpart_is_active(gpj, e)) continue;
 
-    /* Loop over every particle in the other cell. */
-    for (int pid = 0; pid < gcount_i; pid++) {
+      /* Loop over every particle in the other cell. */
+      for (int pid = 0; pid < gcount_i; pid++) {
 
-      /* Get a hold of the ith part in ci. */
-      const struct gpart *restrict gpi = &gparts_i[pid];
+        /* Get a hold of the ith part in ci. */
+        const struct gpart *restrict gpi = &gparts_i[pid];
 
-      /* Compute the pairwise distance. */
-      const float dx[3] = {gpj->x[0] - gpi->x[0],   // x
-                           gpj->x[1] - gpi->x[1],   // y
-                           gpj->x[2] - gpi->x[2]};  // z
-      const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
+        /* Compute the pairwise distance. */
+        const float dx[3] = {gpj->x[0] - gpi->x[0],   // x
+                             gpj->x[1] - gpi->x[1],   // y
+                             gpj->x[2] - gpi->x[2]};  // z
+        const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
 
-      /* Interact ! */
-      runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpj, gpi);
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Check that particles have been drifted to the current time */
+        if (gpi->ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+        if (gpj->ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+#endif
+
+        /* Interact ! */
+        runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpj, gpi);
 
 #ifdef SWIFT_DEBUG_CHECKS
-      gpj->num_interacted++;
+        gpj->num_interacted++;
 #endif
+      }
     }
   }
 
@@ -273,6 +317,9 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) {
   /* Anything to do here? */
   if (!cell_is_active(c, e)) return;
 
+  /* Do we need to start by drifting things ? */
+  if (!cell_is_drifted(c, e)) cell_drift_particles(c, e);
+
 #if ICHECK > 0
   for (int pid = 0; pid < gcount; pid++) {
 
@@ -306,6 +353,14 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) {
                      gpi->x[2] - gpj->x[2]};  // z
       const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
 
+#ifdef SWIFT_DEBUG_CHECKS
+      /* Check that particles have been drifted to the current time */
+      if (gpi->ti_drift != e->ti_current)
+        error("gpi not drifted to current time");
+      if (gpj->ti_drift != e->ti_current)
+        error("gpj not drifted to current time");
+#endif
+
       /* Interact ! */
       if (gpart_is_active(gpi, e) && gpart_is_active(gpj, e)) {
 
@@ -403,7 +458,8 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
   TIMER_TIC;
 
   /* Can we use M-M interactions ? */
-  if (gravity_multipole_accept(ci->multipole, cj->multipole, theta_crit_inv)) {
+  if (gravity_multipole_accept(ci->multipole, cj->multipole, theta_crit_inv,
+                               0)) {
     /* MATTHIEU: make a symmetric M-M interaction function ! */
     runner_dopair_grav_mm(r, ci, cj);
     runner_dopair_grav_mm(r, cj, ci);
@@ -527,11 +583,6 @@ void runner_dosub_grav(struct runner *r, struct cell *ci, struct cell *cj,
 
   } else {
 
-#ifdef SWIFT_DEBUG_CHECKS
-    if (!cell_are_neighbours(ci, cj))
-      error("Non-neighbouring cells in pair task !");
-#endif
-
     runner_dopair_grav(r, ci, cj, 1);
   }
 }
@@ -588,11 +639,20 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, int timer) {
     if (ci == cj || cj->gcount == 0) continue;
 
     /* Check the multipole acceptance criterion */
-    if (gravity_multipole_accept(ci->multipole, cj->multipole,
-                                 theta_crit_inv)) {
+    if (gravity_multipole_accept(ci->multipole, cj->multipole, theta_crit_inv,
+                                 0)) {
+
       /* Go for a (non-symmetric) M-M calculation */
       runner_dopair_grav_mm(r, ci, cj);
     }
+    /* Is the criterion violated now but was OK at the last rebuild ? */
+    else if (gravity_multipole_accept(ci->multipole, cj->multipole,
+                                      theta_crit_inv, 1)) {
+
+      /* Alright, we have to take charge of that pair in a different way. */
+      // MATTHIEU: We should actually open the tree-node here and recurse.
+      runner_dopair_grav_mm(r, ci, cj);
+    }
   }
 
   if (timer) TIMER_TOC(timer_dograv_long_range);
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index fd15fc9e775f7a05d45056c23c6567aa3ac9d0c4..c55a482cfb0244ccf78dadc4f98031b9f36f64e0 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -50,9 +50,6 @@ static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);
 //  printf("]\n");
 //}
 
-#endif
-
-#ifdef WITH_VECTORIZATION
 /**
  * @brief Compute the vector remainder interactions from the secondary cache.
  *
@@ -589,7 +586,8 @@ __attribute__((always_inline)) INLINE static void populate_max_d_no_cache(
 
   int first_pi = 0, last_pj = cj->count - 1;
 
-  /* Find the first active particle in ci to interact with any particle in cj. */
+  /* Find the first active particle in ci to interact with any particle in cj.
+   */
   /* Populate max_di with distances. */
   int active_id = ci->count - 1;
   for (int k = ci->count - 1; k >= 0; k--) {
@@ -599,20 +597,19 @@ __attribute__((always_inline)) INLINE static void populate_max_d_no_cache(
 
     max_di[k] = d;
 
-    /* If the particle is out of range set the index to 
+    /* If the particle is out of range set the index to
      * the last active particle within range. */
     if (d < dj_min) {
       first_pi = active_id;
       break;
-    }
-    else {
-      if(part_is_active(p,e)) active_id = k;
+    } else {
+      if (part_is_active(p, e)) active_id = k;
     }
   }
 
   /* Find the maximum distance of pi particles into cj.*/
-  for (int k = first_pi; k < ci->count; k++) {
-    max_di[k] = fmaxf(max_di[k - 1],max_di[k]);
+  for (int k = first_pi + 1; k < ci->count; k++) {
+    max_di[k] = fmaxf(max_di[k - 1], max_di[k]);
   }
 
   /* Find the last particle in cj to interact with any particle in ci. */
@@ -622,17 +619,16 @@ __attribute__((always_inline)) INLINE static void populate_max_d_no_cache(
     p = &parts_j[sort_j[k].i];
     h = p->h;
     d = sort_j[k].d - h * kernel_gamma - dx_max - rshift;
-    
+
     max_dj[k] = d;
-    
-    /* If the particle is out of range set the index to 
+
+    /* If the particle is out of range set the index to
      * the last active particle within range. */
     if (d > di_max) {
       last_pj = active_id;
       break;
-    }
-    else {
-      if(part_is_active(p,e)) active_id = k;
+    } else {
+      if (part_is_active(p, e)) active_id = k;
     }
   }
 
@@ -672,7 +668,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
 
   if (!cell_is_active(c, e)) return;
 
-  if (!cell_is_drifted(c, e)) cell_drift_particles(c, e);
+  if (!cell_is_drifted(c, e)) error("Interacting undrifted cell.");
 
   /* Get the particle cache from the runner and re-allocate
    * the cache if it is not big enough for the cell. */
@@ -1615,16 +1611,18 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   /* Anything to do here? */
   if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
 
-  if (!cell_is_drifted(ci, e)) cell_drift_particles(ci, e);
-  if (!cell_is_drifted(cj, e)) cell_drift_particles(cj, e);
+  if (!cell_is_drifted(ci, e) || !cell_is_drifted(cj, e))
+    error("Interacting undrifted cells.");
 
   /* Get the sort ID. */
   double shift[3] = {0.0, 0.0, 0.0};
   const int sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
-    error("Trying to interact unsorted cells.");
+  if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin)
+    runner_do_sort(r, ci, (1 << sid), 1);
+  if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin)
+    runner_do_sort(r, cj, (1 << sid), 1);
 
   /* Get the cutoff shift. */
   double rshift = 0.0;
@@ -1634,6 +1632,29 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
   const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
 
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the dx_max_sort values in the cell are indeed an upper
+     bound on particle movement. */
+  for (int pid = 0; pid < ci->count; pid++) {
+    const struct part *p = &ci->parts[sort_i[pid].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
+        1.0e-6 * max(fabsf(d), ci->dx_max_sort))
+      error("particle shift diff exceeds dx_max_sort.");
+  }
+  for (int pjd = 0; pjd < cj->count; pjd++) {
+    const struct part *p = &cj->parts[sort_j[pjd].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
+        1.0e-6 * max(fabsf(d), cj->dx_max_sort))
+      error("particle shift diff exceeds dx_max_sort.");
+  }
+#endif /* SWIFT_DEBUG_CHECKS */
+
   /* Get some other useful values. */
   const int count_i = ci->count;
   const int count_j = cj->count;
@@ -1643,31 +1664,31 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   struct part *restrict parts_j = cj->parts;
   const double di_max = sort_i[count_i - 1].d - rshift;
   const double dj_min = sort_j[0].d;
-  const float dx_max = (ci->dx_max + cj->dx_max);
+  const float dx_max = (ci->dx_max_sort + cj->dx_max_sort);
 
   /* Check if any particles are active and return if there are not. */
-  int numActive = 0; 
-  for (int pid = count_i - 1; 
-      pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
+  int numActive = 0;
+  for (int pid = count_i - 1;
+       pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
     struct part *restrict pi = &parts_i[sort_i[pid].i];
     if (part_is_active(pi, e)) {
       numActive++;
       break;
-    }    
+    }
   }
 
-  if(!numActive) {
+  if (!numActive) {
     for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
-        pjd++) {
+         pjd++) {
       struct part *restrict pj = &parts_j[sort_j[pjd].i];
       if (part_is_active(pj, e)) {
         numActive++;
         break;
-      }    
-    }    
+      }
+    }
   }
 
-  if(numActive == 0) return;
+  if (numActive == 0) return;
 
   /* Get both particle caches from the runner and re-allocate
    * them if they are not big enough for the cells. */
@@ -1692,7 +1713,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   /* Also find the first pi that interacts with any particle in cj and the last
    * pj that interacts with any particle in ci. */
   populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di,
-      max_dj, &first_pi, &last_pj, e);
+                          max_dj, &first_pi, &last_pj, e);
 
   /* Find the maximum index into cj that is required by a particle in ci. */
   /* Find the maximum index into ci that is required by a particle in cj. */
@@ -1727,8 +1748,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   int first_pi_align = first_pi;
   int last_pj_align = last_pj;
   cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i,
-      sort_j, shift, &first_pi_align,
-      &last_pj_align, 1);
+                                      sort_j, shift, &first_pi_align,
+                                      &last_pj_align, 1);
 
   /* Get the number of particles read into the ci cache. */
   int ci_cache_count = count_i - first_pi_align;
@@ -1772,7 +1793,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
       /* Reset cumulative sums of update vectors. */
       vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-             curlvySum, curlvzSum;
+          curlvySum, curlvzSum;
 
       /* Get the inverse of hi. */
       vector v_hi_inv;
@@ -1809,8 +1830,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
         vector v_dx, v_dy, v_dz, v_r2;
 
 #ifdef SWIFT_DEBUG_CHECKS
-        if (cj_cache_idx%VEC_SIZE != 0 || cj_cache_idx < 0) {
-          error("Unaligned read!!! cj_cache_idx=%d",cj_cache_idx);
+        if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0) {
+          error("Unaligned read!!! cj_cache_idx=%d", cj_cache_idx);
         }
 #endif
 
@@ -1848,7 +1869,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 #ifdef HAVE_AVX512_F
               knl_mask);
 #else
-        0);
+              0);
 #endif
 
       } /* loop over the parts in cj. */
@@ -1906,7 +1927,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
       /* Reset cumulative sums of update vectors. */
       vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-             curlvySum, curlvzSum;
+          curlvySum, curlvzSum;
 
       /* Get the inverse of hj. */
       vector v_hj_inv;
@@ -1929,17 +1950,18 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
       /* Pad the exit iteration align so cache reads are aligned. */
       int rem = exit_iteration_align % VEC_SIZE;
-      if ( exit_iteration_align < VEC_SIZE) {
+      if (exit_iteration_align < VEC_SIZE) {
         exit_iteration_align = 0;
-      }
-      else exit_iteration_align -= rem;
+      } else
+        exit_iteration_align -= rem;
 
       /* Loop over the parts in ci. */
-      for (int ci_cache_idx = exit_iteration_align; ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) {
+      for (int ci_cache_idx = exit_iteration_align;
+           ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) {
 
 #ifdef SWIFT_DEBUG_CHECKS
-        if (ci_cache_idx%VEC_SIZE != 0 || ci_cache_idx < 0) {
-          error("Unaligned read!!! ci_cache_idx=%d",ci_cache_idx);
+        if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0) {
+          error("Unaligned read!!! ci_cache_idx=%d", ci_cache_idx);
         }
 #endif
 
@@ -1979,12 +2001,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 #ifdef HAVE_AVX512_F
               knl_mask);
 #else
-        0);
+              0);
 #endif
 
       } /* loop over the parts in ci. */
 
-      /* Perform horizontal adds on vector sums and store result in particle pj. */
+      /* Perform horizontal adds on vector sums and store result in particle pj.
+       */
       VEC_HADD(rhoSum, pj->rho);
       VEC_HADD(rho_dhSum, pj->density.rho_dh);
       VEC_HADD(wcountSum, pj->density.wcount);
@@ -1995,9 +2018,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       VEC_HADD(curlvzSum, pj->density.rot_v[2]);
 
     } /* loop over the parts in cj. */
-    
-    TIMER_TOC(timer_dopair_density);
 
+    TIMER_TOC(timer_dopair_density);
   }
 
 #endif /* WITH_VECTORIZATION */
diff --git a/src/scheduler.c b/src/scheduler.c
index 62fce53762e0b3c84396dfb9c54190199b41753b..9cf1598f6d3bfd31b1120058d85b18edf44c427f 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -65,6 +65,11 @@ void scheduler_clear_active(struct scheduler *s) { s->active_count = 0; }
  */
 void scheduler_addunlock(struct scheduler *s, struct task *ta,
                          struct task *tb) {
+#ifdef SWIFT_DEBUG_CHECKS
+  if (ta == NULL) error("Unlocking task is NULL.");
+  if (tb == NULL) error("Unlocked task is NULL.");
+#endif
+
   /* Get an index at which to store this unlock. */
   const int ind = atomic_inc(&s->nr_unlocks);
 
@@ -136,8 +141,7 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
         ((t->type == task_type_kick1) && t->ci->nodeID != s->nodeID) ||
         ((t->type == task_type_kick2) && t->ci->nodeID != s->nodeID) ||
         ((t->type == task_type_drift) && t->ci->nodeID != s->nodeID) ||
-        ((t->type == task_type_timestep) && t->ci->nodeID != s->nodeID) ||
-        ((t->type == task_type_init) && t->ci->nodeID != s->nodeID)) {
+        ((t->type == task_type_timestep) && t->ci->nodeID != s->nodeID)) {
       t->type = task_type_none;
       t->subtype = task_subtype_none;
       t->cj = NULL;
@@ -169,6 +173,19 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
           /* convert to a self-subtask. */
           t->type = task_type_sub_self;
 
+          /* Make sure we have a drift task (MATTHIEU temp. fix for gravity) */
+          if (t->subtype == task_subtype_grav ||
+              t->subtype == task_subtype_external_grav) {
+            lock_lock(&ci->lock);
+            if (ci->drift == NULL)
+              ci->drift = scheduler_addtask(s, task_type_drift,
+                                            task_subtype_none, 0, 0, ci, NULL);
+            lock_unlock_blind(&ci->lock);
+          }
+
+          /* Depend on local sorts on this cell. */
+          if (ci->sorts != NULL) scheduler_addunlock(s, ci->sorts, t);
+
           /* Otherwise, make tasks explicitly. */
         } else {
 
@@ -183,7 +200,7 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
             if (ci->progeny[k] != NULL)
               scheduler_splittask(
                   scheduler_addtask(s, task_type_self, t->subtype, 0, 0,
-                                    ci->progeny[k], NULL, 0),
+                                    ci->progeny[k], NULL),
                   s);
 
           /* Make a task for each pair of progeny unless it's ext. gravity. */
@@ -196,12 +213,21 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
                     scheduler_splittask(
                         scheduler_addtask(s, task_type_pair, t->subtype,
                                           pts[j][k], 0, ci->progeny[j],
-                                          ci->progeny[k], 0),
+                                          ci->progeny[k]),
                         s);
           }
         }
       }
 
+      /* Otherwise, make sure the self task has a drift task. */
+      else {
+        lock_lock(&ci->lock);
+        if (ci->drift == NULL)
+          ci->drift = scheduler_addtask(s, task_type_drift, task_subtype_none,
+                                        0, 0, ci, NULL);
+        lock_unlock_blind(&ci->lock);
+      }
+
       /* Pair interaction? */
     } else if (t->type == task_type_pair && t->subtype != task_subtype_grav) {
 
@@ -235,6 +261,10 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
           /* Make this task a sub task. */
           t->type = task_type_sub_pair;
 
+          /* Depend on the sort tasks of both cells. */
+          if (ci->sorts != NULL) scheduler_addunlock(s, ci->sorts, t);
+          if (cj->sorts != NULL) scheduler_addunlock(s, cj->sorts, t);
+
           /* Otherwise, split it. */
         } else {
 
@@ -254,18 +284,17 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
               t->ci = ci->progeny[6];
               t->cj = cj->progeny[0];
               t->flags = 1;
-              t->tight = 1;
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 1, 0,
-                                    ci->progeny[7], cj->progeny[1], 1),
+                                    ci->progeny[7], cj->progeny[1]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
-                                    ci->progeny[6], cj->progeny[1], 1),
+                                    ci->progeny[6], cj->progeny[1]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
-                                    ci->progeny[7], cj->progeny[0], 1),
+                                    ci->progeny[7], cj->progeny[0]),
                   s);
               break;
 
@@ -273,25 +302,23 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
               t->ci = ci->progeny[6];
               t->cj = cj->progeny[1];
               t->flags = 2;
-              t->tight = 1;
               break;
 
             case 3: /* (  1 ,  0 ,  1 ) */
               t->ci = ci->progeny[5];
               t->cj = cj->progeny[0];
               t->flags = 3;
-              t->tight = 1;
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 3, 0,
-                                    ci->progeny[7], cj->progeny[2], 1),
+                                    ci->progeny[7], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
-                                    ci->progeny[5], cj->progeny[2], 1),
+                                    ci->progeny[5], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 6, 0,
-                                    ci->progeny[7], cj->progeny[0], 1),
+                                    ci->progeny[7], cj->progeny[0]),
                   s);
               break;
 
@@ -299,66 +326,65 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
               t->ci = ci->progeny[4];
               t->cj = cj->progeny[0];
               t->flags = 4;
-              t->tight = 1;
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 5, 0,
-                                    ci->progeny[5], cj->progeny[0], 1),
+                                    ci->progeny[5], cj->progeny[0]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
-                                    ci->progeny[6], cj->progeny[0], 1),
+                                    ci->progeny[6], cj->progeny[0]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
-                                    ci->progeny[7], cj->progeny[0], 1),
+                                    ci->progeny[7], cj->progeny[0]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 3, 0,
-                                    ci->progeny[4], cj->progeny[1], 1),
+                                    ci->progeny[4], cj->progeny[1]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 4, 0,
-                                    ci->progeny[5], cj->progeny[1], 1),
+                                    ci->progeny[5], cj->progeny[1]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 6, 0,
-                                    ci->progeny[6], cj->progeny[1], 1),
+                                    ci->progeny[6], cj->progeny[1]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
-                                    ci->progeny[7], cj->progeny[1], 1),
+                                    ci->progeny[7], cj->progeny[1]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 1, 0,
-                                    ci->progeny[4], cj->progeny[2], 1),
+                                    ci->progeny[4], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
-                                    ci->progeny[5], cj->progeny[2], 1),
+                                    ci->progeny[5], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 4, 0,
-                                    ci->progeny[6], cj->progeny[2], 1),
+                                    ci->progeny[6], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 5, 0,
-                                    ci->progeny[7], cj->progeny[2], 1),
+                                    ci->progeny[7], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
-                                    ci->progeny[4], cj->progeny[3], 1),
+                                    ci->progeny[4], cj->progeny[3]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 1, 0,
-                                    ci->progeny[5], cj->progeny[3], 1),
+                                    ci->progeny[5], cj->progeny[3]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 3, 0,
-                                    ci->progeny[6], cj->progeny[3], 1),
+                                    ci->progeny[6], cj->progeny[3]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 4, 0,
-                                    ci->progeny[7], cj->progeny[3], 1),
+                                    ci->progeny[7], cj->progeny[3]),
                   s);
               break;
 
@@ -366,18 +392,17 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
               t->ci = ci->progeny[4];
               t->cj = cj->progeny[1];
               t->flags = 5;
-              t->tight = 1;
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 5, 0,
-                                    ci->progeny[6], cj->progeny[3], 1),
+                                    ci->progeny[6], cj->progeny[3]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
-                                    ci->progeny[4], cj->progeny[3], 1),
+                                    ci->progeny[4], cj->progeny[3]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
-                                    ci->progeny[6], cj->progeny[1], 1),
+                                    ci->progeny[6], cj->progeny[1]),
                   s);
               break;
 
@@ -385,25 +410,23 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
               t->ci = ci->progeny[5];
               t->cj = cj->progeny[2];
               t->flags = 6;
-              t->tight = 1;
               break;
 
             case 7: /* (  1 , -1 ,  0 ) */
               t->ci = ci->progeny[4];
               t->cj = cj->progeny[3];
               t->flags = 6;
-              t->tight = 1;
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
-                                    ci->progeny[5], cj->progeny[2], 1),
+                                    ci->progeny[5], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
-                                    ci->progeny[4], cj->progeny[2], 1),
+                                    ci->progeny[4], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
-                                    ci->progeny[5], cj->progeny[3], 1),
+                                    ci->progeny[5], cj->progeny[3]),
                   s);
               break;
 
@@ -411,25 +434,23 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
               t->ci = ci->progeny[4];
               t->cj = cj->progeny[3];
               t->flags = 8;
-              t->tight = 1;
               break;
 
             case 9: /* (  0 ,  1 ,  1 ) */
               t->ci = ci->progeny[3];
               t->cj = cj->progeny[0];
               t->flags = 9;
-              t->tight = 1;
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 9, 0,
-                                    ci->progeny[7], cj->progeny[4], 1),
+                                    ci->progeny[7], cj->progeny[4]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
-                                    ci->progeny[3], cj->progeny[4], 1),
+                                    ci->progeny[3], cj->progeny[4]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
-                                    ci->progeny[7], cj->progeny[0], 1),
+                                    ci->progeny[7], cj->progeny[0]),
                   s);
               break;
 
@@ -437,66 +458,65 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
               t->ci = ci->progeny[2];
               t->cj = cj->progeny[0];
               t->flags = 10;
-              t->tight = 1;
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 11, 0,
-                                    ci->progeny[3], cj->progeny[0], 1),
+                                    ci->progeny[3], cj->progeny[0]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
-                                    ci->progeny[6], cj->progeny[0], 1),
+                                    ci->progeny[6], cj->progeny[0]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 6, 0,
-                                    ci->progeny[7], cj->progeny[0], 1),
+                                    ci->progeny[7], cj->progeny[0]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 9, 0,
-                                    ci->progeny[2], cj->progeny[1], 1),
+                                    ci->progeny[2], cj->progeny[1]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 10, 0,
-                                    ci->progeny[3], cj->progeny[1], 1),
+                                    ci->progeny[3], cj->progeny[1]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
-                                    ci->progeny[6], cj->progeny[1], 1),
+                                    ci->progeny[6], cj->progeny[1]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 7, 0,
-                                    ci->progeny[7], cj->progeny[1], 1),
+                                    ci->progeny[7], cj->progeny[1]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 1, 0,
-                                    ci->progeny[2], cj->progeny[4], 1),
+                                    ci->progeny[2], cj->progeny[4]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
-                                    ci->progeny[3], cj->progeny[4], 1),
+                                    ci->progeny[3], cj->progeny[4]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 10, 0,
-                                    ci->progeny[6], cj->progeny[4], 1),
+                                    ci->progeny[6], cj->progeny[4]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 11, 0,
-                                    ci->progeny[7], cj->progeny[4], 1),
+                                    ci->progeny[7], cj->progeny[4]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
-                                    ci->progeny[2], cj->progeny[5], 1),
+                                    ci->progeny[2], cj->progeny[5]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 1, 0,
-                                    ci->progeny[3], cj->progeny[5], 1),
+                                    ci->progeny[3], cj->progeny[5]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 9, 0,
-                                    ci->progeny[6], cj->progeny[5], 1),
+                                    ci->progeny[6], cj->progeny[5]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 10, 0,
-                                    ci->progeny[7], cj->progeny[5], 1),
+                                    ci->progeny[7], cj->progeny[5]),
                   s);
               break;
 
@@ -504,18 +524,17 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
               t->ci = ci->progeny[2];
               t->cj = cj->progeny[1];
               t->flags = 11;
-              t->tight = 1;
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 11, 0,
-                                    ci->progeny[6], cj->progeny[5], 1),
+                                    ci->progeny[6], cj->progeny[5]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
-                                    ci->progeny[2], cj->progeny[5], 1),
+                                    ci->progeny[2], cj->progeny[5]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 6, 0,
-                                    ci->progeny[6], cj->progeny[1], 1),
+                                    ci->progeny[6], cj->progeny[1]),
                   s);
               break;
 
@@ -523,66 +542,65 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
               t->ci = ci->progeny[1];
               t->cj = cj->progeny[0];
               t->flags = 12;
-              t->tight = 1;
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 11, 0,
-                                    ci->progeny[3], cj->progeny[0], 1),
+                                    ci->progeny[3], cj->progeny[0]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 5, 0,
-                                    ci->progeny[5], cj->progeny[0], 1),
+                                    ci->progeny[5], cj->progeny[0]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 2, 0,
-                                    ci->progeny[7], cj->progeny[0], 1),
+                                    ci->progeny[7], cj->progeny[0]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 9, 0,
-                                    ci->progeny[1], cj->progeny[2], 1),
+                                    ci->progeny[1], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 12, 0,
-                                    ci->progeny[3], cj->progeny[2], 1),
+                                    ci->progeny[3], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 8, 0,
-                                    ci->progeny[5], cj->progeny[2], 1),
+                                    ci->progeny[5], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 5, 0,
-                                    ci->progeny[7], cj->progeny[2], 1),
+                                    ci->progeny[7], cj->progeny[2]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 3, 0,
-                                    ci->progeny[1], cj->progeny[4], 1),
+                                    ci->progeny[1], cj->progeny[4]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 6, 0,
-                                    ci->progeny[3], cj->progeny[4], 1),
+                                    ci->progeny[3], cj->progeny[4]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 12, 0,
-                                    ci->progeny[5], cj->progeny[4], 1),
+                                    ci->progeny[5], cj->progeny[4]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 11, 0,
-                                    ci->progeny[7], cj->progeny[4], 1),
+                                    ci->progeny[7], cj->progeny[4]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
-                                    ci->progeny[1], cj->progeny[6], 1),
+                                    ci->progeny[1], cj->progeny[6]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 3, 0,
-                                    ci->progeny[3], cj->progeny[6], 1),
+                                    ci->progeny[3], cj->progeny[6]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 9, 0,
-                                    ci->progeny[5], cj->progeny[6], 1),
+                                    ci->progeny[5], cj->progeny[6]),
                   s);
               scheduler_splittask(
                   scheduler_addtask(s, task_type_pair, t->subtype, 12, 0,
-                                    ci->progeny[7], cj->progeny[6], 1),
+                                    ci->progeny[7], cj->progeny[6]),
                   s);
               break;
           } /* switch(sid) */
@@ -604,7 +622,7 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
               if (cj->progeny[k] != NULL) {
                 struct task *tl =
                     scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
-                                      ci->progeny[j], cj->progeny[k], 0);
+                                      ci->progeny[j], cj->progeny[k]);
                 scheduler_splittask(tl, s);
                 tl->flags = space_getsid(s->space, &t->ci, &t->cj, shift);
               }
@@ -612,11 +630,14 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
         /* Otherwise, if not spilt, stitch-up the sorting. */
       } else {
 
-        /* Create the sort for ci. */
+        /* Create the drift and sort for ci. */
         lock_lock(&ci->lock);
+        if (ci->drift == NULL && ci->nodeID == engine_rank)
+          ci->drift = scheduler_addtask(s, task_type_drift, task_subtype_none,
+                                        0, 0, ci, NULL);
         if (ci->sorts == NULL)
           ci->sorts = scheduler_addtask(s, task_type_sort, task_subtype_none,
-                                        1 << sid, 0, ci, NULL, 0);
+                                        1 << sid, 0, ci, NULL);
         else
           ci->sorts->flags |= (1 << sid);
         lock_unlock_blind(&ci->lock);
@@ -624,9 +645,12 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
 
         /* Create the sort for cj. */
         lock_lock(&cj->lock);
+        if (cj->drift == NULL && cj->nodeID == engine_rank)
+          cj->drift = scheduler_addtask(s, task_type_drift, task_subtype_none,
+                                        0, 0, cj, NULL);
         if (cj->sorts == NULL)
           cj->sorts = scheduler_addtask(s, task_type_sort, task_subtype_none,
-                                        1 << sid, 0, cj, NULL, 0);
+                                        1 << sid, 0, cj, NULL);
         else
           cj->sorts->flags |= (1 << sid);
         lock_unlock_blind(&cj->lock);
@@ -690,11 +714,10 @@ void scheduler_splittasks(struct scheduler *s) {
  * @param wait The number of unsatisfied dependencies of this task.
  * @param ci The first cell to interact.
  * @param cj The second cell to interact.
- * @param tight
  */
 struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
                                enum task_subtypes subtype, int flags, int wait,
-                               struct cell *ci, struct cell *cj, int tight) {
+                               struct cell *ci, struct cell *cj) {
 
 #ifdef SWIFT_DEBUG_CHECKS
   if (ci == NULL && cj != NULL)
@@ -719,7 +742,6 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
   t->ci = ci;
   t->cj = cj;
   t->skip = 1; /* Mark tasks as skip by default. */
-  t->tight = tight;
   t->implicit = 0;
   t->weight = 0;
   t->rank = 0;
@@ -854,13 +876,13 @@ void scheduler_ranktasks(struct scheduler *s) {
     }
 
   /* Main loop. */
-  for (int j = 0, rank = 0; left < nr_tasks; rank++) {
+  for (int j = 0, rank = 0; j < nr_tasks; rank++) {
 
     /* Did we get anything? */
     if (j == left) error("Unsatisfiable task dependencies detected.");
-    const int left_old = left;
 
     /* Unlock the next layer of tasks. */
+    const int left_old = left;
     for (; j < left_old; j++) {
       struct task *t = &tasks[tid[j]];
       t->rank = rank;
@@ -876,14 +898,14 @@ void scheduler_ranktasks(struct scheduler *s) {
       }
     }
 
-    /* Move back to the old left (like Sanders). */
+    /* Move back to the old left (like Sanders!). */
     j = left_old;
   }
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the tasks were ranked correctly. */
   for (int k = 1; k < s->nr_tasks; k++)
-    if (tasks[tid[k - 1]].rank > tasks[tid[k - 1]].rank)
+    if (tasks[tid[k - 1]].rank > tasks[tid[k]].rank)
       error("Task ranking failed.");
 #endif
 }
@@ -1002,9 +1024,6 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
       case task_type_timestep:
         cost = wscale * t->ci->count;
         break;
-      case task_type_init:
-        cost = wscale * t->ci->count;
-        break;
       default:
         cost = 0;
         break;
@@ -1088,6 +1107,15 @@ void scheduler_enqueue_mapper(void *map_data, int num_elements,
  */
 void scheduler_start(struct scheduler *s) {
 
+/* Reset all task debugging timers */
+#ifdef SWIFT_DEBUG_TASKS
+  for (int i = 0; i < s->nr_tasks; ++i) {
+    s->tasks[i].tic = 0;
+    s->tasks[i].toc = 0;
+    s->tasks[i].rid = -1;
+  }
+#endif
+
   /* Re-wait the tasks. */
   if (s->active_count > 1000) {
     threadpool_map(s->threadpool, scheduler_rewait_mapper, s->tid_active,
@@ -1121,7 +1149,7 @@ void scheduler_start(struct scheduler *s) {
       } else if (cj == NULL) { /* self */
 
         if (ci->ti_end_min == ti_current && t->skip &&
-            t->type != task_type_sort && t->type)
+            t->type != task_type_sort && t->type != task_type_drift && t->type)
           error(
               "Task (type='%s/%s') should not have been skipped "
               "ti_current=%lld "
@@ -1214,7 +1242,6 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
       case task_type_kick2:
       case task_type_drift:
       case task_type_timestep:
-      case task_type_init:
         qid = t->ci->super->owner;
         break;
       case task_type_pair:
diff --git a/src/scheduler.h b/src/scheduler.h
index f2225f5f5b8d0a5db54eb8506e02d78b14f4bb88..7bf9a40e7cec89eb25dfa6ce7a56912bf3a9e639 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -134,7 +134,7 @@ void scheduler_ranktasks(struct scheduler *s);
 void scheduler_reweight(struct scheduler *s, int verbose);
 struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
                                enum task_subtypes subtype, int flags, int wait,
-                               struct cell *ci, struct cell *cj, int tight);
+                               struct cell *ci, struct cell *cj);
 void scheduler_splittasks(struct scheduler *s);
 struct task *scheduler_done(struct scheduler *s, struct task *t);
 struct task *scheduler_unlock(struct scheduler *s, struct task *t);
diff --git a/src/space.c b/src/space.c
index 0ea1e4ab3299c29d3b47a1af718dc9cdcb47b285..0c67fe27ac1c7b2cae3672e7f0b1ce82be5fb804 100644
--- a/src/space.c
+++ b/src/space.c
@@ -206,11 +206,11 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements,
     c->force = NULL;
     c->grav = NULL;
     c->dx_max = 0.0f;
+    c->dx_max_sort = 0.0f;
     c->sorted = 0;
     c->count = 0;
     c->gcount = 0;
     c->scount = 0;
-    c->init = NULL;
     c->init_grav = NULL;
     c->extra_ghost = NULL;
     c->ghost = NULL;
@@ -2027,6 +2027,7 @@ void space_split_recursive(struct space *s, struct cell *c,
       cp->split = 0;
       cp->h_max = 0.0;
       cp->dx_max = 0.f;
+      cp->dx_max_sort = 0.f;
       cp->nodeID = c->nodeID;
       cp->parent = c;
       cp->super = NULL;
@@ -2091,8 +2092,7 @@ void space_split_recursive(struct space *s, struct cell *c,
           const struct multipole *m = &cp->multipole->m_pole;
 
           /* Contribution to multipole */
-          gravity_M2M(&temp, m, c->multipole->CoM, cp->multipole->CoM,
-                      s->periodic);
+          gravity_M2M(&temp, m, c->multipole->CoM, cp->multipole->CoM);
           gravity_multipole_add(&c->multipole->m_pole, &temp);
 
           /* Upper limit of max CoM<->gpart distance */
@@ -2116,6 +2116,10 @@ void space_split_recursive(struct space *s, struct cell *c,
 
       /* Take minimum of both limits */
       c->multipole->r_max = min(r_max, sqrt(dx * dx + dy * dy + dz * dz));
+      c->multipole->r_max_rebuild = c->multipole->r_max;
+      c->multipole->CoM_rebuild[0] = c->multipole->CoM[0];
+      c->multipole->CoM_rebuild[1] = c->multipole->CoM[1];
+      c->multipole->CoM_rebuild[2] = c->multipole->CoM[2];
     } /* Deal with gravity */
   }
 
@@ -2194,6 +2198,10 @@ void space_split_recursive(struct space *s, struct cell *c,
         c->multipole->CoM[2] = c->loc[2] + c->width[2] / 2.;
         c->multipole->r_max = 0.;
       }
+      c->multipole->r_max_rebuild = c->multipole->r_max;
+      c->multipole->CoM_rebuild[0] = c->multipole->CoM[0];
+      c->multipole->CoM_rebuild[1] = c->multipole->CoM[1];
+      c->multipole->CoM_rebuild[2] = c->multipole->CoM[2];
     }
   }
 
diff --git a/src/space.h b/src/space.h
index 8674c39e110694ec303584627840a7ee47644552..c5f588563e5a9fb4b6cb73ac1446514f8149794f 100644
--- a/src/space.h
+++ b/src/space.h
@@ -45,7 +45,7 @@
 #define space_maxcount_default 10000
 #define space_max_top_level_cells_default 12
 #define space_stretch 1.10f
-#define space_maxreldx 0.25f
+#define space_maxreldx 0.1f
 
 /* Maximum allowed depth of cell splits. */
 #define space_cell_maxdepth 52
diff --git a/src/statistics.c b/src/statistics.c
index 297d88c1f25c1b5b42be8edcd1282fd437964894..57d60bcb1b247c9616c859b7ac8a475acdcd878f 100644
--- a/src/statistics.c
+++ b/src/statistics.c
@@ -35,6 +35,7 @@
 #include "cooling.h"
 #include "engine.h"
 #include "error.h"
+#include "gravity.h"
 #include "hydro.h"
 #include "threadpool.h"
 
@@ -164,10 +165,12 @@ void stats_collect_part_mapper(void *map_data, int nr_parts, void *extra_data) {
     stats.E_kin += 0.5f * m * (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
     stats.E_int += m * hydro_get_internal_energy(p);
     stats.E_rad += cooling_get_radiated_energy(xp);
-    stats.E_pot_self += 0.f;
-    if (gp != NULL)
+    if (gp != NULL) {
+      stats.E_pot_self += m * gravity_get_potential(gp);
       stats.E_pot_ext += m * external_gravity_get_potential_energy(
                                  time, potential, phys_const, gp);
+    }
+
     /* Collect entropy */
     stats.entropy += m * hydro_get_entropy(p);
   }
@@ -224,7 +227,7 @@ void stats_collect_gpart_mapper(void *map_data, int nr_gparts,
                         gp->v_full[1] + gp->a_grav[1] * dt,
                         gp->v_full[2] + gp->a_grav[2] * dt};
 
-    const float m = gp->mass;
+    const float m = gravity_get_mass(gp);
     const double x[3] = {gp->x[0], gp->x[1], gp->x[2]};
 
     /* Collect mass */
@@ -242,7 +245,7 @@ void stats_collect_gpart_mapper(void *map_data, int nr_gparts,
 
     /* Collect energies. */
     stats.E_kin += 0.5f * m * (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
-    stats.E_pot_self += 0.f;
+    stats.E_pot_self += m * gravity_get_potential(gp);
     stats.E_pot_ext += m * external_gravity_get_potential_energy(
                                time, potential, phys_const, gp);
   }
diff --git a/src/task.c b/src/task.c
index 40cb41ffe057ac78dc3b8b073e29eedb052ec2de..e8c35e49a57595a6415c60ce7071ae1c0e3f09b7 100644
--- a/src/task.c
+++ b/src/task.c
@@ -47,15 +47,27 @@
 #include "lock.h"
 
 /* Task type names. */
-const char *taskID_names[task_type_count] = {
-    "none",        "sort",           "self",
-    "pair",        "sub_self",       "sub_pair",
-    "init",        "init_grav",      "ghost",
-    "extra_ghost", "drift",          "kick1",
-    "kick2",       "timestep",       "send",
-    "recv",        "grav_top_level", "grav_long_range",
-    "grav_mm",     "grav_down",      "cooling",
-    "sourceterms"};
+const char *taskID_names[task_type_count] = {"none",
+                                             "sort",
+                                             "self",
+                                             "pair",
+                                             "sub_self",
+                                             "sub_pair",
+                                             "init_grav",
+                                             "ghost",
+                                             "extra_ghost",
+                                             "drift",
+                                             "kick1",
+                                             "kick2",
+                                             "timestep",
+                                             "send",
+                                             "recv",
+                                             "grav_top_level",
+                                             "grav_long_range",
+                                             "grav_mm",
+                                             "grav_down",
+                                             "cooling",
+                                             "sourceterms"};
 
 /* Sub-task type names. */
 const char *subtaskID_names[task_subtype_count] = {
@@ -152,7 +164,6 @@ __attribute__((always_inline)) INLINE static enum task_actions task_acts_on(
       }
       break;
 
-    case task_type_init:
     case task_type_kick1:
     case task_type_kick2:
     case task_type_timestep:
@@ -272,11 +283,6 @@ void task_unlock(struct task *t) {
   /* Act based on task type. */
   switch (type) {
 
-    case task_type_init_grav:
-      cell_munlocktree(ci);
-      break;
-
-    case task_type_init:
     case task_type_kick1:
     case task_type_kick2:
     case task_type_timestep:
@@ -363,12 +369,6 @@ int task_lock(struct task *t) {
 #endif
       break;
 
-    case task_type_init_grav:
-      if (ci->mhold) return 0;
-      if (cell_mlocktree(ci) != 0) return 0;
-      break;
-
-    case task_type_init:
     case task_type_kick1:
     case task_type_kick2:
     case task_type_timestep:
diff --git a/src/task.h b/src/task.h
index 02dbcbfe258b3db718b0e217fd2b65e56cc03f34..049f86bdd6b4baf0856745b2b53acda5cca8c9e1 100644
--- a/src/task.h
+++ b/src/task.h
@@ -34,6 +34,8 @@
 
 /**
  * @brief The different task types.
+ *
+ * Be sure to update the taskID_names array in tasks.c if you modify this list!
  */
 enum task_types {
   task_type_none = 0,
@@ -42,7 +44,6 @@ enum task_types {
   task_type_pair,
   task_type_sub_self,
   task_type_sub_pair,
-  task_type_init,
   task_type_init_grav,
   task_type_ghost,
   task_type_extra_ghost,
@@ -152,9 +153,6 @@ struct task {
   /*! Should the scheduler skip this task ? */
   char skip;
 
-  /*! Does this task require the particles to be tightly in the cell ? */
-  char tight;
-
   /*! Is this task implicit (i.e. does not do anything) ? */
   char implicit;
 
@@ -167,8 +165,9 @@ struct task {
 #endif
 
 #ifdef SWIFT_DEBUG_CHECKS
-  int ti_run;
-#endif
+  /* When was this task last run? */
+  integertime_t ti_run;
+#endif /* SWIFT_DEBUG_CHECKS */
 
 } SWIFT_STRUCT_ALIGN;
 
diff --git a/src/timers.c b/src/timers.c
index 558ac109c224d939272152aa43684ad842c511de..aa42eee14fc0df3edd5a18340c092b8eea2ffac1 100644
--- a/src/timers.c
+++ b/src/timers.c
@@ -21,16 +21,21 @@
  ******************************************************************************/
 
 /* Config parameters. */
-#include "../config.h"
 
 /* This object's header. */
 #include "timers.h"
 
+/* Some standard headers. */
+#include <stdio.h>
+
+/* Local includes. */
+#include "clocks.h"
+
 /* The timers. */
 ticks timers[timer_count];
 
 /* Timer names. */
-char *timers_names[timer_count] = {
+const char* timers_names[timer_count] = {
     "none",
     "prepare",
     "init",
@@ -63,20 +68,27 @@ char *timers_names[timer_count] = {
     "dosub_pair_gradient",
     "dosub_pair_force",
     "dosub_pair_grav",
+    "doself_subset",
     "dopair_subset",
+    "dopair_subset_naive",
+    "dosub_subset",
     "do_ghost",
     "do_extra_ghost",
     "dorecv_part",
     "dorecv_gpart",
     "dorecv_spart",
+    "do_cooling",
     "gettask",
     "qget",
     "qsteal",
+    "locktree",
     "runners",
     "step",
-    "do_cooling",
 };
 
+/* File to store the timers */
+static FILE* timers_file;
+
 /**
  * @brief Re-set the timers.
  *
@@ -90,3 +102,43 @@ void timers_reset(unsigned long long mask) {
   for (int k = 0; k < timer_count; k++)
     if (mask & (1ull << k)) timers[k] = 0;
 }
+
+/**
+ * @brief Re-set all the timers.
+ *
+ */
+void timers_reset_all() { timers_reset(timers_mask_all); }
+
+/**
+ * @brief Outputs all the timers to the timers dump file.
+ *
+ * @param step The current step.
+ */
+void timers_print(int step) {
+  fprintf(timers_file, "%d\t", step);
+  for (int k = 0; k < timer_count; k++)
+    fprintf(timers_file, "%.3f\t", clocks_from_ticks(timers[k]));
+  fprintf(timers_file, "\n");
+}
+
+/**
+ * @brief Opens the file to contain the timers info and print a header
+ *
+ * @param rank The MPI rank of the file.
+ */
+void timers_open_file(int rank) {
+
+  char buff[100];
+  sprintf(buff, "timers_%d.txt", rank);
+  timers_file = fopen(buff, "w");
+
+  fprintf(timers_file, "# timers: \n# step | ");
+  for (int k = 0; k < timer_count; k++)
+    fprintf(timers_file, "%s\t", timers_names[k]);
+  fprintf(timers_file, "\n");
+}
+
+/**
+ * @brief Close the file containing the timer info.
+ */
+void timers_close_file() { fclose(timers_file); }
diff --git a/src/timers.h b/src/timers.h
index ef84a82eab4c0b11f758d75bb8f530d320707814..08e983a947bc57d9dcc7a432df92c2a4b0a1f7d7 100644
--- a/src/timers.h
+++ b/src/timers.h
@@ -22,7 +22,10 @@
 #ifndef SWIFT_TIMERS_H
 #define SWIFT_TIMERS_H
 
-/* Includes. */
+/* Config parameters. */
+#include "../config.h"
+
+/* Local includes. */
 #include "atomic.h"
 #include "cycle.h"
 #include "inline.h"
@@ -66,18 +69,22 @@ enum {
   timer_dosub_pair_gradient,
   timer_dosub_pair_force,
   timer_dosub_pair_grav,
+  timer_doself_subset,
   timer_dopair_subset,
+  timer_dopair_subset_naive,
+  timer_dosub_subset,
   timer_do_ghost,
   timer_do_extra_ghost,
   timer_dorecv_part,
   timer_dorecv_gpart,
   timer_dorecv_spart,
+  timer_do_cooling,
   timer_gettask,
   timer_qget,
   timer_qsteal,
+  timer_locktree,
   timer_runners,
   timer_step,
-  timer_do_cooling,
   timer_count,
 };
 
@@ -85,32 +92,34 @@ enum {
 extern ticks timers[timer_count];
 
 /* The timer names. */
-extern char *timers_names[];
+extern const char *timers_names[];
 
 /* Mask for all timers. */
 #define timers_mask_all ((1ull << timer_count) - 1)
 
 /* Define the timer macros. */
-#ifdef TIMER
-#define TIMER_TIC_ND tic = getticks();
-#define TIMER_TIC2_ND ticks tic2 = getticks();
-#define TIMER_TIC ticks tic = getticks();
+#ifdef SWIFT_USE_TIMERS
+#define TIMER_TIC const ticks tic = getticks();
 #define TIMER_TOC(t) timers_toc(t, tic)
-#define TIMER_TIC2 ticks tic2 = getticks();
+#define TIMER_TIC2 const ticks tic2 = getticks();
 #define TIMER_TOC2(t) timers_toc(t, tic2)
 INLINE static ticks timers_toc(unsigned int t, ticks tic) {
-  ticks d = (getticks() - tic);
+  const ticks d = (getticks() - tic);
   atomic_add(&timers[t], d);
   return d;
 }
 #else
 #define TIMER_TIC
-#define TIMER_TOC(t)
+#define TIMER_TOC(t) (void)0
 #define TIMER_TIC2
-#define TIMER_TOC2(t)
+#define TIMER_TOC2(t) (void)0
 #endif
 
 /* Function prototypes. */
+void timers_reset_all();
 void timers_reset(unsigned long long mask);
+void timers_open_file(int rank);
+void timers_close_file();
+void timers_print(int step);
 
 #endif /* SWIFT_TIMERS_H */
diff --git a/src/vector.h b/src/vector.h
index 4091ecdceeb150118ebab772532c2d08edc26aee..ba803a666954fe8c22e7ba5a01147c22cd7f028a 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -107,8 +107,15 @@
   }
 
 /* Performs a horizontal add on the vector and adds the result to a float. */
+#ifdef __ICC
 #define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v)
-
+#else /* _mm512_reduce_add_ps not present in GCC compiler. \
+       TODO: Implement intrinsic version.*/
+#define VEC_HADD(a, b)                              \
+  {                                                 \
+    for (int i = 0; i < VEC_SIZE; i++) b += a.f[i]; \
+  }
+#endif
 /* Calculates the number of set bits in the mask and adds the result to an int.
  */
 #define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
diff --git a/src/vector_power.h b/src/vector_power.h
index f2fd4381751706f6fca1e24daa131d8fd8a0eeba..6e8377586fdbc072e8f894c0805f43c122f8c54f 100644
--- a/src/vector_power.h
+++ b/src/vector_power.h
@@ -34,8 +34,8 @@
 /* Config parameters. */
 #include "../config.h"
 
-/* Some standard headers. */
-#include <math.h>
+/* Local headers. */
+#include "inline.h"
 
 /***************************/
 /* 0th order vector powers */
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 8a75182323859cb5662b0f7d0e83bd0f25f78b4e..a51b8eb82a17313818ff956ca3f15a232df8df65 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -15,15 +15,14 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 # Add the source directory and debug to CFLAGS
-AM_CFLAGS = -I$(top_srcdir)/src $(HDF5_CPPFLAGS) -DTIMER
-
+AM_CFLAGS = -I$(top_srcdir)/src $(HDF5_CPPFLAGS)
 
 AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(FFTW_LIBS)
 
 # List of programs and scripts to run in the test suite
 TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetry \
         testPair.sh testPairPerturbed.sh test27cells.sh test27cellsPerturbed.sh  \
-        testParser.sh testSPHStep test125cells.sh testKernelGrav testFFT \
+        testParser.sh testSPHStep test125cells.sh testFFT \
         testAdiabaticIndex testRiemannExact testRiemannTRRS testRiemannHLLC \
         testMatrixInversion testThreadpool testDump testLogger \
         testVoronoi1D testVoronoi2D testVoronoi3D
@@ -31,7 +30,7 @@ TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetr
 # List of test programs to compile
 check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \
 		 testSPHStep testPair test27cells test125cells testParser \
-                 testKernel testKernelGrav testFFT testInteractions testMaths \
+                 testKernel testFFT testInteractions testMaths \
                  testSymmetry testThreadpool benchmarkInteractions \
                  testAdiabaticIndex testRiemannExact testRiemannTRRS \
                  testRiemannHLLC testMatrixInversion testDump testLogger \
@@ -62,8 +61,6 @@ testParser_SOURCES = testParser.c
 
 testKernel_SOURCES = testKernel.c
 
-testKernelGrav_SOURCES = testKernelGrav.c
-
 testFFT_SOURCES = testFFT.c
 
 testInteractions_SOURCES = testInteractions.c
diff --git a/tests/difffloat.py b/tests/difffloat.py
index e0f0864372264899c6de1bf2f83ab678b7dd9ead..f989ccb56fb1f008d4a60931cb7165f4f2156ebd 100644
--- a/tests/difffloat.py
+++ b/tests/difffloat.py
@@ -35,13 +35,18 @@ file1 = sys.argv[1]
 file2 = sys.argv[2]
 number_to_check = -1
 
-if len(sys.argv) == 5:
-    number_to_check = int(sys.argv[4])
-
 fileTol = ""
 if len(sys.argv) >= 4:
     fileTol = sys.argv[3]
 
+if len(sys.argv) >= 5:
+    number_to_check = int(sys.argv[4])
+
+if len(sys.argv) == 6:
+    ignoreSmallRhoDh = int(sys.argv[5])
+else:
+    ignoreSmallRhoDh = 0
+    
 data1 = loadtxt(file1)
 data2 = loadtxt(file2)
 if fileTol != "":
@@ -102,8 +107,11 @@ for i in range(n_lines_to_check):
             print ""
             error = True
 
-        if abs(data1[i,j]) < 1e-6 and + abs(data2[i,j]) < 1e-6 : continue
-            
+        if abs(data1[i,j]) + abs(data2[i,j]) < 1e-6 : continue
+
+        # Ignore pathological cases with rho_dh
+        if ignoreSmallRhoDh and j == 8 and abs(data1[i,j]) < 2e-4: continue
+        
         if( rel_diff > 1.1*relTol[j]):
             print "Relative difference larger than tolerance (%e) for particle %d, column %d:"%(relTol[j], i,j)
             print "%10s:           a = %e"%("File 1", data1[i,j])
diff --git a/tests/test125cells.c b/tests/test125cells.c
index 517833eb2b1fe84c2d9811d090294acab2511b19..76112029016b7e28eeeaf69967a3b1ba3f8f0ef3 100644
--- a/tests/test125cells.c
+++ b/tests/test125cells.c
@@ -201,8 +201,9 @@ void get_solution(const struct cell *main_cell, struct solution_part *solution,
   }
 }
 
-void reset_particles(struct cell *c, enum velocity_field vel,
-                     enum pressure_field press, float size, float density) {
+void reset_particles(struct cell *c, struct hydro_space *hs,
+                     enum velocity_field vel, enum pressure_field press,
+                     float size, float density) {
 
   for (int i = 0; i < c->count; ++i) {
 
@@ -211,6 +212,8 @@ void reset_particles(struct cell *c, enum velocity_field vel,
     set_velocity(p, vel, size);
     set_energy_state(p, press, size, density);
 
+    hydro_init_part(p, hs);
+
 #if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
     float volume = p->conserved.mass / density;
 #if defined(GIZMO_SPH)
@@ -326,6 +329,7 @@ struct cell *make_cell(size_t n, const double offset[3], double size, double h,
   cell->count = count;
   cell->gcount = 0;
   cell->dx_max = 0.;
+  cell->dx_max_sort = 0.;
   cell->width[0] = size;
   cell->width[1] = size;
   cell->width[2] = size;
@@ -336,6 +340,7 @@ struct cell *make_cell(size_t n, const double offset[3], double size, double h,
   cell->ti_old = 8;
   cell->ti_end_min = 8;
   cell->ti_end_max = 8;
+  cell->ti_sort = 0;
 
   // shuffle_particles(cell->parts, cell->count);
 
@@ -537,7 +542,11 @@ int main(int argc, char *argv[]) {
 
   /* Build the infrastructure */
   struct space space;
-  space.periodic = 0;
+  space.periodic = 1;
+  space.dim[0] = 3.;
+  space.dim[1] = 3.;
+  space.dim[2] = 3.;
+  hydro_space_init(&space.hs, &space);
 
   struct phys_const prog_const;
   prog_const.const_newton_G = 1.f;
@@ -601,18 +610,13 @@ int main(int argc, char *argv[]) {
 
     const ticks tic = getticks();
 
-    /* Start with a gentle kick */
-    // runner_do_kick1(&runner, main_cell, 0);
-
-    /* And a gentle drift */
-    // runner_do_drift_particles(&runner, main_cell, 0);
+    /* Initialise the particles */
+    for (int j = 0; j < 125; ++j)
+      runner_do_drift_particles(&runner, cells[j], 0);
 
     /* First, sort stuff */
     for (int j = 0; j < 125; ++j) runner_do_sort(&runner, cells[j], 0x1FFF, 0);
 
-    /* Initialise the particles */
-    for (int j = 0; j < 125; ++j) runner_do_init(&runner, cells[j], 0);
-
 /* Do the density calculation */
 #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION))
 
@@ -686,8 +690,6 @@ int main(int argc, char *argv[]) {
 
     /* Finally, give a gentle kick */
     runner_do_end_force(&runner, main_cell, 0);
-    // runner_do_kick2(&runner, main_cell, 0);
-
     const ticks toc = getticks();
     time += toc - tic;
 
@@ -704,20 +706,21 @@ int main(int argc, char *argv[]) {
   message("SWIFT calculation took       : %15lli ticks.", time / runs);
 
   for (int j = 0; j < 125; ++j)
-    reset_particles(cells[j], vel, press, size, rho);
+    reset_particles(cells[j], &space.hs, vel, press, size, rho);
 
   /* NOW BRUTE-FORCE CALCULATION */
 
   const ticks tic = getticks();
 
-  /* Kick the central cell */
-  // runner_do_kick1(&runner, main_cell, 0);
+/* Kick the central cell */
+// runner_do_kick1(&runner, main_cell, 0);
 
-  /* And drift it */
-  runner_do_drift_particles(&runner, main_cell, 0);
+/* And drift it */
+// runner_do_drift_particles(&runner, main_cell, 0);
 
-  /* Initialise the particles */
-  for (int j = 0; j < 125; ++j) runner_do_init(&runner, cells[j], 0);
+/* Initialise the particles */
+// for (int j = 0; j < 125; ++j) runner_do_drift_particles(&runner, cells[j],
+// 0);
 
 /* Do the density calculation */
 #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION))
diff --git a/tests/test27cells.c b/tests/test27cells.c
index 9e79c097462203465c03ea056569c7f448125d7e..bd827b68e90ea5f4e9d5577612e6cecda2edf83a 100644
--- a/tests/test27cells.c
+++ b/tests/test27cells.c
@@ -159,6 +159,7 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
   cell->h_max = h;
   cell->count = count;
   cell->dx_max = 0.;
+  cell->dx_max_sort = 0.;
   cell->width[0] = size;
   cell->width[1] = size;
   cell->width[2] = size;
@@ -169,6 +170,7 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
   cell->ti_old = 8;
   cell->ti_end_min = 8;
   cell->ti_end_max = 8;
+  cell->ti_sort = 8;
 
   shuffle_particles(cell->parts, cell->count);
 
@@ -407,13 +409,20 @@ int main(int argc, char *argv[]) {
 
   /* Build the infrastructure */
   struct space space;
-  space.periodic = 0;
+  space.periodic = 1;
+  space.dim[0] = 3.;
+  space.dim[1] = 3.;
+  space.dim[2] = 3.;
+
+  struct hydro_props hp;
+  hp.h_max = FLT_MAX;
 
   struct engine engine;
   engine.s = &space;
   engine.time = 0.1f;
   engine.ti_current = 8;
   engine.max_active_bin = num_time_bins;
+  engine.hydro_properties = &hp;
 
   struct runner runner;
   runner.e = &engine;
@@ -429,6 +438,8 @@ int main(int argc, char *argv[]) {
         cells[i * 9 + j * 3 + k] = make_cell(particles, offset, size, h, rho,
                                              &partId, perturbation, vel);
 
+        runner_do_drift_particles(&runner, cells[i * 9 + j * 3 + k], 0);
+
         runner_do_sort(&runner, cells[i * 9 + j * 3 + k], 0x1FFF, 0);
       }
     }
@@ -542,8 +553,7 @@ int main(int argc, char *argv[]) {
   dump_particle_fields(outputFileName, main_cell, cells);
 
   /* Check serial results against the vectorised results. */
-   if (check_results(main_cell->parts, vec_parts, main_cell->count,
-   threshold))
+  if (check_results(main_cell->parts, vec_parts, main_cell->count, threshold))
     message("Differences found...");
 
   /* Output timing */
diff --git a/tests/test27cellsPerturbed.sh.in b/tests/test27cellsPerturbed.sh.in
index 30498594b659101216b51dfea2346fa9230dbc97..1eb7b31bcd668e7362a9eb907be501aa8016abb8 100755
--- a/tests/test27cellsPerturbed.sh.in
+++ b/tests/test27cellsPerturbed.sh.in
@@ -10,7 +10,7 @@ do
 
     if [ -e brute_force_27_perturbed.dat ]
     then
-	python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed.dat 6
+	python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed.dat 6 1
     else
 	exit 1
     fi
diff --git a/tests/testKernel.c b/tests/testKernel.c
index 13f4e36534eb11a4c8f7ba9c19a48de6599e31f5..0f627675cbfbcbc6fe926ee6617f83d3aeacb5de 100644
--- a/tests/testKernel.c
+++ b/tests/testKernel.c
@@ -39,7 +39,7 @@ int main() {
   const float numPoints_inv = 1. / numPoints;
 
   for (int i = 0; i < numPoints; ++i) {
-    u[i] = i * 2.5f * numPoints_inv / h;
+    u[i] = i * 2.25f * numPoints_inv / h;
   }
 
   for (int i = 0; i < numPoints; ++i) {
@@ -55,18 +55,22 @@ int main() {
 
 #ifdef WITH_VECTORIZATION
 
+  printf("\nVector Output for kernel_deval_1_vec\n");
+  printf("-------------\n");
+
+  /* Test vectorised kernel that uses one vector. */
   for (int i = 0; i < numPoints; i += VEC_SIZE) {
 
     vector vx, vx_h;
     vector W_vec, dW_vec;
 
     for (int j = 0; j < VEC_SIZE; j++) {
-      vx.f[j] = (i + j) * 2.5f / numPoints;
+      vx.f[j] = (i + j) * 2.25f / numPoints;
     }
 
     vx_h.v = vx.v / vec_set1(h);
 
-    kernel_deval_vec(&vx_h, &W_vec, &dW_vec);
+    kernel_deval_1_vec(&vx_h, &W_vec, &dW_vec);
 
     for (int j = 0; j < VEC_SIZE; j++) {
       printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i + j, h,
@@ -85,6 +89,63 @@ int main() {
     }
   }
 
+  printf("\nVector Output for kernel_deval_2_vec\n");
+  printf("-------------\n");
+  
+  /* Test vectorised kernel that uses two vectors. */
+  for (int i = 0; i < numPoints; i += VEC_SIZE) {
+
+    vector vx, vx_h;
+    vector W_vec, dW_vec;
+
+    vector vx_2, vx_h_2;
+    vector W_vec_2, dW_vec_2;
+    
+    for (int j = 0; j < VEC_SIZE; j++) {
+      vx.f[j] = (i + j) * 2.25f / numPoints;
+      vx_2.f[j] = (i + j) * 2.25f / numPoints;
+    }
+
+    vx_h.v = vx.v / vec_set1(h);
+    vx_h_2.v = vx_2.v / vec_set1(h);
+
+    kernel_deval_2_vec(&vx_h, &W_vec, &dW_vec, &vx_h_2, &W_vec_2, &dW_vec_2);
+
+    /* Check first vector results. */
+    for (int j = 0; j < VEC_SIZE; j++) {
+      printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i + j, h,
+             h * kernel_gamma, vx.f[j], W_vec.f[j], dW_vec.f[j]);
+
+      if (fabsf(W_vec.f[j] - W[i + j]) > 2e-7) {
+        printf("Invalid value ! scalar= %e, vector= %e\n", W[i + j],
+               W_vec.f[j]);
+        return 1;
+      }
+      if (fabsf(dW_vec.f[j] - dW[i + j]) > 2e-7) {
+        printf("Invalid value ! scalar= %e, vector= %e\n", dW[i + j],
+               dW_vec.f[j]);
+        return 1;
+      }
+    }
+    
+    /* Check second vector results. */
+    for (int j = 0; j < VEC_SIZE; j++) {
+      printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i + j, h,
+             h * kernel_gamma, vx_2.f[j], W_vec_2.f[j], dW_vec_2.f[j]);
+
+      if (fabsf(W_vec_2.f[j] - W[i + j]) > 2e-7) {
+        printf("Invalid value ! scalar= %e, vector= %e\n", W[i + j],
+               W_vec_2.f[j]);
+        return 1;
+      }
+      if (fabsf(dW_vec_2.f[j] - dW[i + j]) > 2e-7) {
+        printf("Invalid value ! scalar= %e, vector= %e\n", dW[i + j],
+               dW_vec_2.f[j]);
+        return 1;
+      }
+    }
+  }
+
   printf("\nAll values are consistent\n");
 
 #endif
diff --git a/tests/testVoronoi2D.c b/tests/testVoronoi2D.c
index 7a35961952079494b0d1567db57abd29fa1df35b..509d3ab69976fa8618db389ebd87eedb9ea34409 100644
--- a/tests/testVoronoi2D.c
+++ b/tests/testVoronoi2D.c
@@ -107,8 +107,10 @@ int main() {
     Atot = 0.0f;
     /* print the cells to the stdout */
     for (i = 0; i < TESTVORONOI2D_NUMCELL; ++i) {
+#ifdef VORONOI_VERBOSE
       printf("Cell %i:\n", i);
       voronoi_print_cell(&cells[i]);
+#endif
       voronoi_cell_finalize(&cells[i]);
       Atot += cells[i].volume;
     }
@@ -185,8 +187,10 @@ int main() {
     Atot = 0.0f;
     /* print the cells to the stdout */
     for (i = 0; i < 100; ++i) {
+#ifdef VORONOI_VERBOSE
       printf("Cell %i:\n", i);
       voronoi_print_cell(&cells[i]);
+#endif
       voronoi_cell_finalize(&cells[i]);
       Atot += cells[i].volume;
     }
diff --git a/tests/tolerance_27_perturbed.dat b/tests/tolerance_27_perturbed.dat
index 53de4ec7632039a56a3757488881e890296e3ac8..b6ed8c2c18808156056f9c5816212866c2dfa998 100644
--- a/tests/tolerance_27_perturbed.dat
+++ b/tests/tolerance_27_perturbed.dat
@@ -1,3 +1,3 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
     0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1.2e-6     1e-4	    5e-5       2e-3		 3.1e-6	     3e-6	   3e-6		 3e-6
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1.2e-2	    1e-5       1e-4		 2e-5	     2e-3	   2e-3	 	 2e-3
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      2e-3	    1e-5       1e-4		 2e-5	     2e-3	   2e-3	 	 2e-3
diff --git a/theory/Multipoles/bibliography.bib b/theory/Multipoles/bibliography.bib
index abdb5855f52aefd9b4562851946d16bd402db68a..12e274dd63093ba1e14750249f2538c092e5268a 100644
--- a/theory/Multipoles/bibliography.bib
+++ b/theory/Multipoles/bibliography.bib
@@ -24,4 +24,75 @@ archivePrefix = "arXiv",
    pages={13},
    year={2006}
  }
-	      
+
+@ARTICLE{Monaghan1985,
+   author = {{Monaghan}, J.~J. and {Lattanzio}, J.~C.},
+    title = "{A refined particle method for astrophysical problems}",
+  journal = {\aap},
+ keywords = {Computational Astrophysics, Gravitational Collapse, Gravitational Fields, Many Body Problem, Molecular Clouds, Stellar Evolution, Angular Momentum, Binary Stars, Computational Grids, Interpolation, Kernel Functions, Particle Mass, Stellar Orbits},
+     year = 1985,
+    month = aug,
+   volume = 149,
+    pages = {135-143},
+   adsurl = {http://adsabs.harvard.edu/abs/1985A%26A...149..135M},
+  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+}
+
+@ARTICLE{Price2007,
+   author = {{Price}, D.~J. and {Monaghan}, J.~J.},
+    title = "{An energy-conserving formalism for adaptive gravitational force softening in smoothed particle hydrodynamics and N-body codes}",
+  journal = {\mnras},
+   eprint = {astro-ph/0610872},
+ keywords = {gravitation, hydrodynamics, methods: N-body simulations, methods: numerical},
+     year = 2007,
+    month = feb,
+   volume = 374,
+    pages = {1347-1358},
+      doi = {10.1111/j.1365-2966.2006.11241.x},
+   adsurl = {http://adsabs.harvard.edu/abs/2007MNRAS.374.1347P},
+  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+}
+
+@ARTICLE{Dehnen2001,
+   author = {{Dehnen}, W.},
+    title = "{Towards optimal softening in three-dimensional N-body codes - I. Minimizing the force error}",
+  journal = {\mnras},
+   eprint = {astro-ph/0011568},
+ keywords = {STELLAR DYNAMICS, METHODS: N-BODY SIMULATIONS, METHODS: NUMERICAL},
+     year = 2001,
+    month = jun,
+   volume = 324,
+    pages = {273-291},
+      doi = {10.1046/j.1365-8711.2001.04237.x},
+   adsurl = {http://adsabs.harvard.edu/abs/2001MNRAS.324..273D},
+  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+}
+
+@ARTICLE{Hernquist1989,
+   author = {{Hernquist}, L. and {Katz}, N.},
+    title = "{TREESPH - A unification of SPH with the hierarchical tree method}",
+  journal = {\apjs},
+ keywords = {Computational Fluid Dynamics, Computerized Simulation, Data Smoothing, Magnetohydrodynamics, Trees (Mathematics), Dynamical Systems, Many Body Problem, Monte Carlo Method, Spatial Resolution},
+     year = 1989,
+    month = jun,
+   volume = 70,
+    pages = {419-446},
+      doi = {10.1086/191344},
+   adsurl = {http://adsabs.harvard.edu/abs/1989ApJS...70..419H},
+  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+}
+
+@Article{Wendland1995,
+author="Wendland, Holger",
+title="Piecewise polynomial, positive definite and compactly supported radial functions of minimal degree",
+journal="Advances in Computational Mathematics",
+year="1995",
+volume="4",
+number="1",
+pages="389--396",
+abstract="We construct a new class of positive definite and compactly supported radial functions which consist of a univariate polynomial within their support. For given smoothness and space dimension it is proved that they are of minimal degree and unique up to a constant factor. Finally, we establish connections between already known functions of this kind.",
+issn="1572-9044",
+doi="10.1007/BF02123482",
+url="http://dx.doi.org/10.1007/BF02123482"
+}
+
diff --git a/theory/Multipoles/fmm_standalone.tex b/theory/Multipoles/fmm_standalone.tex
index 6948708622dbf0cfc05daa9d63917f7a3017bfc2..fcd727a89abe95bba69b23c58ce5067c8cc53211 100644
--- a/theory/Multipoles/fmm_standalone.tex
+++ b/theory/Multipoles/fmm_standalone.tex
@@ -1,4 +1,4 @@
-\documentclass[fleqn, usenatbib, useAMS,a4paper]{mnras}
+\documentclass[fleqn, usenatbib, useAMS, a4paper]{mnras}
 \usepackage{graphicx}
 \usepackage{amsmath,paralist,xcolor,xspace,amssymb}
 \usepackage{times}
@@ -13,10 +13,18 @@
 
 \maketitle
 
+We use the multi-index notation of \cite{Dehnen2014} to simplify expressions.
+
+\input{potential_softening}
 \input{gravity_derivatives}
 
 \bibliographystyle{mnras}
 \bibliography{./bibliography.bib}
 
+\appendix
+\onecolumn
+\input{potential_derivatives}
+
+
 
 \end{document}
diff --git a/theory/Multipoles/gravity_derivatives.tex b/theory/Multipoles/gravity_derivatives.tex
index 913bdf752c666aac5913c6e6452bdb06f3fcdc86..e4c7b1565ab6c82de5623d5a643c3a8bd1fa513f 100644
--- a/theory/Multipoles/gravity_derivatives.tex
+++ b/theory/Multipoles/gravity_derivatives.tex
@@ -1,5 +1,3 @@
-We use the multi-index notation of \cite{Dehnen2014} to simplify expressions.
-
 \subsection{Derivatives of the gravitational potential}
 
 The calculation of all the
diff --git a/theory/Multipoles/potential.py b/theory/Multipoles/potential.py
new file mode 100644
index 0000000000000000000000000000000000000000..559f590762a3cbef171c5dd584cbc517879a2cec
--- /dev/null
+++ b/theory/Multipoles/potential.py
@@ -0,0 +1,181 @@
+###############################################################################
+ # This file is part of SWIFT.
+ # Copyright (c) 2016  Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ # 
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ # 
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ # GNU General Public License for more details.
+ # 
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ # 
+ ##############################################################################
+import matplotlib
+matplotlib.use("Agg")
+from pylab import *
+from scipy import integrate
+from scipy.optimize import curve_fit
+from scipy.optimize import fsolve
+from matplotlib.font_manager import FontProperties
+import numpy
+import math
+
+params = {'axes.labelsize': 9,
+'axes.titlesize': 10,
+'font.size': 10,
+'legend.fontsize': 10,
+'xtick.labelsize': 8,
+'ytick.labelsize': 8,
+'text.usetex': True,
+'figure.figsize' : (3.15,3.15),
+'figure.subplot.left'    : 0.115,
+'figure.subplot.right'   : 0.99  ,
+'figure.subplot.bottom'  : 0.065  ,
+'figure.subplot.top'     : 0.99  ,
+'figure.subplot.wspace'  : 0.  ,
+'figure.subplot.hspace'  : 0.  ,
+'lines.markersize' : 6,
+'lines.linewidth' : 3.,
+'text.latex.unicode': True
+}
+rcParams.update(params)
+rc('font',**{'family':'sans-serif','sans-serif':['Times']})
+
+# Parameters
+epsilon = 2.
+r_min = 0.
+r_max = 4
+r_max_plot = 2.6
+
+# Radius
+r = linspace(r_min, r_max, 401)
+r[0] += 1e-9
+u = r / epsilon
+
+# Newtonian solution
+phi_newton = 1. / r
+F_newton = 1. / r**2
+W_newton = 0. * r
+
+# Softened potential
+phi = np.zeros(np.size(r))
+W = np.zeros(np.size(r))
+F = np.zeros(np.size(r))
+for i in range(np.size(r)):
+    if r[i] > epsilon:
+        phi[i] = 1. / r[i]
+        W[i] = 0.
+        F[i] = 1. / r[i]**2
+    else:
+        phi[i] = (-1./epsilon) * (3.*u[i]**7 - 15.*u[i]**6 + 28.*u[i]**5 - 21.*u[i]**4 + 7.*u[i]**2 - 3.)
+        W[i] = (21. / (2.*math.pi)) * (4.*u[i]**5 - 15.*u[i]**4 + 20.*u[i]**3 - 10.*u[i]**2 + 1.) / epsilon**3
+        F[i] = (1./epsilon**2) * (21.*u[i]**6 - 90*u[i]**5 + 140.*u[i]**4 - 84.*u[i]**3 + 14*u[i])
+
+plummer_equivalent_factor = phi[0] * epsilon
+
+print "Plummer-equivalent factor:", plummer_equivalent_factor
+        
+epsilon_plummer = epsilon / plummer_equivalent_factor
+        
+# Plummer potential
+phi_plummer = (1. / epsilon_plummer) * (1 + (r / epsilon_plummer)**2)**(-1./2.)
+F_plummer = (1. / epsilon_plummer**3) * r / (1 + (r / epsilon_plummer )**2)**(3./2.)
+def eta_plummer(r):
+    return (3. / (4.*math.pi)) * 1. / (1 + r**2)**(5./2.)
+W_plummer = (1. / epsilon_plummer**3) * eta_plummer(r / epsilon_plummer)
+
+
+# Gadget-2 potential
+epsilon_gadget = epsilon #/ plummer_equivalent_factor * 2.8
+phi_gadget2 = np.zeros(np.size(r))
+W_gadget2 = np.zeros(np.size(r))
+F_gadget2 = np.zeros(np.size(r))
+for i in range(np.size(r)):
+    if r[i] > epsilon_gadget:
+        phi_gadget2[i] = 1. / r[i]
+        W_gadget2[i] = 0.
+        F_gadget2[i] = 1. / r[i]**2
+    elif r[i] > epsilon_gadget / 2.:
+        phi_gadget2[i] = -((32/3.)*u[i]**2 - 16.*u[i]**3 + (96./10.)*u[i]**4 - (64./30.)*u[i]**5 - (16./5.) + 1./(15.*u[i])  )/ (epsilon_gadget)
+        W_gadget2[i] = (8. / math.pi) * (2. * (1- u[i])**3) / epsilon_gadget**3
+        F_gadget2[i] = u[i] * (21.333333 - 48*u[i] + 38.4*u[i]**2 - 10.6666667*u[i]**3 - 0.06666667*u[i]**-3) / epsilon_gadget**2   
+    else:
+        phi_gadget2[i] = -((16./3.)*u[i]**2 - (96./10.)*u[i]**4 + (64./10.)*u[i]**5 - (14./5.)  ) / (epsilon_gadget)
+        W_gadget2[i] = (8. / math.pi) * (1. - 6.*u[i]**2 + 6.*u[i]**3) / epsilon_gadget**3
+        F_gadget2[i] = u[i] * (10.666667 + u[i]**2 * (32. * u[i] - 38.4)) / epsilon_gadget**2
+
+figure()
+colors=['#4477AA', '#CC6677', '#DDCC77', '#117733']
+
+# Density
+subplot(311)
+plot(r, W_newton, '--', lw=1.4, label="${\\rm Newtonian}$", color=colors[0])
+plot(r, W_plummer, ':', lw=1.4, label="${\\rm Plummer}$", color=colors[1])
+plot(r, W_gadget2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2])
+plot(r, W, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3])
+plot([epsilon, epsilon], [0, 10], 'k-', alpha=0.5, lw=0.5)
+plot([epsilon/plummer_equivalent_factor, epsilon/plummer_equivalent_factor], [0, 10], 'k-', alpha=0.5, lw=0.5)
+
+legend(loc="upper right", frameon=True, handletextpad=0.1, handlelength=3.2, fontsize=8)
+
+xlim(0,r_max_plot)
+xticks([0., 0.5, 1., 1.5, 2., 2.5], ["", "", "", "", "", ""])
+
+ylim(0., 0.84)
+yticks([0, 0.2, 0.4, 0.6, 0.8], ["$0$", "$0.2$", "$0.4$", "$0.6$", "$0.8$"])
+ylabel("$\\rho(r)$", labelpad=2)
+
+# Potential
+subplot(312)
+plot(r, phi_newton, '--', lw=1.4, label="${\\rm Newtonian}$", color=colors[0])
+plot(r, phi_plummer, ':', lw=1.4, label="${\\rm Plummer}$", color=colors[1])
+plot(r, phi_gadget2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2])
+plot(r, phi, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3])
+plot([epsilon, epsilon], [-10, 10], 'k-', alpha=0.5, lw=0.5)
+plot([epsilon/plummer_equivalent_factor, epsilon/plummer_equivalent_factor], [0, 10], 'k-', alpha=0.5, lw=0.5)
+
+ylim(0, 2.3)
+ylabel("$|\\phi(r)|$", labelpad=1)
+#yticks([0., 0.5, 1., 1.5, 2., 2.5], ["$%.1f$"%(0.*epsilon), "$%.1f$"%(0.5*epsilon), "$%.1f$"%(1.*epsilon), "$%.1f$"%(1.5*epsilon), "$%.1f$"%(2.*epsilon)])
+
+xlim(0,r_max_plot)
+xticks([0., 0.5, 1., 1.5, 2., 2.5], ["", "", "", "", "", ""])
+
+# Force
+subplot(313)
+plot(r, F_newton, '--', lw=1.4, color=colors[0])
+plot(r, F_plummer, ':', lw=1.4, color=colors[1])
+plot(r, F_gadget2, '-', lw=1.4, color=colors[2])
+plot(r, F, '-', lw=1.4, color=colors[3])
+plot([epsilon, epsilon], [0, 10], 'k-', alpha=0.5, lw=0.5)
+plot([epsilon/plummer_equivalent_factor, epsilon/plummer_equivalent_factor], [0, 10], 'k-', alpha=0.5, lw=0.5)
+text(epsilon+0.03, 0.05, "$\\epsilon$", color='k', alpha=0.5, rotation=90, va="bottom", ha="left", fontsize=8)
+text(epsilon/plummer_equivalent_factor+0.03, 0.05, "$\\epsilon_{\\rm Plummer}$", color='k', alpha=0.5, rotation=90, va="bottom", ha="left", fontsize=8) 
+
+xlim(0,r_max_plot)
+xticks([0., 0.5, 1., 1.5, 2., 2.5], ["$%.1f$"%(0./epsilon), "", "$%.1f$"%(1./epsilon), "", "$%.1f$"%(2./epsilon)])
+xlabel("$r/H$", labelpad=-7)
+
+ylim(0, 0.95)
+ylabel("$|\\overrightarrow{\\nabla}\\phi(r)|$", labelpad=0)
+
+savefig("potential.pdf")
+
+
+
+
+#Construct potential
+# phi = np.zeros(np.size(r))
+# for i in range(np.size(r)):
+#     if r[i] > 2*epsilon:
+#         phi[i] = 1./ r[i]
+#     elif r[i] > epsilon:
+#         phi[i] = -(1./epsilon) * ((32./3.)*u[i]**2 - (48./3.)*u[i]**3 + (38.4/4.)*u[i]**4 - (32./15.)*u[i]**5 + (2./30.)*u[i]**(-1) - (9/5.))
+#     else:
+#         phi[i] = -(1./epsilon) * ((32./6.)*u[i]**2 - (38.4/4.)*u[i]**4 + (32./5.)*u[i]**4 - (7./5.))
diff --git a/theory/Multipoles/potential_derivatives.tex b/theory/Multipoles/potential_derivatives.tex
new file mode 100644
index 0000000000000000000000000000000000000000..56184ce98902d76ad53ce1d49e3d6d67dfc33ac4
--- /dev/null
+++ b/theory/Multipoles/potential_derivatives.tex
@@ -0,0 +1,75 @@
+\subsection{Derivatives of the potential}
+
+For completeness, we give here the full expression for the first few
+derivatives of the potential that are used in our FMM scheme. We use
+the notation $\mathbf{r}=(r_x, r_y, r_z)$, $r = |\mathbf{r}|$ and
+$u=r/H$. Starting from the potential (Eq. \ref{eq:fmm:potential},
+reproduced here for clarity), 
+\begin{align}
+D_{000}(\mathbf{r}) = \phi (\mathbf{r},H) = 
+\left\lbrace\begin{array}{rcl}
+\frac{1}{H} \left(-3u^7 + 15u^6 - 28u^5 + 21u^4 - 7u^2 + 3\right) & \mbox{if} & u < 1,\\
+\frac{1}{r} & \mbox{if} & u \geq 1, 
+\end{array}
+\right.\nonumber
+\end{align}
+we can construct the higher order terms by successively applying the
+"chain rule". We show examples of the first few relevant ones here.
+
+\begin{align}
+D_{100}(\mathbf{r}) = \frac{\partial}{\partial r_x} \phi (\mathbf{r},H) = 
+\left\lbrace\begin{array}{rcl}
+-\frac{r_x}{H^3} \left(21u^5 - 90u^4 + 140u^3 - 84u^2 + 14\right) & \mbox{if} & u < 1,\\
+-\frac{r_x}{r^3} & \mbox{if} & u \geq 1, 
+\end{array}
+\right.\nonumber
+\end{align}
+
+\begin{align}
+D_{200}(\mathbf{r}) = \frac{\partial^2}{\partial r_x^2} \phi (\mathbf{r},H) = 
+\left\lbrace\begin{array}{rcl}
+\frac{r_x^2}{H^5}\left(-105u^3+360u^2-420u+168\right) -
+\frac{1}{H^3} \left(21u^5 - 90u^4 + 140u^3 - 84u^2 + 14\right) & \mbox{if} & u < 1,\\
+3\frac{r_x^2}{r^5} - \frac{1}{r^3} & \mbox{if} & u \geq 1, 
+\end{array}
+\right.\nonumber
+\end{align}
+
+\begin{align}
+D_{110}(\mathbf{r}) = \frac{\partial^2}{\partial r_x\partial r_y} \phi (\mathbf{r},H) = 
+\left\lbrace\begin{array}{rcl}
+\frac{r_xr_y}{H^5}\left(-105u^3+360u^2-420u+168\right) & \mbox{if} & u < 1,\\
+3\frac{r_xr_y}{r^5} & \mbox{if} & u \geq 1, 
+\end{array}
+\right.\nonumber
+\end{align}
+
+\begin{align}
+D_{300}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} \phi (\mathbf{r},H) = 
+\left\lbrace\begin{array}{rcl}
+-\frac{r_x^3}{H^7} \left(315u - 720 + 420u^{-1}\right) +
+\frac{3r_x}{H^5}\left(-105u^3+360u^2-420u+168\right) & \mbox{if} & u < 1,\\
+-15\frac{r_x^3}{r^7} + 9 \frac{r_x}{r^5} & \mbox{if} & u \geq 1, 
+\end{array}
+\right.\nonumber
+\end{align}
+
+\begin{align}
+D_{210}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} \phi (\mathbf{r},H) = 
+\left\lbrace\begin{array}{rcl}
+-\frac{r_x^2r_y}{H^7} \left(315u - 720 + 420u^{-1}\right) +
+\frac{r_y}{H^5}\left(-105u^3+360u^2-420u+168\right) & \mbox{if} & u < 1,\\
+-15\frac{r_x^2r_y}{r^7} + 3 \frac{r_y}{r^5} & \mbox{if} & u \geq 1, 
+\end{array}
+\right.\nonumber
+\end{align}
+
+
+\begin{align}
+D_{111}(\mathbf{r}) = \frac{\partial^3}{\partial r_x\partial r_y\partial r_z} \phi (\mathbf{r},H) = 
+\left\lbrace\begin{array}{rcl}
+-\frac{r_xr_yr_z}{H^7} \left(315u - 720 + 420u^{-1}\right) & \mbox{if} & u < 1,\\
+-15\frac{r_xr_yr_z}{r^7} & \mbox{if} & u \geq 1, 
+\end{array}
+\right.\nonumber
+\end{align}
diff --git a/theory/Multipoles/potential_softening.tex b/theory/Multipoles/potential_softening.tex
new file mode 100644
index 0000000000000000000000000000000000000000..1186a9cec377fd8daa94e14d024115f95ecfdc99
--- /dev/null
+++ b/theory/Multipoles/potential_softening.tex
@@ -0,0 +1,52 @@
+\subsection{Gravitational softening}
+
+To avoid artificial two-body relaxation, the Dirac
+$\delta$-distribution of particles is convolved with a softening
+kernel of a given fixed, but time-variable, scale-length
+$\epsilon$. Instead of the commonly used spline kernel of
+\cite{Monaghan1985} (e.g. in \textsc{Gadget}), we use a C2 kernel
+\citep{Wendland1995} which leads to an expression for the force that
+is cheaper to compute and has a very similar overall shape. We set
+$\tilde\delta(\mathbf{x}) = \rho(|\mathbf{x}|) = W(|\mathbf{x}|,
+3\epsilon_{\rm Plummer})$, with $W(r, H)$ given by
+
+\begin{align}
+W(r,H) &= \frac{21}{2\pi H^3} \times \nonumber \\
+&\left\lbrace\begin{array}{rcl}
+4u^5 - 15u^4 + 20u^3 - 10u^2 + 1 & \mbox{if} & u < 1,\\
+0 & \mbox{if} & u \geq 1,
+\end{array}
+\right.
+\end{align}
+and $u = r/H$. The potential $\phi(r,H)$ corresponding to this density distribution reads
+\begin{align}
+\phi = 
+\left\lbrace\begin{array}{rcl}
+\frac{1}{H} (-3u^7 + 15u^6 - 28u^5 + 21u^4 - 7u^2 + 3) & \mbox{if} & u < 1,\\
+\frac{1}{r} & \mbox{if} & u \geq 1.
+\end{array}
+\right.
+\label{eq:fmm:potential}
+\end{align}
+
+These choices, lead to a potential at $|\mathbf{x}| = 0$ equal to the
+central potential of a Plummer sphere (i.e. $1/\epsilon_{\rm
+Plummer}$)\footnote{Note the factor $3$ in the definition of
+$\rho(|\mathbf{x}|)$ which differs from the factor $2.8$ used
+in \textsc{Gadget} as a consequence of the change of kernel
+shape.}. The softened density profile, its corresponding potential and
+resulting forces are shown on Fig. \ref{fig:fmm:softening} (for
+details see Sec. 2 of~\cite{Price2007}).
+
+
+\begin{figure}
+\includegraphics[width=\columnwidth]{potential.pdf}
+\caption{The density (top), potential (middle) and forces (bottom) of
+generated py a point mass in our softened gravitational scheme (for
+completeness, we chose $\epsilon=2$). A
+Plummer-equivalent sphere is shown for comparison. The spline kernel
+of \citet{Monaghan1985}, used in \textsc{Gadget}, is shown for
+comparison but note that it has not been re-scaled to match the
+Plummer-sphere potential at $r=0$.}
+\label{fig:fmm:softening}
+\end{figure}
diff --git a/theory/Multipoles/run.sh b/theory/Multipoles/run.sh
index 34304a37dc1cb1fad3a7442544a8df793ea94d35..f25d407cd4ffe679a272f352798817f7c0c4e55a 100755
--- a/theory/Multipoles/run.sh
+++ b/theory/Multipoles/run.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-python kernels.py
+python potential.py
 pdflatex -jobname=fmm fmm_standalone.tex
 bibtex fmm.aux
 pdflatex -jobname=fmm fmm_standalone.tex