diff --git a/.gitignore b/.gitignore
index dba88cee0989e00baef51d9e75a42b1141d15714..3521b353ec4b1c4dea7192047e197596f083fe99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,6 +56,8 @@ tests/testSingle
 tests/testTimeIntegration
 tests/testSPHStep
 tests/testKernel
+tests/testKernelGrav
+tests/testFFT
 tests/testInteractions
 tests/testSymmetry
 tests/testMaths
diff --git a/configure.ac b/configure.ac
index bbe2945679cb4f49523cd90d4edfb7c0224cfe5d..483937a9ce4b166410ee42e312b24b13551b5d6a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -417,6 +417,17 @@ if test "$ac_cv_func_pthread_setaffinity_np" = "yes"; then
   fi
 fi
 
+# Check for FFTW
+have_fftw3="no"
+AC_CHECK_HEADER([fftw3.h])
+if test "$ac_cv_header_fftw3_h" = "yes"; then
+   AC_CHECK_LIB([fftw3],[fftw_malloc], [AC_DEFINE([HAVE_FFTW],
+   [1],[Defined if FFTW 3.x exists.])] )
+   FFTW_LIBS="-lfftw3"
+   have_fftw3="yes"
+fi
+AC_SUBST([FFTW_LIBS])
+
 # Check for Intel intrinsics header optionally used by vector.h.
 AC_CHECK_HEADERS([immintrin.h])
 
@@ -503,6 +514,7 @@ AC_MSG_RESULT([
    HDF5 enabled    : $with_hdf5
     - parallel     : $have_parallel_hdf5
    Metis enabled   : $have_metis
+   FFTW3 enabled   : $have_fftw3
    libNUMA enabled : $have_numa
    Using tcmalloc  : $have_tcmalloc
    CPU profiler    : $have_profiler
diff --git a/examples/Makefile.am b/examples/Makefile.am
index 1fedca0c3847c3e4d7942de09cbf409bcd69b8c6..187abcf9898b8024b4e4f5089de7ca51e6dd2e3c 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -24,7 +24,7 @@ AM_CFLAGS = -I$(top_srcdir)/src $(HDF5_CPPFLAGS)
 AM_LDFLAGS = $(HDF5_LDFLAGS)
 
 # Extra libraries.
-EXTRA_LIBS = $(HDF5_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS)
+EXTRA_LIBS = $(HDF5_LIBS) $(FFTW_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS)
 
 # MPI libraries.
 MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS)
diff --git a/examples/UniformDMBox/makeIC.py b/examples/UniformDMBox/makeIC.py
index 449d780fb31bc23dd194f772be45d35e6b0bbe3f..2aee89798a5b8bbd425a6b73528779fb1aa7db23 100644
--- a/examples/UniformDMBox/makeIC.py
+++ b/examples/UniformDMBox/makeIC.py
@@ -26,7 +26,7 @@ from numpy import *
 # with a density of 1
 
 # Parameters
-periodic= 1           # 1 For periodic box
+periodic= 0           # 1 For periodic box
 boxSize = 1.
 rho = 1.
 L = int(sys.argv[1])  # Number of particles along one axis
diff --git a/examples/main.c b/examples/main.c
index 9cda03c14c364974a18e36c84e60227a01970dd5..30e27cc55a6ec508dc352f1acc7f2eed71d71bdd 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -399,8 +399,8 @@ int main(int argc, char *argv[]) {
   /* Initialize the space with these data. */
   if (myrank == 0) clocks_gettime(&tic);
   struct space s;
-  space_init(&s, params, dim, parts, gparts, Ngas, Ngpart, periodic, talking,
-             dry_run);
+  space_init(&s, params, dim, parts, gparts, Ngas, Ngpart, periodic,
+             with_self_gravity, talking, dry_run);
   if (myrank == 0) {
     clocks_gettime(&toc);
     message("space_init took %.3f %s.", clocks_diff(&tic, &toc),
diff --git a/src/Makefile.am b/src/Makefile.am
index c8554ce61940650898cfb21e282e0a6d174c79f1..4aa7278ba5162ec106fe25f10529f766f9a703a8 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,7 +19,7 @@
 AM_CFLAGS = -DTIMER $(HDF5_CPPFLAGS)
 
 # Assign a "safe" version number
-AM_LDFLAGS = $(HDF5_LDFLAGS) -version-info 0:0:0
+AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0
 
 # The git command, if available.
 GIT_CMD = @GIT_CMD@
@@ -48,17 +48,18 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
 AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
     serial_io.c timers.c debug.c scheduler.c proxy.c parallel_io.c \
     units.c common_io.c single_io.c multipole.c version.c map.c \
-    kernel_hydro.c kernel_gravity.c tools.c part.c partition.c clocks.c parser.c \
-    physical_constants.c potentials.c hydro_properties.c
+    kernel_hydro.c tools.c part.c partition.c clocks.c parser.c \
+    physical_constants.c potentials.c hydro_properties.c \
+    runner_doiact_fft.c
 
 # Include files for distribution, not installation.
 nobase_noinst_HEADERS = approx_math.h atomic.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \
-		 vector.h runner_doiact.h runner_doiact_grav.h units.h intrinsics.h minmax.h kick.h \
-		 timestep.h drift.h adiabatic_index.h io_properties.h \
+		 kernel_long_gravity.h vector.h runner_doiact.h runner_doiact_grav.h runner_doiact_fft.h \
+                 units.h intrinsics.h minmax.h kick.h timestep.h drift.h adiabatic_index.h io_properties.h \
 		 gravity.h gravity_io.h \
 		 gravity/Default/gravity.h gravity/Default/gravity_iact.h gravity/Default/gravity_io.h \
 		 gravity/Default/gravity_debug.h gravity/Default/gravity_part.h  \
-                 hydro.h hydro_io.h \
+	 	 hydro.h hydro_io.h \
 		 hydro/Minimal/hydro.h hydro/Minimal/hydro_iact.h hydro/Minimal/hydro_io.h \
                  hydro/Minimal/hydro_debug.h hydro/Minimal/hydro_part.h \
 		 hydro/Default/hydro.h hydro/Default/hydro_iact.h hydro/Default/hydro_io.h \
diff --git a/src/cell.c b/src/cell.c
index 2796a8933e1fedc2522c833570558adda369140b..7df55ce04ca739da2de6e6061048ad1fe3998a1e 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -709,6 +709,92 @@ void cell_clean_links(struct cell *c, void *data) {
 }
 
 /**
+<<<<<<< HEAD
+ * @brief Checks whether the cells are direct neighbours ot not. Both cells have
+ * to be of the same size
+ *
+ * @param ci First #cell.
+ * @param cj Second #cell.
+ *
+ * @todo Deal with periodicity.
+ */
+int cell_are_neighbours(const struct cell *restrict ci,
+                        const struct cell *restrict cj) {
+
+#ifdef SWIFT_DEBUG_CHECKS
+  if (ci->width[0] != cj->width[0]) error("Cells of different size !");
+#endif
+
+  /* Maximum allowed distance */
+  const double min_dist =
+      1.2 * ci->width[0]; /* 1.2 accounts for rounding errors */
+
+  /* (Manhattan) Distance between the cells */
+  for (int k = 0; k < 3; k++) {
+    const double center_i = ci->loc[k];
+    const double center_j = cj->loc[k];
+    if (fabsf(center_i - center_j) > min_dist) return 0;
+  }
+
+  return 1;
+}
+
+/**
+ * @brief Computes the multi-pole brutally and compare to the
+ * recursively computed one.
+ *
+ * @param c Cell to act upon
+ * @param data Unused parameter
+ */
+void cell_check_multipole(struct cell *c, void *data) {
+
+  struct multipole ma;
+
+  if (c->gcount > 0) {
+
+    /* Brute-force calculation */
+    multipole_init(&ma, c->gparts, c->gcount);
+
+    /* Compare with recursive one */
+    struct multipole mb = c->multipole;
+
+    if (fabsf(ma.mass - mb.mass) / fabsf(ma.mass + mb.mass) > 1e-5)
+      error("Multipole masses are different (%12.15e vs. %12.15e)", ma.mass,
+            mb.mass);
+
+    for (int k = 0; k < 3; ++k)
+      if (fabsf(ma.CoM[k] - mb.CoM[k]) / fabsf(ma.CoM[k] + mb.CoM[k]) > 1e-5)
+        error("Multipole CoM are different (%12.15e vs. %12.15e", ma.CoM[k],
+              mb.CoM[k]);
+
+    if (fabsf(ma.I_xx - mb.I_xx) / fabsf(ma.I_xx + mb.I_xx) > 1e-5 &&
+        ma.I_xx > 1e-9)
+      error("Multipole I_xx are different (%12.15e vs. %12.15e)", ma.I_xx,
+            mb.I_xx);
+    if (fabsf(ma.I_yy - mb.I_yy) / fabsf(ma.I_yy + mb.I_yy) > 1e-5 &&
+        ma.I_yy > 1e-9)
+      error("Multipole I_yy are different (%12.15e vs. %12.15e)", ma.I_yy,
+            mb.I_yy);
+    if (fabsf(ma.I_zz - mb.I_zz) / fabsf(ma.I_zz + mb.I_zz) > 1e-5 &&
+        ma.I_zz > 1e-9)
+      error("Multipole I_zz are different (%12.15e vs. %12.15e)", ma.I_zz,
+            mb.I_zz);
+    if (fabsf(ma.I_xy - mb.I_xy) / fabsf(ma.I_xy + mb.I_xy) > 1e-5 &&
+        ma.I_xy > 1e-9)
+      error("Multipole I_xy are different (%12.15e vs. %12.15e)", ma.I_xy,
+            mb.I_xy);
+    if (fabsf(ma.I_xz - mb.I_xz) / fabsf(ma.I_xz + mb.I_xz) > 1e-5 &&
+        ma.I_xz > 1e-9)
+      error("Multipole I_xz are different (%12.15e vs. %12.15e)", ma.I_xz,
+            mb.I_xz);
+    if (fabsf(ma.I_yz - mb.I_yz) / fabsf(ma.I_yz + mb.I_yz) > 1e-5 &&
+        ma.I_yz > 1e-9)
+      error("Multipole I_yz are different (%12.15e vs. %12.15e)", ma.I_yz,
+            mb.I_yz);
+  }
+}
+
+/*
  * @brief Frees up the memory allocated for this #cell
  */
 void cell_clean(struct cell *c) {
diff --git a/src/cell.h b/src/cell.h
index a14d94d50ce6a4d2d25e7e6005c035cb22a3720d..150718eeb4bd3857e37d517718fe53661033a330 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -197,6 +197,9 @@ void cell_init_parts(struct cell *c, void *data);
 void cell_init_gparts(struct cell *c, void *data);
 void cell_convert_hydro(struct cell *c, void *data);
 void cell_clean_links(struct cell *c, void *data);
+int cell_are_neighbours(const struct cell *restrict ci,
+                        const struct cell *restrict cj);
+void cell_check_multipole(struct cell *c, void *data);
 void cell_clean(struct cell *c);
 
 #endif /* SWIFT_CELL_H */
diff --git a/src/common_io.c b/src/common_io.c
index d3590ae402487eb2f605a1cf3f5722f86704ad88..3c001d9da106a46ef5033c8cdec9346d68c54ecd 100644
--- a/src/common_io.c
+++ b/src/common_io.c
@@ -374,6 +374,9 @@ void writeCodeDescription(hid_t h_file) {
   writeAttribute_s(h_grpcode, "Git Branch", git_branch());
   writeAttribute_s(h_grpcode, "Git Revision", git_revision());
   writeAttribute_s(h_grpcode, "HDF5 library version", hdf5_version());
+#ifdef HAVE_FFTW
+  writeAttribute_s(h_grpcode, "FFTW library version", fftw3_version());
+#endif
 #ifdef WITH_MPI
   writeAttribute_s(h_grpcode, "MPI library", mpi_version());
 #ifdef HAVE_METIS
diff --git a/src/const.h b/src/const.h
index 50d72e3339d7b8f04e3028f09794aff45f03dc65..736ba0155e03dc0f595ac06edfd0763b38302e9c 100644
--- a/src/const.h
+++ b/src/const.h
@@ -36,11 +36,6 @@
 /* Time integration constants. */
 #define const_max_u_change 0.1f
 
-/* Gravity stuff. */
-#define const_theta_max 0.57735f
-#define const_G 6.672e-8f     /* Gravitational constant. */
-#define const_epsilon 0.0014f /* Gravity blending distance. */
-
 /* Hydrodynamical adiabatic index. */
 #define HYDRO_GAMMA_5_3
 //#define HYDRO_GAMMA_4_3
@@ -59,7 +54,13 @@
 #define GADGET2_SPH
 //#define DEFAULT_SPH
 
-/* External potential properties */
+/* Self gravity stuff. */
+#define const_gravity_multipole_order 2
+#define const_gravity_a_smooth 1.25f
+#define const_gravity_r_cut 4.5f
+#define const_gravity_eta 0.025f
+
+/* External gravity properties */
 #define EXTERNAL_POTENTIAL_POINTMASS
 //#define EXTERNAL_POTENTIAL_ISOTHERMALPOTENTIAL
 
diff --git a/src/drift.h b/src/drift.h
index c71cda49b0ad2b18b227fc2d485bbe76f32d5b94..a6e347cc337385f89bece53bc477bff7d163c202 100644
--- a/src/drift.h
+++ b/src/drift.h
@@ -26,6 +26,7 @@
 #include "const.h"
 #include "debug.h"
 #include "hydro.h"
+#include "part.h"
 
 /**
  * @brief Perform the 'drift' operation on a #gpart
diff --git a/src/engine.c b/src/engine.c
index b33850be9415f30f6c0ac38e70a74a7d239b791e..ebd589bbb0af01cec3196c658c4d014daca8df38 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -64,6 +64,7 @@
 #include "serial_io.h"
 #include "single_io.h"
 #include "timers.h"
+#include "tools.h"
 #include "units.h"
 #include "version.h"
 
@@ -129,12 +130,10 @@ void engine_make_gravity_hierarchical_tasks(struct engine *e, struct cell *c,
       engine_policy_external_gravity;
   const int is_fixdt = (e->policy & engine_policy_fixdt) == engine_policy_fixdt;
 
-  /* Am I the super-cell? */
-  /* TODO(pedro): Add a condition for gravity tasks as well. */
-  if (super == NULL &&
-      (c->density != NULL || (!c->split && (c->count > 0 || c->gcount > 0)))) {
+  /* Is this the super-cell? */
+  if (super == NULL && (c->grav != NULL || (!c->split && c->gcount > 0))) {
 
-    /* This is the super cell, i.e. the first with density tasks attached. */
+    /* This is the super cell, i.e. the first with gravity tasks attached. */
     super = c;
 
     /* Local tasks only... */
@@ -1182,9 +1181,62 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
 #endif
 }
 
+/**
+ * @brief Constructs the top-level tasks for the short-range gravity
+ * interactions.
+ *
+ * All top-cells get a self task.
+ * All neighbouring pairs get a pair task.
+ * All non-neighbouring pairs within a range of 6 cells get a M-M task.
+ *
+ * @param e The #engine.
+ */
+void engine_make_gravity_tasks(struct engine *e) {
+
+  struct space *s = e->s;
+  struct scheduler *sched = &e->sched;
+  const int nodeID = e->nodeID;
+  struct cell *cells = s->cells;
+  const int nr_cells = s->nr_cells;
+
+  for (int cid = 0; cid < nr_cells; ++cid) {
+
+    struct cell *ci = &cells[cid];
+
+    /* Skip cells without gravity particles */
+    if (ci->gcount == 0) continue;
+
+    /* Is that neighbour local ? */
+    if (ci->nodeID != nodeID) continue;
+
+    /* If the cells is local build a self-interaction */
+    scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL,
+                      0);
+
+    /* Let's also build a task for all the non-neighbouring pm calculations */
+    scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, 0, 0, ci,
+                      NULL, 0);
+
+    for (int cjd = cid + 1; cjd < nr_cells; ++cjd) {
+
+      struct cell *cj = &cells[cjd];
+
+      /* Skip cells without gravity particles */
+      if (cj->gcount == 0) continue;
+
+      /* Is that neighbour local ? */
+      if (cj->nodeID != nodeID) continue;
+
+      if (cell_are_neighbours(ci, cj))
+        scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0, 0, ci,
+                          cj, 1);
+    }
+  }
+}
+
 /**
  * @brief Constructs the top-level pair tasks for the first hydro loop over
- *neighbours
+ * neighbours
  *
  * Here we construct all the tasks for all possible neighbouring non-empty
  * local cells in the hierarchy. No dependencies are being added thus far.
@@ -1316,6 +1368,115 @@ void engine_count_and_link_tasks(struct engine *e) {
   }
 }
 
+/**
+ * @brief Creates the dependency network for the gravity tasks of a given cell.
+ *
+ * @param sched The #scheduler.
+ * @param gravity The gravity task to link.
+ * @param c The cell.
+ */
+static inline void engine_make_gravity_dependencies(struct scheduler *sched,
+                                                    struct task *gravity,
+                                                    struct cell *c) {
+
+  /* init --> gravity --> kick */
+  scheduler_addunlock(sched, c->super->init, gravity);
+  scheduler_addunlock(sched, gravity, c->super->kick);
+
+  /* grav_up --> gravity ( --> kick) */
+  scheduler_addunlock(sched, c->super->grav_up, gravity);
+}
+
+/**
+ * @brief Creates all the task dependencies for the gravity
+ *
+ * @param e The #engine
+ */
+void engine_link_gravity_tasks(struct engine *e) {
+
+  struct scheduler *sched = &e->sched;
+  const int nodeID = e->nodeID;
+  const int nr_tasks = sched->nr_tasks;
+
+  /* Add one task gathering all the multipoles */
+  struct task *gather = scheduler_addtask(
+      sched, task_type_grav_gather_m, task_subtype_none, 0, 0, NULL, NULL, 0);
+
+  /* And one task performing the FFT */
+  struct task *fft = scheduler_addtask(sched, task_type_grav_fft,
+                                       task_subtype_none, 0, 0, NULL, NULL, 0);
+
+  scheduler_addunlock(sched, gather, fft);
+
+  for (int k = 0; k < nr_tasks; k++) {
+
+    /* Get a pointer to the task. */
+    struct task *t = &sched->tasks[k];
+
+    /* Skip? */
+    if (t->skip) continue;
+
+    /* Multipole construction */
+    if (t->type == task_type_grav_up) {
+      scheduler_addunlock(sched, t, gather);
+    }
+
+    /* Long-range interaction */
+    if (t->type == task_type_grav_mm) {
+
+      /* Gather the multipoles --> mm interaction --> kick */
+      scheduler_addunlock(sched, gather, t);
+      scheduler_addunlock(sched, t, t->ci->super->kick);
+
+      /* init --> mm interaction */
+      scheduler_addunlock(sched, t->ci->super->init, t);
+    }
+
+    /* Self-interaction? */
+    if (t->type == task_type_self && t->subtype == task_subtype_grav) {
+
+      engine_make_gravity_dependencies(sched, t, t->ci);
+
+    }
+
+    /* Otherwise, pair interaction? */
+    else if (t->type == task_type_pair && t->subtype == task_subtype_grav) {
+
+      if (t->ci->nodeID == nodeID) {
+
+        engine_make_gravity_dependencies(sched, t, t->ci);
+      }
+
+      if (t->cj->nodeID == nodeID && t->ci->super != t->cj->super) {
+
+        engine_make_gravity_dependencies(sched, t, t->cj);
+      }
+
+    }
+
+    /* Otherwise, sub-self interaction? */
+    else if (t->type == task_type_sub_self && t->subtype == task_subtype_grav) {
+
+      if (t->ci->nodeID == nodeID) {
+        engine_make_gravity_dependencies(sched, t, t->ci);
+      }
+    }
+
+    /* Otherwise, sub-pair interaction? */
+    else if (t->type == task_type_sub_pair && t->subtype == task_subtype_grav) {
+
+      if (t->ci->nodeID == nodeID) {
+
+        engine_make_gravity_dependencies(sched, t, t->ci);
+      }
+      if (t->cj->nodeID == nodeID && t->ci->super != t->cj->super) {
+
+        engine_make_gravity_dependencies(sched, t, t->cj);
+      }
+    }
+  }
+}
+
 /**
  * @brief Creates the dependency network for the hydro tasks of a given cell.
  *
@@ -1453,41 +1614,6 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
   }
 }
 
-/**
- * @brief Constructs the top-level pair tasks for the gravity M-M interactions
- *
- * Correct implementation is still lacking here.
- *
- * @param e The #engine.
- */
-void engine_make_gravityinteraction_tasks(struct engine *e) {
-
-  struct space *s = e->s;
-  struct scheduler *sched = &e->sched;
-  const int nr_cells = s->nr_cells;
-  struct cell *cells = s->cells;
-
-  /* Loop over all cells. */
-  for (int i = 0; i < nr_cells; i++) {
-
-    /* If it has gravity particles, add a self-task */
-    if (cells[i].gcount > 0) {
-      scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0,
-                        &cells[i], NULL, 0);
-
-      /* Loop over all remainding cells */
-      for (int j = i + 1; j < nr_cells; j++) {
-
-        /* If that other cell has gravity parts, add a pair interaction */
-        if (cells[j].gcount > 0) {
-          scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0,
-                            &cells[i], &cells[j], 0);
-        }
-      }
-    }
-  }
-}
-
 /**
  * @brief Constructs the gravity tasks building the multipoles and propagating
  *them to the children
@@ -1513,9 +1639,12 @@ void engine_make_gravityrecursive_tasks(struct engine *e) {
       struct task *up =
           scheduler_addtask(sched, task_type_grav_up, task_subtype_none, 0, 0,
                             &cells[k], NULL, 0);
-      struct task *down =
-          scheduler_addtask(sched, task_type_grav_down, task_subtype_none, 0, 0,
-                            &cells[k], NULL, 0);
+
+      struct task *down = NULL;
+      /* struct task *down = */
+      /*     scheduler_addtask(sched, task_type_grav_down, task_subtype_none, 0,
+       * 0, */
+      /*                       &cells[k], NULL, 0); */
 
       /* Push tasks down the cell hierarchy. */
       engine_addtasks_grav(e, &cells[k], up, down);
@@ -1548,11 +1677,10 @@ void engine_maketasks(struct engine *e) {
   }
 
   /* Construct the firt hydro loop over neighbours */
-  engine_make_hydroloop_tasks(e);
+  if (e->policy & engine_policy_hydro) engine_make_hydroloop_tasks(e);
 
   /* Add the gravity mm tasks. */
-  if (e->policy & engine_policy_self_gravity)
-    engine_make_gravityinteraction_tasks(e);
+  if (e->policy & engine_policy_self_gravity) engine_make_gravity_tasks(e);
 
   /* Split the tasks. */
   scheduler_splittasks(sched);
@@ -1573,7 +1701,7 @@ void engine_maketasks(struct engine *e) {
   /* Count the number of tasks associated with each cell and
      store the density tasks in each cell, and make each sort
      depend on the sorts of its progeny. */
-  engine_count_and_link_tasks(e);
+  if (e->policy & engine_policy_hydro) engine_count_and_link_tasks(e);
 
   /* Append hierarchical tasks to each cells */
   if (e->policy & engine_policy_hydro)
@@ -1588,7 +1716,12 @@ void engine_maketasks(struct engine *e) {
   /* Run through the tasks and make force tasks for each density task.
      Each force task depends on the cell ghosts and unlocks the kick task
      of its super-cell. */
-  engine_make_extra_hydroloop_tasks(e);
+  if (e->policy & engine_policy_hydro) engine_make_extra_hydroloop_tasks(e);
+
+  /* Add the dependencies for the self-gravity stuff */
+  if (e->policy & engine_policy_self_gravity) engine_link_gravity_tasks(e);
+
+#ifdef WITH_MPI
 
   /* Add the communication tasks if MPI is being used. */
   if (e->policy & engine_policy_mpi) {
@@ -1611,6 +1744,7 @@ void engine_maketasks(struct engine *e) {
                              NULL);
     }
   }
+#endif
 
   /* Set the unlocks per task. */
   scheduler_set_unlocks(sched);
@@ -1740,7 +1874,7 @@ int engine_marktasks(struct engine *e) {
           continue;
 
         /* Set the sort flags. */
-        if (t->type == task_type_pair) {
+        if (t->type == task_type_pair && t->subtype != task_subtype_grav) {
           if (!(ci->sorted & (1 << t->flags))) {
             ci->sorts->flags |= (1 << t->flags);
             ci->sorts->skip = 0;
@@ -2144,6 +2278,7 @@ void engine_collect_drift(struct cell *c) {
   c->ang_mom[1] = ang_mom[1];
   c->ang_mom[2] = ang_mom[2];
 }
+
 /**
  * @brief Print the conserved quantities statistics to a log file
  *
@@ -2315,7 +2450,16 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
   /* Add the tasks corresponding to self-gravity to the masks */
   if (e->policy & engine_policy_self_gravity) {
 
-    /* Nothing here for now */
+    mask |= 1 << task_type_grav_up;
+    mask |= 1 << task_type_grav_mm;
+    mask |= 1 << task_type_grav_gather_m;
+    mask |= 1 << task_type_grav_fft;
+    mask |= 1 << task_type_self;
+    mask |= 1 << task_type_pair;
+    mask |= 1 << task_type_sub_self;
+    mask |= 1 << task_type_sub_pair;
+
+    submask |= 1 << task_subtype_grav;
   }
 
   /* Add the tasks corresponding to external gravity to the masks */
@@ -2326,6 +2470,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
 
   /* Add MPI tasks if need be */
   if (e->policy & engine_policy_mpi) {
+
     mask |= 1 << task_type_send;
     mask |= 1 << task_type_recv;
     submask |= 1 << task_subtype_tend;
@@ -2450,7 +2595,16 @@ void engine_step(struct engine *e) {
   /* Add the tasks corresponding to self-gravity to the masks */
   if (e->policy & engine_policy_self_gravity) {
 
-    /* Nothing here for now */
+    mask |= 1 << task_type_grav_up;
+    mask |= 1 << task_type_grav_mm;
+    mask |= 1 << task_type_grav_gather_m;
+    mask |= 1 << task_type_grav_fft;
+    mask |= 1 << task_type_self;
+    mask |= 1 << task_type_pair;
+    mask |= 1 << task_type_sub_self;
+    mask |= 1 << task_type_sub_pair;
+
+    submask |= 1 << task_subtype_grav;
   }
 
   /* Add the tasks corresponding to external gravity to the masks */
@@ -2460,6 +2614,7 @@ void engine_step(struct engine *e) {
 
   /* Add MPI tasks if need be */
   if (e->policy & engine_policy_mpi) {
+
     mask |= 1 << task_type_send;
     mask |= 1 << task_type_recv;
     submask |= 1 << task_subtype_tend;
diff --git a/src/gravity/Default/gravity.h b/src/gravity/Default/gravity.h
index 32af9b6c67fb9d78d70ae6f41e8b4a32bfb1d8c8..5903ce1384f616a123c74b579539fe53747d17e4 100644
--- a/src/gravity/Default/gravity.h
+++ b/src/gravity/Default/gravity.h
@@ -53,8 +53,6 @@ gravity_compute_timestep_external(const struct external_potential* potential,
 /**
  * @brief Computes the gravity time-step of a given particle due to self-gravity
  *
- * This function only branches towards the potential chosen by the user.
- *
  * @param phys_const The physical constants in internal units.
  * @param gp Pointer to the g-particle data.
  */
@@ -62,7 +60,13 @@ __attribute__((always_inline)) INLINE static float
 gravity_compute_timestep_self(const struct phys_const* const phys_const,
                               const struct gpart* const gp) {
 
-  float dt = FLT_MAX;
+  const float ac2 = gp->a_grav[0] * gp->a_grav[0] +
+                    gp->a_grav[1] * gp->a_grav[1] +
+                    gp->a_grav[2] * gp->a_grav[2];
+
+  const float ac = (ac2 > 0.f) ? sqrtf(ac2) : FLT_MIN;
+
+  const float dt = sqrt(2.f * const_gravity_eta * gp->epsilon / ac);
 
   return dt;
 }
@@ -76,7 +80,9 @@ gravity_compute_timestep_self(const struct phys_const* const phys_const,
  * @param gp The particle to act upon
  */
 __attribute__((always_inline)) INLINE static void gravity_first_init_gpart(
-    struct gpart* gp) {}
+    struct gpart* gp) {
+  gp->epsilon = 0.;  // MATTHIEU
+}
 
 /**
  * @brief Prepares a g-particle for the gravity calculation
@@ -101,9 +107,16 @@ __attribute__((always_inline)) INLINE static void gravity_init_gpart(
  * Multiplies the forces and accelerations by the appropiate constants
  *
  * @param gp The particle to act upon
+ * @param const_G Newton's constant
  */
 __attribute__((always_inline)) INLINE static void gravity_end_force(
-    struct gpart* gp) {}
+    struct gpart* gp, double const_G) {
+
+  /* Let's get physical... */
+  gp->a_grav[0] *= const_G;
+  gp->a_grav[1] *= const_G;
+  gp->a_grav[2] *= const_G;
+}
 
 /**
  * @brief Computes the gravitational acceleration induced by external potentials
diff --git a/src/gravity/Default/gravity_iact.h b/src/gravity/Default/gravity_iact.h
index dd6878841d7d0632ab8e4f218044516d9895cab8..aa285ce3245aeac608aa022924a613a7a7b00375 100644
--- a/src/gravity/Default/gravity_iact.h
+++ b/src/gravity/Default/gravity_iact.h
@@ -23,93 +23,168 @@
 /* Includes. */
 #include "const.h"
 #include "kernel_gravity.h"
+#include "kernel_long_gravity.h"
+#include "multipole.h"
 #include "vector.h"
 
 /**
- * @brief Gravity potential
+ * @brief Gravity forces between particles
  */
-__attribute__((always_inline)) INLINE static void runner_iact_grav(
-    float r2, float *dx, struct gpart *pi, struct gpart *pj) {
+__attribute__((always_inline)) INLINE static void runner_iact_grav_pp(
+    float rlr_inv, float r2, const float *dx, struct gpart *gpi,
+    struct gpart *gpj) {
 
-  float ir, r;
-  float w, acc;
-  float mi = pi->mass, mj = pj->mass;
-  int k;
+  /* Apply the gravitational acceleration. */
+  const float r = sqrtf(r2);
+  const float ir = 1.f / r;
+  const float mi = gpi->mass;
+  const float mj = gpj->mass;
+  const float hi = gpi->epsilon;
+  const float hj = gpj->epsilon;
+  const float u = r * rlr_inv;
+  float f_lr, fi, fj, W;
 
-  /* Get the absolute distance. */
-  ir = 1.0f / sqrtf(r2);
-  r = r2 * ir;
+  /* Get long-range correction */
+  kernel_long_grav_eval(u, &f_lr);
 
-  /* Evaluate the gravity kernel. */
-  kernel_grav_eval(r, &acc);
+  if (r >= hi) {
 
-  /* Scale the acceleration. */
-  acc *= const_G * ir * ir * ir;
+    /* Get Newtonian gravity */
+    fi = mj * ir * ir * ir * f_lr;
 
-  /* Aggregate the accelerations. */
-  for (k = 0; k < 3; k++) {
-    w = acc * dx[k];
-    pi->a_grav[k] -= w * mj;
-    pj->a_grav[k] += w * mi;
+  } else {
+
+    const float hi_inv = 1.f / hi;
+    const float hi_inv3 = hi_inv * hi_inv * hi_inv;
+    const float ui = r * hi_inv;
+
+    kernel_grav_eval(ui, &W);
+
+    /* Get softened gravity */
+    fi = mj * hi_inv3 * W * f_lr;
   }
+
+  if (r >= hj) {
+
+    /* Get Newtonian gravity */
+    fj = mi * ir * ir * ir * f_lr;
+
+  } else {
+
+    const float hj_inv = 1.f / hj;
+    const float hj_inv3 = hj_inv * hj_inv * hj_inv;
+    const float uj = r * hj_inv;
+
+    kernel_grav_eval(uj, &W);
+
+    /* Get softened gravity */
+    fj = mi * hj_inv3 * W * f_lr;
+  }
+
+  const float fidx[3] = {fi * dx[0], fi * dx[1], fi * dx[2]};
+  gpi->a_grav[0] -= fidx[0];
+  gpi->a_grav[1] -= fidx[1];
+  gpi->a_grav[2] -= fidx[2];
+
+  const float fjdx[3] = {fj * dx[0], fj * dx[1], fj * dx[2]};
+  gpj->a_grav[0] += fjdx[0];
+  gpj->a_grav[1] += fjdx[1];
+  gpj->a_grav[2] += fjdx[2];
 }
 
 /**
- * @brief Gravity potential (Vectorized version)
+ * @brief Gravity forces between particles (non-symmetric version)
  */
-__attribute__((always_inline)) INLINE static void runner_iact_vec_grav(
-    float *R2, float *Dx, struct gpart **pi, struct gpart **pj) {
-
-#ifdef WITH_VECTORIZATION
-
-  vector ir, r, r2, dx[3];
-  vector w, acc, ai, aj;
-  vector mi, mj;
-  int j, k;
-
-#if VEC_SIZE == 8
-  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
-                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
-  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
-                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
-  for (k = 0; k < 3; k++)
-    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
-                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
-#elif VEC_SIZE == 4
-  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
-  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
-  for (k = 0; k < 3; k++)
-    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
-#endif
+__attribute__((always_inline)) INLINE static void runner_iact_grav_pp_nonsym(
+    float rlr_inv, float r2, const float *dx, struct gpart *gpi,
+    const struct gpart *gpj) {
 
-  /* Get the radius and inverse radius. */
-  r2.v = vec_load(R2);
-  ir.v = vec_rsqrt(r2.v);
-  ir.v = ir.v - vec_set1(0.5f) * ir.v * (r2.v * ir.v * ir.v - vec_set1(1.0f));
-  r.v = r2.v * ir.v;
-
-  /* Evaluate the gravity kernel. */
-  blender_eval_vec(&r, &acc);
-
-  /* Scale the acceleration. */
-  acc.v *= vec_set1(const_G) * ir.v * ir.v * ir.v;
-
-  /* Aggregate the accelerations. */
-  for (k = 0; k < 3; k++) {
-    w.v = acc.v * dx[k].v;
-    ai.v = w.v * mj.v;
-    aj.v = w.v * mi.v;
-    for (j = 0; j < VEC_SIZE; j++) {
-      pi[j]->a_grav[k] -= ai.f[j];
-      pj[j]->a_grav[k] += aj.f[j];
-    }
+  /* Apply the gravitational acceleration. */
+  const float r = sqrtf(r2);
+  const float ir = 1.f / r;
+  const float mj = gpj->mass;
+  const float hi = gpi->epsilon;
+  const float u = r * rlr_inv;
+  float f_lr, f, W;
+
+  /* Get long-range correction */
+  kernel_long_grav_eval(u, &f_lr);
+
+  if (r >= hi) {
+
+    /* Get Newtonian gravity */
+    f = mj * ir * ir * ir * f_lr;
+
+  } else {
+
+    const float hi_inv = 1.f / hi;
+    const float hi_inv3 = hi_inv * hi_inv * hi_inv;
+    const float ui = r * hi_inv;
+
+    kernel_grav_eval(ui, &W);
+
+    /* Get softened gravity */
+    f = mj * hi_inv3 * W * f_lr;
   }
 
-#else
+  const float fdx[3] = {f * dx[0], f * dx[1], f * dx[2]};
+
+  gpi->a_grav[0] -= fdx[0];
+  gpi->a_grav[1] -= fdx[1];
+  gpi->a_grav[2] -= fdx[2];
+}
 
-  for (int k = 0; k < VEC_SIZE; k++)
-    runner_iact_grav(R2[k], &Dx[3 * k], pi[k], pj[k]);
+/**
+ * @brief Gravity forces between particle and multipole
+ */
+__attribute__((always_inline)) INLINE static void runner_iact_grav_pm(
+    float rlr_inv, float r2, const float *dx, struct gpart *gp,
+    const struct multipole *multi) {
+
+  /* Apply the gravitational acceleration. */
+  const float r = sqrtf(r2);
+  const float ir = 1.f / r;
+  const float u = r * rlr_inv;
+  const float mrinv3 = multi->mass * ir * ir * ir;
+
+  /* Get long-range correction */
+  float f_lr;
+  kernel_long_grav_eval(u, &f_lr);
+
+#if const_gravity_multipole_order < 2
+
+  /* 0th and 1st order terms */
+  gp->a_grav[0] += mrinv3 * f_lr * dx[0];
+  gp->a_grav[1] += mrinv3 * f_lr * dx[1];
+  gp->a_grav[2] += mrinv3 * f_lr * dx[2];
+
+#elif const_gravity_multipole_order == 2
+  /* Terms up to 2nd order (quadrupole) */
+
+  /* Follows the notation in Bonsai */
+  const float mrinv5 = mrinv3 * ir * ir;
+  const float mrinv7 = mrinv5 * ir * ir;
+
+  const float D1 = -mrinv3;
+  const float D2 = 3.f * mrinv5;
+  const float D3 = -15.f * mrinv7;
+
+  const float q = multi->I_xx + multi->I_yy + multi->I_zz;
+  const float qRx =
+      multi->I_xx * dx[0] + multi->I_xy * dx[1] + multi->I_xz * dx[2];
+  const float qRy =
+      multi->I_xy * dx[0] + multi->I_yy * dx[1] + multi->I_yz * dx[2];
+  const float qRz =
+      multi->I_xz * dx[0] + multi->I_yz * dx[1] + multi->I_zz * dx[2];
+  const float qRR = qRx * dx[0] + qRy * dx[1] + qRz * dx[2];
+  const float C = D1 + 0.5f * D2 * q + 0.5f * D3 * qRR;
+
+  gp->a_grav[0] -= f_lr * (C * dx[0] + D2 * qRx);
+  gp->a_grav[1] -= f_lr * (C * dx[1] + D2 * qRy);
+  gp->a_grav[2] -= f_lr * (C * dx[2] + D2 * qRz);
 
+#else
+#error "Multipoles of order >2 not yet implemented."
 #endif
 }
 
diff --git a/src/gravity/Default/gravity_part.h b/src/gravity/Default/gravity_part.h
index b6c9e62559207cb25323c8a108e4bffc87ea0fcf..77e6543429993bea355c04dd28fcb5b04eee5519 100644
--- a/src/gravity/Default/gravity_part.h
+++ b/src/gravity/Default/gravity_part.h
@@ -37,16 +37,15 @@ struct gpart {
   /* Particle mass. */
   float mass;
 
+  /* Softening length */
+  float epsilon;
+
   /* Particle time of beginning of time-step. */
   int ti_begin;
 
   /* Particle time of end of time-step. */
   int ti_end;
 
-  /* /\* current time of x, and of v_full *\/ */
-  /* float tx; */
-  /* float tv; */
-
   /* Particle ID. If negative, it is the negative offset of the #part with
      which this gpart is linked. */
   long long id_or_neg_offset;
diff --git a/src/kernel_gravity.c b/src/kernel_gravity.c
deleted file mode 100644
index 639a964c813ef7fd95008857ee17b7dd5ffafb27..0000000000000000000000000000000000000000
--- a/src/kernel_gravity.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*******************************************************************************
- * This file is part of SWIFT.
- * Copyright (c) 2015 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
- *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- ******************************************************************************/
-
-#include <math.h>
-#include <stdio.h>
-
-#include "kernel_gravity.h"
-
-/**
- * @brief The Gadget-2 gravity kernel function
- *
- * @param r The distance between particles
- */
-float gadget(float r) {
-  float fac, h_inv, u, r2 = r * r;
-  if (r >= const_epsilon)
-    fac = 1.0f / (r2 * r);
-  else {
-    h_inv = 1. / const_epsilon;
-    u = r * h_inv;
-    if (u < 0.5)
-      fac = const_iepsilon3 * (10.666666666667 + u * u * (32.0 * u - 38.4));
-    else
-      fac = const_iepsilon3 *
-            (21.333333333333 - 48.0 * u + 38.4 * u * u -
-             10.666666666667 * u * u * u - 0.066666666667 / (u * u * u));
-  }
-  return const_G * fac;
-}
-
-/**
- * @brief Test the gravity kernel function by dumping it in the interval [0,1].
- *
- * @param r_max The radius up to which the kernel is dumped.
- * @param N number of intervals in [0,1].
- */
-void gravity_kernel_dump(float r_max, int N) {
-
-  int k;
-  float x, w;
-  float x4[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-  float w4[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-  // float dw_dx4[4] __attribute__ ((aligned (16)));
-
-  for (k = 1; k <= N; k++) {
-    x = (r_max * k) / N;
-    x4[3] = x4[2];
-    x4[2] = x4[1];
-    x4[1] = x4[0];
-    x4[0] = x;
-    kernel_grav_eval(x, &w);
-    w *= const_G / (x * x * x);
-    // blender_deval_vec( (vector *)x4 , (vector *)w4 , (vector *)dw_dx4 );
-    printf(" %.16e %.16e %.16e %.16e %.16e %.16e %.16e\n", x, w * x, w4[0],
-           w4[1], w4[2], w4[3], gadget(x) * x);
-  }
-}
diff --git a/src/kernel_gravity.h b/src/kernel_gravity.h
index 0c786f5189b882ed4593ef4a927edbf2a8cdd6e7..b38feb5758debf87add2007ee3684d869f393f7e 100644
--- a/src/kernel_gravity.h
+++ b/src/kernel_gravity.h
@@ -20,203 +20,70 @@
 #ifndef SWIFT_KERNEL_GRAVITY_H
 #define SWIFT_KERNEL_GRAVITY_H
 
+#include <math.h>
+
 /* Includes. */
 #include "const.h"
 #include "inline.h"
 #include "vector.h"
 
-#define const_iepsilon (1. / const_epsilon)
-#define const_iepsilon2 (const_iepsilon * const_iepsilon)
-#define const_iepsilon3 (const_iepsilon2 * const_iepsilon)
-#define const_iepsilon4 (const_iepsilon2 * const_iepsilon2)
-#define const_iepsilon5 (const_iepsilon3 * const_iepsilon2)
-#define const_iepsilon6 (const_iepsilon3 * const_iepsilon3)
-
 /* The gravity kernel is defined as a degree 6 polynomial in the distance
    r. The resulting value should be post-multiplied with r^-3, resulting
    in a polynomial with terms ranging from r^-3 to r^3, which are
    sufficient to model both the direct potential as well as the splines
-   near the origin. */
-
-/* Coefficients for the gravity kernel. */
-#define kernel_grav_degree 6
-#define kernel_grav_ivals 2
-#define kernel_grav_scale (2 * const_iepsilon)
-static float
-    kernel_grav_coeffs[(kernel_grav_degree + 1) * (kernel_grav_ivals + 1)] = {
-        32.0f * const_iepsilon6,
-        -192.0f / 5.0f * const_iepsilon5,
-        0.0f,
-        32.0f / 3.0f * const_iepsilon3,
-        0.0f,
-        0.0f,
-        0.0f,
-        -32.0f / 3.0f * const_iepsilon6,
-        192.0f / 5.0f * const_iepsilon5,
-        -48.0f * const_iepsilon4,
-        64.0f / 3.0f * const_iepsilon3,
-        0.0f,
-        0.0f,
-        -1.0f / 15.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        1.0f};
+   near the origin.
+   As in the hydro case, the 1/h^3 needs to be multiplied in afterwards */
+
+/* Coefficients for the kernel. */
+#define kernel_grav_name "Gadget-2 softening kernel"
+#define kernel_grav_degree 6 /* Degree of the polynomial */
+#define kernel_grav_ivals 2  /* Number of branches */
+static const float
+    kernel_grav_coeffs[(kernel_grav_degree + 1) * (kernel_grav_ivals + 1)]
+    __attribute__((aligned(16))) = {32.f,
+                                    -38.4f,
+                                    0.f,
+                                    10.66666667f,
+                                    0.f,
+                                    0.f,
+                                    0.f, /* 0 < u < 0.5 */
+                                    -10.66666667f,
+                                    38.4f,
+                                    -48.f,
+                                    21.3333333,
+                                    0.f,
+                                    0.f,
+                                    -0.066666667f, /* 0.5 < u < 1 */
+                                    0.f,
+                                    0.f,
+                                    0.f,
+                                    0.f,
+                                    0.f,
+                                    0.f,
+                                    0.f}; /* 1 < u */
 
 /**
- * @brief Computes the gravity cubic spline for a given distance x.
- */
-
-__attribute__((always_inline)) INLINE static void kernel_grav_eval(float x,
-                                                                   float *W) {
-  int ind = fmin(x * kernel_grav_scale, kernel_grav_ivals);
-  float *coeffs = &kernel_grav_coeffs[ind * (kernel_grav_degree + 1)];
-  float w = coeffs[0] * x + coeffs[1];
-  for (int k = 2; k <= kernel_grav_degree; k++) w = x * w + coeffs[k];
-  *W = w;
-}
-
-#ifdef WITH_VECTORIZATION
-
-/**
- * @brief Computes the gravity cubic spline for a given distance x (Vectorized
- * version).
- */
-
-__attribute__((always_inline)) INLINE static void kernel_grav_eval_vec(
-    vector *x, vector *w) {
-
-  vector ind, c[kernel_grav_degree + 1];
-  int j, k;
-
-  /* Load x and get the interval id. */
-  ind.m = vec_ftoi(vec_fmin(x->v * vec_set1(kernel_grav_scale),
-                            vec_set1((float)kernel_grav_ivals)));
-
-  /* load the coefficients. */
-  for (k = 0; k < VEC_SIZE; k++)
-    for (j = 0; j < kernel_grav_degree + 1; j++)
-      c[j].f[k] = kernel_grav_coeffs[ind.i[k] * (kernel_grav_degree + 1) + j];
-
-  /* Init the iteration for Horner's scheme. */
-  w->v = (c[0].v * x->v) + c[1].v;
-
-  /* And we're off! */
-  for (int k = 2; k <= kernel_grav_degree; k++) w->v = (x->v * w->v) + c[k].v;
-}
-
-#endif
-
-/* Blending function stuff
- * --------------------------------------------------------------------------------------------
- */
-
-/* Coefficients for the blending function. */
-#define blender_degree 3
-#define blender_ivals 3
-#define blender_scale 4.0f
-static float blender_coeffs[(blender_degree + 1) * (blender_ivals + 1)] = {
-    0.0f,   0.0f,  0.0f,   1.0f,  -32.0f, 24.0f, -6.0f, 1.5f,
-    -32.0f, 72.0f, -54.0f, 13.5f, 0.0f,   0.0f,  0.0f,  0.0f};
-
-/**
- * @brief Computes the cubic spline blender for a given distance x.
- */
-
-__attribute__((always_inline)) INLINE static void blender_eval(float x,
-                                                               float *W) {
-  int ind = fmin(x * blender_scale, blender_ivals);
-  float *coeffs = &blender_coeffs[ind * (blender_degree + 1)];
-  float w = coeffs[0] * x + coeffs[1];
-  for (int k = 2; k <= blender_degree; k++) w = x * w + coeffs[k];
-  *W = w;
-}
-
-/**
- * @brief Computes the cubic spline blender and its derivative for a given
- * distance x.
- */
-
-__attribute__((always_inline)) INLINE static void blender_deval(float x,
-                                                                float *W,
-                                                                float *dW_dx) {
-  int ind = fminf(x * blender_scale, blender_ivals);
-  float *coeffs = &blender_coeffs[ind * (blender_degree + 1)];
-  float w = coeffs[0] * x + coeffs[1];
-  float dw_dx = coeffs[0];
-  for (int k = 2; k <= blender_degree; k++) {
-    dw_dx = dw_dx * x + w;
-    w = x * w + coeffs[k];
-  }
-  *W = w;
-  *dW_dx = dw_dx;
-}
-
-#ifdef WITH_VECTORIZATION
-
-/**
- * @brief Computes the cubic spline blender and its derivative for a given
- * distance x (Vectorized version). Gives a sensible answer only if x<2.
- */
-
-__attribute__((always_inline)) INLINE static void blender_eval_vec(vector *x,
-                                                                   vector *w) {
-
-  vector ind, c[blender_degree + 1];
-  int j, k;
-
-  /* Load x and get the interval id. */
-  ind.m = vec_ftoi(
-      vec_fmin(x->v * vec_set1(blender_scale), vec_set1((float)blender_ivals)));
-
-  /* load the coefficients. */
-  for (k = 0; k < VEC_SIZE; k++)
-    for (j = 0; j < blender_degree + 1; j++)
-      c[j].f[k] = blender_coeffs[ind.i[k] * (blender_degree + 1) + j];
-
-  /* Init the iteration for Horner's scheme. */
-  w->v = (c[0].v * x->v) + c[1].v;
-
-  /* And we're off! */
-  for (int k = 2; k <= blender_degree; k++) w->v = (x->v * w->v) + c[k].v;
-}
-
-/**
- * @brief Computes the cubic spline blender and its derivative for a given
- * distance x (Vectorized version). Gives a sensible answer only if x<2.
+ * @brief Computes the gravity softening function.
+ *
+ * @param u The ratio of the distance to the softening length $u = x/h$.
+ * @param W (return) The value of the kernel function $W(x,h)$.
  */
+__attribute__((always_inline)) INLINE static void kernel_grav_eval(
+    float u, float *const W) {
 
-__attribute__((always_inline)) INLINE static void blender_deval_vec(
-    vector *x, vector *w, vector *dw_dx) {
-
-  vector ind, c[blender_degree + 1];
-  int j, k;
+  /* Pick the correct branch of the kernel */
+  const int ind = (int)fminf(u * (float)kernel_grav_ivals, kernel_grav_ivals);
+  const float *const coeffs =
+      &kernel_grav_coeffs[ind * (kernel_grav_degree + 1)];
 
-  /* Load x and get the interval id. */
-  ind.m = vec_ftoi(
-      vec_fmin(x->v * vec_set1(blender_scale), vec_set1((float)blender_ivals)));
+  /* First two terms of the polynomial ... */
+  float w = coeffs[0] * u + coeffs[1];
 
-  /* load the coefficients. */
-  for (k = 0; k < VEC_SIZE; k++)
-    for (j = 0; j < blender_degree + 1; j++)
-      c[j].f[k] = blender_coeffs[ind.i[k] * (blender_degree + 1) + j];
+  /* ... and the rest of them */
+  for (int k = 2; k <= kernel_grav_degree; k++) w = u * w + coeffs[k];
 
-  /* Init the iteration for Horner's scheme. */
-  w->v = (c[0].v * x->v) + c[1].v;
-  dw_dx->v = c[0].v;
-
-  /* And we're off! */
-  for (int k = 2; k <= blender_degree; k++) {
-    dw_dx->v = (dw_dx->v * x->v) + w->v;
-    w->v = (x->v * w->v) + c[k].v;
-  }
+  /* Return everything */
+  *W = w / (u * u * u);
 }
 
-#endif
-
-void gravity_kernel_dump(float r_max, int N);
-
 #endif  // SWIFT_KERNEL_GRAVITY_H
diff --git a/src/kernel_long_gravity.h b/src/kernel_long_gravity.h
new file mode 100644
index 0000000000000000000000000000000000000000..d247c7a461d4bd116f30ab106143f6c75e1b941e
--- /dev/null
+++ b/src/kernel_long_gravity.h
@@ -0,0 +1,50 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_KERNEL_LONG_GRAVITY_H
+#define SWIFT_KERNEL_LONG_GRAVITY_H
+
+#include <math.h>
+
+/* Includes. */
+#include "const.h"
+#include "inline.h"
+#include "vector.h"
+
+#define one_over_sqrt_pi ((float)(M_2_SQRTPI * 0.5))
+
+/**
+ * @brief Computes the long-range correction term for the FFT calculation.
+ *
+ * @param u The ratio of the distance to the FFT cell scale $u = x/A$.
+ * @param W (return) The value of the kernel function.
+ */
+__attribute__((always_inline)) INLINE static void kernel_long_grav_eval(
+    float u, float *const W) {
+
+  const float arg1 = u * 0.5f;
+  const float arg2 = u * one_over_sqrt_pi;
+  const float arg3 = -arg1 * arg1;
+
+  const float term1 = erfc(arg1);
+  const float term2 = arg2 * expf(arg3);
+
+  *W = term1 + term2;
+}
+
+#endif  // SWIFT_KERNEL_LONG_GRAVITY_H
diff --git a/src/multipole.c b/src/multipole.c
index fc4701fead68e64e8742730325931282f80e8934..f0d0c7084fd9bec366f2185cb24887da40591b17 100644
--- a/src/multipole.c
+++ b/src/multipole.c
@@ -1,6 +1,7 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *               2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -21,149 +22,203 @@
 #include "../config.h"
 
 /* Some standard headers. */
-#include <float.h>
-#include <limits.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-/* MPI headers. */
-#ifdef WITH_MPI
-#include <mpi.h>
-#endif
+#include <strings.h>
 
 /* This object's header. */
 #include "multipole.h"
 
 /**
- * @brief Merge two multipoles.
- *
- * @param ma The #multipole which will contain the merged result.
- * @param mb The other #multipole.
- */
+* @brief Reset the data of a #multipole.
+*
+* @param m The #multipole.
+*/
+void multipole_reset(struct multipole *m) {
 
-void multipole_merge(struct multipole *ma, struct multipole *mb) {
+  /* Just bzero the struct. */
+  bzero(m, sizeof(struct multipole));
+}
 
-#if multipole_order == 1
+/**
+* @brief Init a multipole from a set of particles.
+*
+* @param m The #multipole.
+* @param gparts The #gpart.
+* @param gcount The number of particles.
+*/
+void multipole_init(struct multipole *m, const struct gpart *gparts,
+                    int gcount) {
+
+#if const_gravity_multipole_order > 2
+#error "Multipoles of order >2 not yet implemented."
+#endif
 
-  /* Correct the position. */
-  float mma = ma->coeffs[0], mmb = mb->coeffs[0];
-  float w = 1.0f / (mma + mmb);
-  for (int k = 0; k < 3; k++) ma->x[k] = (ma->x[k] * mma + mb->x[k] * mmb) * w;
+  /* Zero everything */
+  multipole_reset(m);
 
-  /* Add the particle to the moments. */
-  ma->coeffs[0] = mma + mmb;
+  /* Temporary variables */
+  double mass = 0.0;
+  double com[3] = {0.0, 0.0, 0.0};
 
-#else
-#error( "Multipoles of order %i not yet implemented." , multipole_order )
+#if const_gravity_multipole_order >= 2
+  double I_xx = 0.0, I_yy = 0.0, I_zz = 0.0;
+  double I_xy = 0.0, I_xz = 0.0, I_yz = 0.0;
 #endif
-}
-
-/**
- * @brief Add a particle to the given multipole.
- *
- * @param m The #multipole.
- * @param p The #gpart.
- */
-
-void multipole_addpart(struct multipole *m, struct gpart *p) {
 
-#if multipole_order == 1
+  /* Collect the particle data. */
+  for (int k = 0; k < gcount; k++) {
+    const float w = gparts[k].mass;
 
-  /* Correct the position. */
-  float mm = m->coeffs[0], mp = p->mass;
-  float w = 1.0f / (mm + mp);
-  for (int k = 0; k < 3; k++) m->x[k] = (m->x[k] * mm + p->x[k] * mp) * w;
+    mass += w;
+    com[0] += gparts[k].x[0] * w;
+    com[1] += gparts[k].x[1] * w;
+    com[2] += gparts[k].x[2] * w;
+
+#if const_gravity_multipole_order >= 2
+    I_xx += gparts[k].x[0] * gparts[k].x[0] * w;
+    I_yy += gparts[k].x[1] * gparts[k].x[1] * w;
+    I_zz += gparts[k].x[2] * gparts[k].x[2] * w;
+    I_xy += gparts[k].x[0] * gparts[k].x[1] * w;
+    I_xz += gparts[k].x[0] * gparts[k].x[2] * w;
+    I_yz += gparts[k].x[1] * gparts[k].x[2] * w;
+#endif
+  }
 
-  /* Add the particle to the moments. */
-  m->coeffs[0] = mm + mp;
+  const double imass = 1.0 / mass;
 
-#else
-#error( "Multipoles of order %i not yet implemented." , multipole_order )
+  /* Store the data on the multipole. */
+  m->mass = mass;
+  m->CoM[0] = com[0] * imass;
+  m->CoM[1] = com[1] * imass;
+  m->CoM[2] = com[2] * imass;
+
+#if const_gravity_multipole_order >= 2
+  m->I_xx = I_xx - imass * com[0] * com[0];
+  m->I_yy = I_yy - imass * com[1] * com[1];
+  m->I_zz = I_zz - imass * com[2] * com[2];
+  m->I_xy = I_xy - imass * com[0] * com[1];
+  m->I_xz = I_xz - imass * com[0] * com[2];
+  m->I_yz = I_yz - imass * com[1] * com[2];
 #endif
 }
 
 /**
- * @brief Add a group of particles to the given multipole.
+ * @brief Add the second multipole to the first one.
  *
- * @param m The #multipole.
- * @param p The #gpart array.
- * @param N Number of parts to add.
+ * @param ma The #multipole which will contain the sum.
+ * @param mb The #multipole to add.
  */
 
-void multipole_addparts(struct multipole *m, struct gpart *p, int N) {
+void multipole_add(struct multipole *ma, const struct multipole *mb) {
 
-#if multipole_order == 1
-
-  /* Get the combined mass and positions. */
-  double xp[3] = {0.0, 0.0, 0.0};
-  float mp = 0.0f, w;
-  for (int k = 0; k < N; k++) {
-    w = p[k].mass;
-    mp += w;
-    xp[0] += p[k].x[0] * w;
-    xp[1] += p[k].x[1] * w;
-    xp[2] += p[k].x[2] * w;
-  }
+#if const_gravity_multipole_order > 2
+#error "Multipoles of order >2 not yet implemented."
+#endif
 
   /* Correct the position. */
-  float mm = m->coeffs[0];
-  w = 1.0f / (mm + mp);
-  for (int k = 0; k < 3; k++) m->x[k] = (m->x[k] * mm + xp[k]) * w;
-
-  /* Add the particle to the moments. */
-  m->coeffs[0] = mm + mp;
+  const double ma_mass = ma->mass;
+  const double mb_mass = mb->mass;
+  const double M_tot = ma_mass + mb_mass;
+  const double M_tot_inv = 1.0 / M_tot;
+
+  const double ma_CoM[3] = {ma->CoM[0], ma->CoM[1], ma->CoM[2]};
+  const double mb_CoM[3] = {mb->CoM[0], mb->CoM[1], mb->CoM[2]};
+
+#if const_gravity_multipole_order >= 2
+  const double ma_I_xx = (double)ma->I_xx + ma_mass * ma_CoM[0] * ma_CoM[0];
+  const double ma_I_yy = (double)ma->I_yy + ma_mass * ma_CoM[1] * ma_CoM[1];
+  const double ma_I_zz = (double)ma->I_zz + ma_mass * ma_CoM[2] * ma_CoM[2];
+  const double ma_I_xy = (double)ma->I_xy + ma_mass * ma_CoM[0] * ma_CoM[1];
+  const double ma_I_xz = (double)ma->I_xz + ma_mass * ma_CoM[0] * ma_CoM[2];
+  const double ma_I_yz = (double)ma->I_yz + ma_mass * ma_CoM[1] * ma_CoM[2];
+
+  const double mb_I_xx = (double)mb->I_xx + mb_mass * mb_CoM[0] * mb_CoM[0];
+  const double mb_I_yy = (double)mb->I_yy + mb_mass * mb_CoM[1] * mb_CoM[1];
+  const double mb_I_zz = (double)mb->I_zz + mb_mass * mb_CoM[2] * mb_CoM[2];
+  const double mb_I_xy = (double)mb->I_xy + mb_mass * mb_CoM[0] * mb_CoM[1];
+  const double mb_I_xz = (double)mb->I_xz + mb_mass * mb_CoM[0] * mb_CoM[2];
+  const double mb_I_yz = (double)mb->I_yz + mb_mass * mb_CoM[1] * mb_CoM[2];
+#endif
 
-#else
-#error( "Multipoles of order %i not yet implemented." , multipole_order )
+  /* New mass */
+  ma->mass = M_tot;
+
+  /* New CoM */
+  ma->CoM[0] = (ma_CoM[0] * ma_mass + mb_CoM[0] * mb_mass) * M_tot_inv;
+  ma->CoM[1] = (ma_CoM[1] * ma_mass + mb_CoM[1] * mb_mass) * M_tot_inv;
+  ma->CoM[2] = (ma_CoM[2] * ma_mass + mb_CoM[2] * mb_mass) * M_tot_inv;
+
+/* New quadrupole */
+#if const_gravity_multipole_order >= 2
+  ma->I_xx = (ma_I_xx + mb_I_xx) - M_tot * ma->CoM[0] * ma->CoM[0];
+  ma->I_yy = (ma_I_yy + mb_I_yy) - M_tot * ma->CoM[1] * ma->CoM[1];
+  ma->I_zz = (ma_I_zz + mb_I_zz) - M_tot * ma->CoM[2] * ma->CoM[2];
+  ma->I_xy = (ma_I_xy + mb_I_xy) - M_tot * ma->CoM[0] * ma->CoM[1];
+  ma->I_xz = (ma_I_xz + mb_I_xz) - M_tot * ma->CoM[0] * ma->CoM[2];
+  ma->I_yz = (ma_I_yz + mb_I_yz) - M_tot * ma->CoM[1] * ma->CoM[2];
 #endif
 }
 
 /**
- * @brief Init a multipole from a set of particles.
+ * @brief Add a particle to the given multipole.
  *
  * @param m The #multipole.
- * @param parts The #gpart.
- * @param N The number of particles.
+ * @param p The #gpart.
  */
 
-void multipole_init(struct multipole *m, struct gpart *parts, int N) {
+void multipole_addpart(struct multipole *m, struct gpart *p) {
 
-#if multipole_order == 1
+  /* #if const_gravity_multipole_order == 1 */
 
-  float mass = 0.0f, w;
-  double x[3] = {0.0, 0.0, 0.0};
-  int k;
+  /*   /\* Correct the position. *\/ */
+  /*   float mm = m->coeffs[0], mp = p->mass; */
+  /*   float w = 1.0f / (mm + mp); */
+  /*   for (int k = 0; k < 3; k++) m->x[k] = (m->x[k] * mm + p->x[k] * mp) * w;
+   */
 
-  /* Collect the particle data. */
-  for (k = 0; k < N; k++) {
-    w = parts[k].mass;
-    mass += w;
-    x[0] += parts[k].x[0] * w;
-    x[1] += parts[k].x[1] * w;
-    x[2] += parts[k].x[2] * w;
-  }
-
-  /* Store the data on the multipole. */
-  m->coeffs[0] = mass;
-  m->x[0] = x[0] / mass;
-  m->x[1] = x[1] / mass;
-  m->x[2] = x[2] / mass;
+  /*   /\* Add the particle to the moments. *\/ */
+  /*   m->coeffs[0] = mm + mp; */
 
-#else
-#error( "Multipoles of order %i not yet implemented." , multipole_order )
-#endif
+  /* #else */
+  /* #error( "Multipoles of order %i not yet implemented." ,
+   * const_gravity_multipole_order )
+   */
+  /* #endif */
 }
 
 /**
- * @brief Reset the data of a #multipole.
+ * @brief Add a group of particles to the given multipole.
  *
  * @param m The #multipole.
+ * @param p The #gpart array.
+ * @param N Number of parts to add.
  */
 
-void multipole_reset(struct multipole *m) {
+void multipole_addparts(struct multipole *m, struct gpart *p, int N) {
 
-  /* Just bzero the struct. */
-  bzero(m, sizeof(struct multipole));
+  /* #if const_gravity_multipole_order == 1 */
+
+  /*   /\* Get the combined mass and positions. *\/ */
+  /*   double xp[3] = {0.0, 0.0, 0.0}; */
+  /*   float mp = 0.0f, w; */
+  /*   for (int k = 0; k < N; k++) { */
+  /*     w = p[k].mass; */
+  /*     mp += w; */
+  /*     xp[0] += p[k].x[0] * w; */
+  /*     xp[1] += p[k].x[1] * w; */
+  /*     xp[2] += p[k].x[2] * w; */
+  /*   } */
+
+  /*   /\* Correct the position. *\/ */
+  /*   float mm = m->coeffs[0]; */
+  /*   w = 1.0f / (mm + mp); */
+  /*   for (int k = 0; k < 3; k++) m->x[k] = (m->x[k] * mm + xp[k]) * w; */
+
+  /*   /\* Add the particle to the moments. *\/ */
+  /*   m->coeffs[0] = mm + mp; */
+
+  /* #else */
+  /* #error( "Multipoles of order %i not yet implemented." ,
+   * const_gravity_multipole_order )
+   */
+  /* #endif */
 }
diff --git a/src/multipole.h b/src/multipole.h
index 85ba44d3ce95d958b721d435ccd26b72e30a79c1..dc1f914dd73969bc63f3beffca7f34d0f889edb6 100644
--- a/src/multipole.h
+++ b/src/multipole.h
@@ -1,6 +1,7 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *               2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -28,31 +29,33 @@
 #include "kernel_gravity.h"
 #include "part.h"
 
-/* Some constants. */
-#define multipole_order 1
-
 /* Multipole struct. */
 struct multipole {
 
   /* Multipole location. */
-  double x[3];
+  double CoM[3];
 
-  /* Acceleration on this multipole. */
-  float a[3];
+  /* Multipole mass */
+  float mass;
 
-  /* Multipole coefficients. */
-  float coeffs[multipole_order * multipole_order];
+#if const_gravity_multipole_order >= 2
+  /* Quadrupole terms */
+  float I_xx, I_yy, I_zz;
+  float I_xy, I_xz, I_yz;
+#endif
 };
 
 /* Multipole function prototypes. */
-static void multipole_iact_mm(struct multipole *ma, struct multipole *mb,
-                              double *shift);
-void multipole_merge(struct multipole *ma, struct multipole *mb);
-void multipole_addpart(struct multipole *m, struct gpart *p);
-void multipole_addparts(struct multipole *m, struct gpart *p, int N);
-void multipole_init(struct multipole *m, struct gpart *parts, int N);
+void multipole_add(struct multipole *m_sum, const struct multipole *m_term);
+void multipole_init(struct multipole *m, const struct gpart *gparts,
+                    int gcount);
 void multipole_reset(struct multipole *m);
 
+/* static void multipole_iact_mm(struct multipole *ma, struct multipole *mb, */
+/*                               double *shift); */
+/* void multipole_addpart(struct multipole *m, struct gpart *p); */
+/* void multipole_addparts(struct multipole *m, struct gpart *p, int N); */
+
 /**
  * @brief Compute the pairwise interaction between two multipoles.
  *
@@ -60,39 +63,39 @@ void multipole_reset(struct multipole *m);
  * @param mb The second #multipole.
  * @param shift The periodicity correction.
  */
-
 __attribute__((always_inline)) INLINE static void multipole_iact_mm(
     struct multipole *ma, struct multipole *mb, double *shift) {
-
-  float dx[3], ir, r, r2 = 0.0f, acc;
-  int k;
-
-  /* Compute the multipole distance. */
-  for (k = 0; k < 3; k++) {
-    dx[k] = ma->x[k] - mb->x[k] - shift[k];
-    r2 += dx[k] * dx[k];
-  }
-
-  /* Compute the normalized distance vector. */
-  ir = 1.0f / sqrtf(r2);
-  r = r2 * ir;
-
-  /* Evaluate the gravity kernel. */
-  kernel_grav_eval(r, &acc);
-
-  /* Scale the acceleration. */
-  acc *= const_G * ir * ir * ir;
-
-/* Compute the forces on both multipoles. */
-#if multipole_order == 1
-  float mma = ma->coeffs[0], mmb = mb->coeffs[0];
-  for (k = 0; k < 3; k++) {
-    ma->a[k] -= dx[k] * acc * mmb;
-    mb->a[k] += dx[k] * acc * mma;
-  }
-#else
-#error( "Multipoles of order %i not yet implemented." , multipole_order )
-#endif
+  /*   float dx[3], ir, r, r2 = 0.0f, acc; */
+  /*   int k; */
+
+  /*   /\* Compute the multipole distance. *\/ */
+  /*   for (k = 0; k < 3; k++) { */
+  /*     dx[k] = ma->x[k] - mb->x[k] - shift[k]; */
+  /*     r2 += dx[k] * dx[k]; */
+  /*   } */
+
+  /*   /\* Compute the normalized distance vector. *\/ */
+  /*   ir = 1.0f / sqrtf(r2); */
+  /*   r = r2 * ir; */
+
+  /*   /\* Evaluate the gravity kernel. *\/ */
+  /*   kernel_grav_eval(r, &acc); */
+
+  /*   /\* Scale the acceleration. *\/ */
+  /*   acc *= const_G * ir * ir * ir; */
+
+  /* /\* Compute the forces on both multipoles. *\/ */
+  /* #if const_gravity_multipole_order == 1 */
+  /*   float mma = ma->coeffs[0], mmb = mb->coeffs[0]; */
+  /*   for (k = 0; k < 3; k++) { */
+  /*     ma->a[k] -= dx[k] * acc * mmb; */
+  /*     mb->a[k] += dx[k] * acc * mma; */
+  /*   } */
+  /* #else */
+  /* #error( "Multipoles of order %i not yet implemented." ,
+   * const_gravity_multipole_order )
+   */
+  /* #endif */
 }
 
 /**
@@ -102,35 +105,36 @@ __attribute__((always_inline)) INLINE static void multipole_iact_mm(
  * @param p The #gpart.
  * @param shift The periodicity correction.
  */
-
 __attribute__((always_inline)) INLINE static void multipole_iact_mp(
     struct multipole *m, struct gpart *p, double *shift) {
 
-  float dx[3], ir, r, r2 = 0.0f, acc;
-  int k;
-
-  /* Compute the multipole distance. */
-  for (k = 0; k < 3; k++) {
-    dx[k] = m->x[k] - p->x[k] - shift[k];
-    r2 += dx[k] * dx[k];
-  }
-
-  /* Compute the normalized distance vector. */
-  ir = 1.0f / sqrtf(r2);
-  r = r2 * ir;
-
-  /* Evaluate the gravity kernel. */
-  kernel_grav_eval(r, &acc);
-
-  /* Scale the acceleration. */
-  acc *= const_G * ir * ir * ir * m->coeffs[0];
-
-/* Compute the forces on both multipoles. */
-#if multipole_order == 1
-  for (k = 0; k < 3; k++) p->a_grav[k] += dx[k] * acc;
-#else
-#error( "Multipoles of order %i not yet implemented." , multipole_order )
-#endif
+  /*   float dx[3], ir, r, r2 = 0.0f, acc; */
+  /*   int k; */
+
+  /*   /\* Compute the multipole distance. *\/ */
+  /*   for (k = 0; k < 3; k++) { */
+  /*     dx[k] = m->x[k] - p->x[k] - shift[k]; */
+  /*     r2 += dx[k] * dx[k]; */
+  /*   } */
+
+  /*   /\* Compute the normalized distance vector. *\/ */
+  /*   ir = 1.0f / sqrtf(r2); */
+  /*   r = r2 * ir; */
+
+  /*   /\* Evaluate the gravity kernel. *\/ */
+  /*   kernel_grav_eval(r, &acc); */
+
+  /*   /\* Scale the acceleration. *\/ */
+  /*   acc *= const_G * ir * ir * ir * m->coeffs[0]; */
+
+  /* /\* Compute the forces on both multipoles. *\/ */
+  /* #if const_gravity_multipole_order == 1 */
+  /*   for (k = 0; k < 3; k++) p->a_grav[k] += dx[k] * acc; */
+  /* #else */
+  /* #error( "Multipoles of order %i not yet implemented." ,
+   * const_gravity_multipole_order )
+   */
+  /* #endif */
 }
 
 #endif /* SWIFT_MULTIPOLE_H */
diff --git a/src/potentials.c b/src/potentials.c
index 6818d8a234a437ca0164c5c991b92d905fd6041e..5cbe05e008b7de17310b1e738e032af90684c25e 100644
--- a/src/potentials.c
+++ b/src/potentials.c
@@ -47,7 +47,7 @@ void potential_init(const struct swift_params* parameter_file,
   potential->point_mass.mass =
       parser_get_param_double(parameter_file, "PointMass:mass");
   potential->point_mass.timestep_mult =
-      parser_get_param_double(parameter_file, "PointMass:timestep_mult");
+      parser_get_param_float(parameter_file, "PointMass:timestep_mult");
 
 #endif /* EXTERNAL_POTENTIAL_POINTMASS */
 
@@ -61,7 +61,7 @@ void potential_init(const struct swift_params* parameter_file,
       parser_get_param_double(parameter_file, "IsothermalPotential:position_z");
   potential->isothermal_potential.vrot =
       parser_get_param_double(parameter_file, "IsothermalPotential:vrot");
-  potential->isothermal_potential.timestep_mult = parser_get_param_double(
+  potential->isothermal_potential.timestep_mult = parser_get_param_float(
       parameter_file, "IsothermalPotential:timestep_mult");
 
 #endif /* EXTERNAL_POTENTIAL_ISOTHERMALPOTENTIAL */
diff --git a/src/potentials.h b/src/potentials.h
index 403acf12ad555740b6da803dbc1f2cde2223e266..5373cc1f3f55b0c6d9876cbe6348fdcefbe242aa 100644
--- a/src/potentials.h
+++ b/src/potentials.h
@@ -42,7 +42,7 @@ struct external_potential {
   struct {
     double x, y, z;
     double mass;
-    double timestep_mult;
+    float timestep_mult;
   } point_mass;
 #endif
 
@@ -50,7 +50,7 @@ struct external_potential {
   struct {
     double x, y, z;
     double vrot;
-    double timestep_mult;
+    float timestep_mult;
   } isothermal_potential;
 #endif
 };
@@ -95,6 +95,8 @@ external_gravity_isothermalpotential_timestep(
  * @brief Computes the gravitational acceleration of a particle due to a point
  * mass
  *
+ * Note that the accelerations are multiplied by Newton's G constant later on.
+ *
  * @param potential The #external_potential used in the run.
  * @param phys_const The physical constants in internal units.
  * @param g Pointer to the g-particle data.
@@ -104,15 +106,19 @@ external_gravity_isothermalpotential(const struct external_potential* potential,
                                      const struct phys_const* const phys_const,
                                      struct gpart* g) {
 
+  const float G_newton = phys_const->const_newton_G;
+
   const float dx = g->x[0] - potential->isothermal_potential.x;
   const float dy = g->x[1] - potential->isothermal_potential.y;
   const float dz = g->x[2] - potential->isothermal_potential.z;
   const float rinv2 = 1.f / (dx * dx + dy * dy + dz * dz);
 
   const double vrot = potential->isothermal_potential.vrot;
-  g->a_grav[0] += -vrot * vrot * rinv2 * dx;
-  g->a_grav[1] += -vrot * vrot * rinv2 * dy;
-  g->a_grav[2] += -vrot * vrot * rinv2 * dz;
+  const double term = -vrot * vrot * rinv2 / G_newton;
+
+  g->a_grav[0] += term * dx;
+  g->a_grav[1] += term * dy;
+  g->a_grav[2] += term * dz;
   // error(" %f %f %f %f", vrot, rinv2, dx, g->a_grav[0]);
 }
 
@@ -158,6 +164,8 @@ external_gravity_pointmass_timestep(const struct external_potential* potential,
  * @brief Computes the gravitational acceleration of a particle due to a point
  * mass
  *
+ * Note that the accelerations are multiplied by Newton's G constant later on.
+ *
  * @param potential The proerties of the external potential.
  * @param phys_const The physical constants in internal units.
  * @param g Pointer to the g-particle data.
@@ -166,18 +174,15 @@ __attribute__((always_inline)) INLINE static void external_gravity_pointmass(
     const struct external_potential* potential,
     const struct phys_const* const phys_const, struct gpart* g) {
 
-  const float G_newton = phys_const->const_newton_G;
   const float dx = g->x[0] - potential->point_mass.x;
   const float dy = g->x[1] - potential->point_mass.y;
   const float dz = g->x[2] - potential->point_mass.z;
   const float rinv = 1.f / sqrtf(dx * dx + dy * dy + dz * dz);
+  const float rinv3 = rinv * rinv * rinv;
 
-  g->a_grav[0] +=
-      -G_newton * potential->point_mass.mass * dx * rinv * rinv * rinv;
-  g->a_grav[1] +=
-      -G_newton * potential->point_mass.mass * dy * rinv * rinv * rinv;
-  g->a_grav[2] +=
-      -G_newton * potential->point_mass.mass * dz * rinv * rinv * rinv;
+  g->a_grav[0] += -potential->point_mass.mass * dx * rinv3;
+  g->a_grav[1] += -potential->point_mass.mass * dy * rinv3;
+  g->a_grav[2] += -potential->point_mass.mass * dz * rinv3;
 }
 
 #endif /* EXTERNAL_POTENTIAL_POINTMASS */
diff --git a/src/runner.c b/src/runner.c
index 01762dda28aaa8f1fa3e240824761d90c9f8f702..6e99d8cece2cf086b1209ac2f8314301b9779e20 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -87,6 +87,10 @@ const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
 #define FUNCTION force
 #include "runner_doiact.h"
 
+/* Import the gravity loop functions. */
+#include "runner_doiact_fft.h"
+#include "runner_doiact_grav.h"
+
 /**
  * @brief Calculate gravity acceleration from external potential
  *
@@ -115,7 +119,7 @@ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) {
   OUT;
 #endif
 
-  /* Loop over the parts in this cell. */
+  /* Loop over the gparts in this cell. */
   for (int i = 0; i < gcount; i++) {
 
     /* Get a direct pointer on the part. */
@@ -127,6 +131,7 @@ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) {
       external_gravity(potential, constants, g);
     }
   }
+
   if (timer) TIMER_TOC(timer_dograv_external);
 }
 
@@ -424,11 +429,9 @@ void runner_do_init(struct runner *r, struct cell *c, int timer) {
  */
 void runner_do_ghost(struct runner *r, struct cell *c) {
 
-  struct part *p, *parts = c->parts;
-  struct xpart *xp, *xparts = c->xparts;
-  struct cell *finger;
+  struct part *restrict parts = c->parts;
+  struct xpart *restrict xparts = c->xparts;
   int redo, count = c->count;
-  float h_corr;
   const int ti_current = r->e->ti_current;
   const double timeBase = r->e->timeBase;
   const float target_wcount = r->e->hydro_properties->target_neighbours;
@@ -465,8 +468,8 @@ void runner_do_ghost(struct runner *r, struct cell *c) {
     for (int i = 0; i < count; i++) {
 
       /* Get a direct pointer on the part. */
-      p = &parts[pid[i]];
-      xp = &xparts[pid[i]];
+      struct part *restrict p = &parts[pid[i]];
+      struct xpart *restrict xp = &xparts[pid[i]];
 
       /* Is this part within the timestep? */
       if (p->ti_end <= ti_current) {
@@ -474,6 +477,8 @@ void runner_do_ghost(struct runner *r, struct cell *c) {
         /* Finish the density calculation */
         hydro_end_density(p, ti_current);
 
+        float h_corr = 0.f;
+
         /* If no derivative, double the smoothing length. */
         if (p->density.wcount_dh == 0.0f) h_corr = p->h;
 
@@ -526,7 +531,7 @@ void runner_do_ghost(struct runner *r, struct cell *c) {
     if (count > 0) {
 
       /* Climb up the cell hierarchy. */
-      for (finger = c; finger != NULL; finger = finger->parent) {
+      for (struct cell *finger = c; finger != NULL; finger = finger->parent) {
 
         /* Run through this cell's density interactions. */
         for (struct link *l = finger->density; l != NULL; l = l->next) {
@@ -745,6 +750,7 @@ void runner_do_kick_fixdt(struct runner *r, struct cell *c, int timer) {
   struct part *restrict parts = c->parts;
   struct xpart *restrict xparts = c->xparts;
   struct gpart *restrict gparts = c->gparts;
+  const double const_G = r->e->physical_constants->const_newton_G;
 
   int updated = 0, g_updated = 0;
   int ti_end_min = max_nr_timesteps, ti_end_max = 0;
@@ -771,7 +777,7 @@ void runner_do_kick_fixdt(struct runner *r, struct cell *c, int timer) {
       if (gp->id_or_neg_offset > 0) {
 
         /* First, finish the force calculation */
-        gravity_end_force(gp);
+        gravity_end_force(gp, const_G);
 
         /* Kick the g-particle forward */
         kick_gpart(gp, new_dti, timeBase);
@@ -796,7 +802,7 @@ void runner_do_kick_fixdt(struct runner *r, struct cell *c, int timer) {
 
       /* First, finish the force loop */
       hydro_end_force(p);
-      if (p->gpart != NULL) gravity_end_force(p->gpart);
+      if (p->gpart != NULL) gravity_end_force(p->gpart, const_G);
 
       /* Kick the particle forward */
       kick_part(p, xp, new_dti, timeBase);
@@ -857,6 +863,7 @@ void runner_do_kick(struct runner *r, struct cell *c, int timer) {
   struct part *restrict parts = c->parts;
   struct xpart *restrict xparts = c->xparts;
   struct gpart *restrict gparts = c->gparts;
+  const double const_G = r->e->physical_constants->const_newton_G;
 
   int updated = 0, g_updated = 0;
   int ti_end_min = max_nr_timesteps, ti_end_max = 0;
@@ -882,7 +889,7 @@ void runner_do_kick(struct runner *r, struct cell *c, int timer) {
         if (gp->ti_end <= ti_current) {
 
           /* First, finish the force calculation */
-          gravity_end_force(gp);
+          gravity_end_force(gp, const_G);
 
           /* Compute the next timestep */
           const int new_dti = get_gpart_timestep(gp, e);
@@ -914,7 +921,7 @@ void runner_do_kick(struct runner *r, struct cell *c, int timer) {
 
         /* First, finish the force loop */
         hydro_end_force(p);
-        if (p->gpart != NULL) gravity_end_force(p->gpart);
+        if (p->gpart != NULL) gravity_end_force(p->gpart, const_G);
 
         /* Compute the next timestep (hydro condition) */
         const int new_dti = get_part_timestep(p, xp, e);
@@ -1071,6 +1078,8 @@ void *runner_main(void *data) {
             runner_doself1_density(r, ci);
           else if (t->subtype == task_subtype_force)
             runner_doself2_force(r, ci);
+          else if (t->subtype == task_subtype_grav)
+            runner_doself_grav(r, ci, 1);
           else
             error("Unknown task subtype.");
           break;
@@ -1079,6 +1088,8 @@ void *runner_main(void *data) {
             runner_dopair1_density(r, ci, cj);
           else if (t->subtype == task_subtype_force)
             runner_dopair2_force(r, ci, cj);
+          else if (t->subtype == task_subtype_grav)
+            runner_dopair_grav(r, ci, cj, 1);
           else
             error("Unknown task subtype.");
           break;
@@ -1090,6 +1101,8 @@ void *runner_main(void *data) {
             runner_dosub_self1_density(r, ci, 1);
           else if (t->subtype == task_subtype_force)
             runner_dosub_self2_force(r, ci, 1);
+          else if (t->subtype == task_subtype_grav)
+            runner_dosub_grav(r, ci, cj, 1);
           else
             error("Unknown task subtype.");
           break;
@@ -1098,6 +1111,8 @@ void *runner_main(void *data) {
             runner_dosub_pair1_density(r, ci, cj, t->flags, 1);
           else if (t->subtype == task_subtype_force)
             runner_dosub_pair2_force(r, ci, cj, t->flags, 1);
+          else if (t->subtype == task_subtype_grav)
+            runner_dosub_grav(r, ci, cj, 1);
           else
             error("Unknown task subtype.");
           break;
@@ -1129,6 +1144,17 @@ void *runner_main(void *data) {
             runner_do_recv_cell(r, ci, 1);
           }
           break;
+        case task_type_grav_mm:
+          runner_do_grav_mm(r, t->ci, 1);
+          break;
+        case task_type_grav_up:
+          runner_do_grav_up(r, t->ci);
+          break;
+        case task_type_grav_gather_m:
+          break;
+        case task_type_grav_fft:
+          runner_do_grav_fft(r);
+          break;
         case task_type_grav_external:
           runner_do_grav_external(r, t->ci, 1);
           break;
diff --git a/src/runner_doiact_fft.c b/src/runner_doiact_fft.c
new file mode 100644
index 0000000000000000000000000000000000000000..076ec2578361127266c637cc5ac224609b702c66
--- /dev/null
+++ b/src/runner_doiact_fft.c
@@ -0,0 +1,36 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
+#include <pthread.h>
+
+#ifdef HAVE_FFTW
+#include <fftw3.h>
+#endif
+
+/* This object's header. */
+#include "runner_doiact_fft.h"
+
+/* Local includes. */
+#include "runner.h"
+
+void runner_do_grav_fft(struct runner *r) {}
diff --git a/src/runner_doiact_fft.h b/src/runner_doiact_fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..263662383fb465dcf945e55494a569289b009ff9
--- /dev/null
+++ b/src/runner_doiact_fft.h
@@ -0,0 +1,26 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_RUNNER_DOIACT_FFT_H
+#define SWIFT_RUNNER_DOIACT_FFT_H
+
+struct runner;
+
+void runner_do_grav_fft(struct runner *r);
+
+#endif /* SWIFT_RUNNER_DOIACT_FFT_H */
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index 401e48ffbb92218aecafd21a3f70c44c9a7b0a1b..a220ad1794d23999ff16752e797a499071fa2e65 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -1,6 +1,7 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *               2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -19,575 +20,502 @@
 #ifndef SWIFT_RUNNER_DOIACT_GRAV_H
 #define SWIFT_RUNNER_DOIACT_GRAV_H
 
+/* Includes. */
+#include "cell.h"
+#include "gravity.h"
+#include "part.h"
+
+#define ICHECK -1000
+
 /**
- * @brief Compute the sorted gravity interactions between a cell pair.
+ * @brief Compute the recursive upward sweep, i.e. construct the
+ *        multipoles in a cell hierarchy.
  *
  * @param r The #runner.
- * @param ci The first #cell.
- * @param cj The second #cell.
+ * @param c The top-level #cell.
  */
+void runner_do_grav_up(struct runner *r, struct cell *c) {
+
+  if (c->split) { /* Regular node */
+
+    /* Recurse. */
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) runner_do_grav_up(r, c->progeny[k]);
+
+    /* Collect the multipoles from the progeny. */
+    multipole_reset(&c->multipole);
+    for (int k = 0; k < 8; k++) {
+      if (c->progeny[k] != NULL)
+        multipole_add(&c->multipole, &c->progeny[k]->multipole);
+    }
 
-void runner_dopair_grav_new(struct runner *r, struct cell *ci,
-                            struct cell *cj) {
-
-  struct engine *restrict e = r->e;
-  int pid, pjd, k, sid;
-  double rshift, shift[3] = {0.0, 0.0, 0.0}, nshift[3];
-  struct entry *restrict sort_i, *restrict sort_j;
-  struct gpart *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
-  double pix[3];
-  float dx[3], r2, h_max, di, dj;
-  int count_i, count_j, cnj, cnj_new;
+  } else { /* Leaf node. */
+    /* Just construct the multipole from the gparts. */
+    multipole_init(&c->multipole, c->gparts, c->gcount);
+  }
+}
+
+/**
+ * @brief Computes the interaction of all the particles in a cell with the
+ * multipole of another cell.
+ *
+ * @param r The #runner.
+ * @param ci The #cell with particles to interct.
+ * @param cj The #cell with the multipole.
+ */
+__attribute__((always_inline)) INLINE static void runner_dopair_grav_pm(
+    const struct runner *r, const struct cell *restrict ci,
+    const struct cell *restrict cj) {
+
+  const struct engine *e = r->e;
+  const int gcount = ci->gcount;
+  struct gpart *restrict gparts = ci->gparts;
+  const struct multipole multi = cj->multipole;
   const int ti_current = e->ti_current;
-  struct multipole m;
-#ifdef WITH_VECTORIZATION
-  int icount = 0;
-  float r2q[VEC_SIZE] __attribute__((aligned(16)));
-  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
-  struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
+  const float rlr_inv = 1. / (const_gravity_a_smooth * ci->super->width[0]);
+
+  TIMER_TIC;
+
+#ifdef SWIFT_DEBUG_CHECKS
+  if (gcount == 0) error("Empty cell!");  // MATTHIEU sanity check
+
+  if (multi.mass == 0.0)  // MATTHIEU sanity check
+    error("Multipole does not seem to have been set.");
 #endif
-  TIMER_TIC
 
   /* Anything to do here? */
-  if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
+  if (ci->ti_end_min > ti_current) return;
 
-  /* Get the sort ID. */
-  sid = space_getsid(e->s, &ci, &cj, shift);
-
-  /* Make sure the cells are sorted. */
-  // runner_do_gsort(r, ci, (1 << sid), 0);
-  // runner_do_gsort(r, cj, (1 << sid), 0);
-
-  /* Have the cells been sorted? */
-  if (!(ci->gsorted & (1 << sid)) || !(cj->gsorted & (1 << sid)))
-    error("Trying to interact unsorted cells.");
-
-  /* Get the cutoff shift. */
-  for (rshift = 0.0, k = 0; k < 3; k++)
-    rshift += shift[k] * runner_shift[sid][k];
-
-  /* Pick-out the sorted lists. */
-  sort_i = &ci->gsort[sid * (ci->count + 1)];
-  sort_j = &cj->gsort[sid * (cj->count + 1)];
-
-  /* Get some other useful values. */
-  h_max = sqrtf(ci->width[0] * ci->width[0] + ci->width[1] * ci->width[1] +
-                ci->width[2] * ci->width[2]) *
-          const_theta_max;
-  count_i = ci->gcount;
-  count_j = cj->gcount;
-  parts_i = ci->gparts;
-  parts_j = cj->gparts;
-  cnj = count_j;
-  multipole_reset(&m);
-  nshift[0] = -shift[0];
-  nshift[1] = -shift[1];
-  nshift[2] = -shift[2];
-
-  /* Loop over the parts in ci. */
-  for (pid = count_i - 1; pid >= 0; pid--) {
+#if ICHECK > 0
+  for (int pid = 0; pid < gcount; pid++) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts_i[sort_i[pid].i];
-    if (pi->ti_end > ti_current) continue;
-    di = sort_i[pid].d + h_max - rshift;
-
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    struct gpart *restrict gp = &gparts[pid];
 
-    /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < cnj && sort_j[pjd].d < di; pjd++) {
+    if (gp->id_or_neg_offset == ICHECK)
+      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
+              gp->id_or_neg_offset, cj->loc[0], cj->loc[1], cj->loc[2],
+              cj->width[0], cj->gcount);
+  }
+#endif
 
-      /* Get a pointer to the jth particle. */
-      pj = &parts_j[sort_j[pjd].i];
+  /* Loop over every particle in leaf. */
+  for (int pid = 0; pid < gcount; pid++) {
 
-      /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
-        dx[k] = pix[k] - pj->x[k];
-        r2 += dx[k] * dx[k];
-      }
+    /* Get a hold of the ith part in ci. */
+    struct gpart *restrict gp = &gparts[pid];
 
-#ifndef WITH_VECTORIZATION
+    if (gp->ti_end > ti_current) continue;
 
-      // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 )
-      //     message( "interacting particles pi=%lli and pj=%lli with r=%.3e in
-      // cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long
-      // long int)ci) / sizeof(struct cell) , ((long long int)cj) /
-      // sizeof(struct cell) );
+    /* Compute the pairwise distance. */
+    const float dx[3] = {multi.CoM[0] - gp->x[0],   // x
+                         multi.CoM[1] - gp->x[1],   // y
+                         multi.CoM[2] - gp->x[2]};  // z
+    const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
 
-      runner_iact_grav(r2, dx, pi, pj);
+    /* Interact !*/
+    runner_iact_grav_pm(rlr_inv, r2, dx, gp, &multi);
+  }
 
-#else
+  TIMER_TOC(timer_dopair_grav_pm);
+}
 
-      /* Add this interaction to the queue. */
-      r2q[icount] = r2;
-      dxq[3 * icount + 0] = dx[0];
-      dxq[3 * icount + 1] = dx[1];
-      dxq[3 * icount + 2] = dx[2];
-      piq[icount] = pi;
-      pjq[icount] = pj;
-      icount += 1;
+/**
+ * @brief Computes the interaction of all the particles in a cell with all the
+ * particles of another cell.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The other #cell.
+ *
+ * @todo Use a local cache for the particles.
+ */
+__attribute__((always_inline)) INLINE static void runner_dopair_grav_pp(
+    struct runner *r, struct cell *ci, struct cell *cj) {
+
+  const struct engine *e = r->e;
+  const int gcount_i = ci->gcount;
+  const int gcount_j = cj->gcount;
+  struct gpart *restrict gparts_i = ci->gparts;
+  struct gpart *restrict gparts_j = cj->gparts;
+  const int ti_current = e->ti_current;
+  const float rlr_inv = 1. / (const_gravity_a_smooth * ci->super->width[0]);
 
-      /* Flush? */
-      if (icount == VEC_SIZE) {
-        runner_iact_vec_grav(r2q, dxq, piq, pjq);
-        icount = 0;
-      }
+  TIMER_TIC;
 
+#ifdef SWIFT_DEBUG_CHECKS
+  if (ci->width[0] != cj->width[0])  // MATTHIEU sanity check
+    error("Non matching cell sizes !! h_i=%f h_j=%f", ci->width[0],
+          cj->width[0]);
 #endif
 
-    } /* loop over the parts in cj. */
+  /* Anything to do here? */
+  if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
+
+#if ICHECK > 0
+  for (int pid = 0; pid < gcount_i; pid++) {
 
-    /* Set the new limit. */
-    cnj_new = pjd;
+    /* Get a hold of the ith part in ci. */
+    struct gpart *restrict gp = &gparts_i[pid];
 
-    /* Add trailing parts to the multipole. */
-    for (pjd = cnj_new; pjd < cnj; pjd++) {
+    if (gp->id_or_neg_offset == ICHECK)
+      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
+              gp->id_or_neg_offset, cj->loc[0], cj->loc[1], cj->loc[2],
+              cj->width[0], cj->gcount);
+  }
 
-      /* Add the part to the multipole. */
-      multipole_addpart(&m, &parts_j[sort_j[pjd].i]);
+  for (int pid = 0; pid < gcount_j; pid++) {
 
-    } /* add trailing parts to the multipole. */
+    /* Get a hold of the ith part in ci. */
+    struct gpart *restrict gp = &gparts_j[pid];
 
-    /* Set the new cnj. */
-    cnj = cnj_new;
+    if (gp->id_or_neg_offset == ICHECK)
+      message("id=%lld loc=[ %f %f %f ] size= %f count=%d",
+              gp->id_or_neg_offset, ci->loc[0], ci->loc[1], ci->loc[2],
+              ci->width[0], ci->gcount);
+  }
+#endif
 
-    /* Interact the ith particle with the multipole. */
-    multipole_iact_mp(&m, pi, nshift);
+  /* Loop over all particles in ci... */
+  for (int pid = 0; pid < gcount_i; pid++) {
 
-  } /* loop over the parts in ci. */
+    /* Get a hold of the ith part in ci. */
+    struct gpart *restrict gpi = &gparts_i[pid];
 
-#ifdef WITH_VECTORIZATION
-  /* Pick up any leftovers. */
-  if (icount > 0)
-    for (k = 0; k < icount; k++)
-      runner_iact_grav(r2q[k], &dxq[3 * k], piq[k], pjq[k]);
-#endif
+    /* Loop over every particle in the other cell. */
+    for (int pjd = 0; pjd < gcount_j; pjd++) {
 
-  /* Re-set the multipole. */
-  multipole_reset(&m);
+      /* Get a hold of the jth part in cj. */
+      struct gpart *restrict gpj = &gparts_j[pjd];
 
-  /* Loop over the parts in cj and interact with the multipole in ci. */
-  for (pid = count_i - 1, pjd = 0; pjd < count_j; pjd++) {
+      /* Compute the pairwise distance. */
+      float dx[3] = {gpi->x[0] - gpj->x[0],   // x
+                     gpi->x[1] - gpj->x[1],   // y
+                     gpi->x[2] - gpj->x[2]};  // z
+      const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
 
-    /* Get the position of pj along the axis. */
-    dj = sort_j[pjd].d - h_max + rshift;
+      /* Interact ! */
+      if (gpi->ti_end <= ti_current && gpj->ti_end <= ti_current) {
 
-    /* Add any left-over parts in cell_i to the multipole. */
-    while (pid >= 0 && sort_i[pid].d < dj) {
+        runner_iact_grav_pp(rlr_inv, r2, dx, gpi, gpj);
 
-      /* Add this particle to the multipole. */
-      multipole_addpart(&m, &parts_i[sort_i[pid].i]);
+      } else {
 
-      /* Decrease pid. */
-      pid -= 1;
-    }
+        if (gpi->ti_end <= ti_current) {
 
-    /* Interact pj with the multipole. */
-    multipole_iact_mp(&m, &parts_j[sort_j[pjd].i], shift);
+          runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpi, gpj);
 
-  } /* loop over the parts in cj and interact with the multipole. */
+        } else if (gpj->ti_end <= ti_current) {
 
-  TIMER_TOC(TIMER_DOPAIR);
+          dx[0] = -dx[0];
+          dx[1] = -dx[1];
+          dx[2] = -dx[2];
+          runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpj, gpi);
+        }
+      }
+    }
+  }
+
+  TIMER_TOC(timer_dopair_grav_pp);
 }
 
 /**
- * @brief Compute the recursive upward sweep, i.e. construct the
- *        multipoles in a cell hierarchy.
+ * @brief Computes the interaction of all the particles in a cell directly
  *
  * @param r The #runner.
- * @param c The top-level #cell.
+ * @param c The #cell.
+ *
+ * @todo Use a local cache for the particles.
  */
+__attribute__((always_inline)) INLINE static void runner_doself_grav_pp(
+    struct runner *r, struct cell *c) {
+
+  const struct engine *e = r->e;
+  const int gcount = c->gcount;
+  struct gpart *restrict gparts = c->gparts;
+  const int ti_current = e->ti_current;
+  const float rlr_inv = 1. / (const_gravity_a_smooth * c->super->width[0]);
 
-void runner_dograv_up(struct runner *r, struct cell *c) {
+  TIMER_TIC;
 
-  /* Re-set this cell's multipole. */
-  multipole_reset(&c->multipole);
+#ifdef SWIFT_DEBUG_CHECKS
+  if (c->gcount == 0)  // MATTHIEU sanity check
+    error("Empty cell !");
+#endif
 
-  /* Split? */
-  if (c->split) {
+  /* Anything to do here? */
+  if (c->ti_end_min > ti_current) return;
 
-    /* Recurse. */
-    for (int k = 0; k < 8; k++)
-      if (c->progeny[k] != NULL) runner_dograv_up(r, c->progeny[k]);
+#if ICHECK > 0
+  for (int pid = 0; pid < gcount; pid++) {
 
-    /* Collect the multipoles from the progeny. */
-    multipole_reset(&c->multipole);
-    for (int k = 0; k < 8; k++)
-      if (c->progeny[k] != NULL)
-        multipole_merge(&c->multipole, &c->progeny[k]->multipole);
+    /* Get a hold of the ith part in ci. */
+    struct gpart *restrict gp = &gparts[pid];
 
+    if (gp->id_or_neg_offset == ICHECK)
+      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
+              gp->id_or_neg_offset, c->loc[0], c->loc[1], c->loc[2],
+              c->width[0], c->gcount);
   }
+#endif
 
-  /* No, leaf node. */
-  else
+  /* Loop over all particles in ci... */
+  for (int pid = 0; pid < gcount; pid++) {
 
-    /* Just collect the multipole. */
-    multipole_init(&c->multipole, c->gparts, c->gcount);
-}
+    /* Get a hold of the ith part in ci. */
+    struct gpart *restrict gpi = &gparts[pid];
 
-/**
- * @brief Compute the recursive downward sweep, i.e. apply the multipole
- *        acceleration on all the particles.
- *
- * @param r The #runner.
- * @param c The top-level #cell.
- */
+    /* Loop over every particle in the other cell. */
+    for (int pjd = pid + 1; pjd < gcount; pjd++) {
+
+      /* Get a hold of the jth part in ci. */
+      struct gpart *restrict gpj = &gparts[pjd];
 
-void runner_dograv_down(struct runner *r, struct cell *c) {
+      /* Compute the pairwise distance. */
+      float dx[3] = {gpi->x[0] - gpj->x[0],   // x
+                     gpi->x[1] - gpj->x[1],   // y
+                     gpi->x[2] - gpj->x[2]};  // z
+      const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
 
-  struct multipole *m = &c->multipole;
+      /* Interact ! */
+      if (gpi->ti_end <= ti_current && gpj->ti_end <= ti_current) {
 
-  /* Split? */
-  if (c->split) {
+        runner_iact_grav_pp(rlr_inv, r2, dx, gpi, gpj);
 
-    /* Apply this cell's acceleration on the multipoles below. */
-    for (int k = 0; k < 8; k++)
-      if (c->progeny[k] != NULL) {
-        struct multipole *mp = &c->progeny[k]->multipole;
-        mp->a[0] += m->a[0];
-        mp->a[1] += m->a[1];
-        mp->a[2] += m->a[2];
-      }
+      } else {
 
-    /* Recurse. */
-    for (int k = 0; k < 8; k++)
-      if (c->progeny[k] != NULL) runner_dograv_down(r, c->progeny[k]);
+        if (gpi->ti_end <= ti_current) {
 
-  }
+          runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpi, gpj);
 
-  /* No, leaf node. */
-  else {
+        } else if (gpj->ti_end <= ti_current) {
 
-    /* Apply the multipole acceleration to all gparts. */
-    for (int k = 0; k < c->gcount; k++) {
-      struct gpart *p = &c->gparts[k];
-      p->a_grav[0] += m->a[0];
-      p->a_grav[1] += m->a[1];
-      p->a_grav[2] += m->a[2];
+          dx[0] = -dx[0];
+          dx[1] = -dx[1];
+          dx[2] = -dx[2];
+          runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpj, gpi);
+        }
+      }
     }
   }
+
+  TIMER_TOC(timer_doself_grav_pp);
 }
 
 /**
- * @brief Compute the multipole-multipole interaction between two cells.
+ * @brief Computes the interaction of all the particles in a cell with all the
+ * particles of another cell.
  *
  * @param r The #runner.
  * @param ci The first #cell.
- * @param cj The second #cell.
+ * @param cj The other #cell.
+ * @param gettimer Are we timing this ?
+ *
+ * @todo Use a local cache for the particles.
  */
+static void runner_dopair_grav(struct runner *r, struct cell *ci,
+                               struct cell *cj, int gettimer) {
 
-void runner_dograv_mm(struct runner *r, struct cell *restrict ci,
-                      struct cell *restrict cj) {
-
-  struct engine *e = r->e;
-  int k;
-  double shift[3] = {0.0, 0.0, 0.0};
-  float dx[3], theta;
-
-  /* Compute the shift between the cells. */
-  for (k = 0; k < 3; k++) {
-    dx[k] = cj->loc[k] - ci->loc[k];
-    if (r->e->s->periodic) {
-      if (dx[k] < -e->s->dim[k] / 2)
-        shift[k] = e->s->dim[k];
-      else if (dx[k] > e->s->dim[k] / 2)
-        shift[k] = -e->s->dim[k];
-      dx[k] += shift[k];
-    }
-  }
-  theta = sqrt((dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]) /
-               (ci->width[0] * ci->width[0] + ci->width[1] * ci->width[1] +
-                ci->width[2] * ci->width[2]));
+#ifdef SWIFT_DEBUG_CHECKS
 
-  /* Do an MM or an MP/PM? */
-  if (theta > const_theta_max * 4) {
+  const int gcount_i = ci->gcount;
+  const int gcount_j = cj->gcount;
 
-    /* Update the multipoles. */
-    multipole_iact_mm(&ci->multipole, &cj->multipole, shift);
+  /* Early abort? */
+  if (gcount_i == 0 || gcount_j == 0) error("Empty cell !");
 
-  } else {
+  /* Bad stuff will happen if cell sizes are different */
+  if (ci->width[0] != cj->width[0])
+    error("Non matching cell sizes !! h_i=%f h_j=%f", ci->width[0],
+          cj->width[0]);
 
-    /* Interact the multipoles via their parts. */
-    for (k = 0; k < ci->gcount; k++)
-      multipole_iact_mp(&cj->multipole, &ci->gparts[k], shift);
-    for (k = 0; k < cj->gcount; k++)
-      multipole_iact_mp(&ci->multipole, &cj->gparts[k], shift);
-  }
-}
+  /* Sanity check */
+  if (ci == cj)
+    error(
+        "The impossible has happened: pair interaction between a cell and "
+        "itself.");
 
-/**
- * @brief Compute the interactions between a cell pair.
- *
- * @param r The #runner.
- * @param ci The first #cell.
- * @param cj The second #cell.
- */
+  /* Are the cells direct neighbours? */
+  if (!cell_are_neighbours(ci, cj))
+    error(
+        "Non-neighbouring cells ! ci->x=[%f %f %f] ci->width=%f cj->loc=[%f %f "
+        "%f] "
+        "cj->width=%f",
+        ci->loc[0], ci->loc[1], ci->loc[2], ci->width[0], cj->loc[0],
+        cj->loc[1], cj->loc[2], cj->width[0]);
 
-void runner_dopair_grav(struct runner *r, struct cell *restrict ci,
-                        struct cell *restrict cj) {
-
-  struct engine *e = r->e;
-  int pid, pjd, k, count_i = ci->gcount, count_j = cj->gcount;
-  double shift[3] = {0.0, 0.0, 0.0};
-  struct gpart *restrict parts_i = ci->gparts, *restrict parts_j = cj->gparts;
-  struct gpart *restrict pi, *restrict pj;
-  double pix[3];
-  float dx[3], r2;
-  const int ti_current = r->e->ti_current;
-#ifdef WITH_VECTORIZATION
-  int icount = 0;
-  float r2q[VEC_SIZE] __attribute__((aligned(16)));
-  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
-  struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
-  TIMER_TIC
 
-  /* Anything to do here? */
-  if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
-
-  /* Get the relative distance between the pairs, wrapping. */
-  if (e->s->periodic)
-    for (k = 0; k < 3; k++) {
-      if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
-        shift[k] = e->s->dim[k];
-      else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
-        shift[k] = -e->s->dim[k];
-    }
-
-  /* Loop over the parts in ci. */
-  for (pid = 0; pid < count_i; pid++) {
+#if ICHECK > 0
+  for (int pid = 0; pid < ci->gcount; pid++) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts_i[pid];
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    struct gpart *restrict gp = &ci->gparts[pid];
 
-    /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_j; pjd++) {
+    if (gp->id_or_neg_offset == ICHECK)
+      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
+              gp->id_or_neg_offset, cj->loc[0], cj->loc[1], cj->loc[2],
+              cj->width[0], cj->gcount);
+  }
 
-      /* Get a pointer to the jth particle. */
-      pj = &parts_j[pjd];
+  for (int pid = 0; pid < cj->gcount; pid++) {
 
-      /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
-        dx[k] = pix[k] - pj->x[k];
-        r2 += dx[k] * dx[k];
-      }
+    /* Get a hold of the ith part in ci. */
+    struct gpart *restrict gp = &cj->gparts[pid];
 
-/* Compute the interaction. */
-#ifndef WITH_VECTORIZATION
+    if (gp->id_or_neg_offset == ICHECK)
+      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
+              gp->id_or_neg_offset, ci->loc[0], ci->loc[1], ci->loc[2],
+              ci->width[0], ci->gcount);
+  }
+#endif
 
-      // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 )
-      //     message( "interacting particles pi=%lli and pj=%lli with r=%.3e in
-      // cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long
-      // long int)ci) / sizeof(struct cell) , ((long long int)cj) /
-      // sizeof(struct cell) );
+  TIMER_TIC;
 
-      runner_iact_grav(r2, dx, pi, pj);
+  /* Are both cells split ? */
+  if (ci->split && cj->split) {
 
-#else
+    for (int j = 0; j < 8; j++) {
+      if (ci->progeny[j] != NULL) {
 
-      /* Add this interaction to the queue. */
-      r2q[icount] = r2;
-      dxq[3 * icount + 0] = dx[0];
-      dxq[3 * icount + 1] = dx[1];
-      dxq[3 * icount + 2] = dx[2];
-      piq[icount] = pi;
-      pjq[icount] = pj;
-      icount += 1;
+        for (int k = 0; k < 8; k++) {
+          if (cj->progeny[k] != NULL) {
 
-      /* Flush? */
-      if (icount == VEC_SIZE) {
-        runner_iact_vec_grav(r2q, dxq, piq, pjq);
-        icount = 0;
-      }
+            if (cell_are_neighbours(ci->progeny[j], cj->progeny[k])) {
 
-#endif
+              /* Recurse */
+              runner_dopair_grav(r, ci->progeny[j], cj->progeny[k], 0);
 
-    } /* loop over the parts in cj. */
+            } else {
 
-  } /* loop over the parts in ci. */
+              /* Ok, here we can go for particle-multipole interactions */
+              runner_dopair_grav_pm(r, ci->progeny[j], cj->progeny[k]);
+              runner_dopair_grav_pm(r, cj->progeny[k], ci->progeny[j]);
+            }
+          }
+        }
+      }
+    }
+  } else { /* Not split */
 
-#ifdef WITH_VECTORIZATION
-  /* Pick up any leftovers. */
-  if (icount > 0)
-    for (k = 0; k < icount; k++)
-      runner_iact_grav(r2q[k], &dxq[3 * k], piq[k], pjq[k]);
-#endif
+    /* Compute the interactions at this level directly. */
+    runner_dopair_grav_pp(r, ci, cj);
+  }
 
-  TIMER_TOC(timer_dopair_grav);
+  if (gettimer) TIMER_TOC(timer_dosub_pair_grav);
 }
 
 /**
- * @brief Compute the interactions within a cell.
+ * @brief Computes the interaction of all the particles in a cell
  *
  * @param r The #runner.
- * @param c The #cell.
+ * @param c The first #cell.
+ * @param gettimer Are we timing this ?
+ *
+ * @todo Use a local cache for the particles.
  */
+static void runner_doself_grav(struct runner *r, struct cell *c, int gettimer) {
+
+#ifdef SWIFT_DEBUG_CHECKS
 
-void runner_doself_grav(struct runner *r, struct cell *restrict c) {
-
-  int pid, pjd, k, count = c->gcount;
-  struct gpart *restrict parts = c->gparts;
-  struct gpart *restrict pi, *restrict pj;
-  double pix[3] = {0.0, 0.0, 0.0};
-  float dx[3], r2;
-  const int ti_current = r->e->ti_current;
-#ifdef WITH_VECTORIZATION
-  int icount = 0;
-  float r2q[VEC_SIZE] __attribute__((aligned(16)));
-  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
-  struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE];
+  /* Early abort? */
+  if (c->gcount == 0) error("Empty cell !");
 #endif
-  TIMER_TIC
 
-  /* Anything to do here? */
-  if (c->ti_end_min > ti_current) return;
+  TIMER_TIC;
 
-  /* Loop over every part in c. */
-  for (pid = 0; pid < count; pid++) {
+  /* If the cell is split, interact each progeny with itself, and with
+     each of its siblings. */
+  if (c->split) {
 
-    /* Get a hold of the ith part in ci. */
-    pi = &parts[pid];
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k];
+    for (int j = 0; j < 8; j++) {
+      if (c->progeny[j] != NULL) {
 
-    /* Loop over every other part in c. */
-    for (pjd = pid + 1; pjd < count; pjd++) {
+        runner_doself_grav(r, c->progeny[j], 0);
 
-      /* Get a pointer to the jth particle. */
-      pj = &parts[pjd];
+        for (int k = j + 1; k < 8; k++) {
+          if (c->progeny[k] != NULL) {
 
-      /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
-        dx[k] = pix[k] - pj->x[k];
-        r2 += dx[k] * dx[k];
+            runner_dopair_grav(r, c->progeny[j], c->progeny[k], 0);
+          }
+        }
       }
+    }
+  }
 
-/* Compute the interaction. */
-#ifndef WITH_VECTORIZATION
-
-      // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 )
-      //     message( "interacting particles pi=%lli and pj=%lli with r=%.3e." ,
-      // pi->part->id , pj->part->id , sqrtf(r2) );
-
-      runner_iact_grav(r2, dx, pi, pj);
+  /* If the cell is not split, then just go for it... */
+  else {
 
-#else
+    runner_doself_grav_pp(r, c);
+  }
 
-      /* Add this interaction to the queue. */
-      r2q[icount] = r2;
-      dxq[3 * icount + 0] = dx[0];
-      dxq[3 * icount + 1] = dx[1];
-      dxq[3 * icount + 2] = dx[2];
-      piq[icount] = pi;
-      pjq[icount] = pj;
-      icount += 1;
+  if (gettimer) TIMER_TOC(timer_dosub_self_grav);
+}
 
-      /* Flush? */
-      if (icount == VEC_SIZE) {
-        runner_iact_vec_grav(r2q, dxq, piq, pjq);
-        icount = 0;
-      }
+static void runner_dosub_grav(struct runner *r, struct cell *ci,
+                              struct cell *cj, int timer) {
 
-#endif
+  /* Is this a single cell? */
+  if (cj == NULL) {
 
-    } /* loop over the remaining parts in c. */
+    runner_doself_grav(r, ci, 1);
 
-  } /* loop over the parts in c. */
+  } else {
 
-#ifdef WITH_VECTORIZATION
-  /* Pick up any leftovers. */
-  if (icount > 0)
-    for (k = 0; k < icount; k++)
-      runner_iact_grav(r2q[k], &dxq[3 * k], piq[k], pjq[k]);
+#ifdef SWIFT_DEBUG_CHECKS
+    if (!cell_are_neighbours(ci, cj))
+      error("Non-neighbouring cells in pair task !");
 #endif
 
-  TIMER_TOC(timer_doself_grav);
+    runner_dopair_grav(r, ci, cj, 1);
+  }
 }
 
-/**
- * @brief Compute a gravity sub-task.
- *
- * @param r The #runner.
- * @param ci The first #cell.
- * @param cj The second #cell.
- * @param gettimer Flag to record timer or not.
- */
-
-void runner_dosub_grav(struct runner *r, struct cell *ci, struct cell *cj,
-                       int gettimer) {
+static void runner_do_grav_mm(struct runner *r, struct cell *ci, int timer) {
 
-  int j, k, periodic = r->e->s->periodic;
-  struct space *s = r->e->s;
-
-  TIMER_TIC
-
-  /* Self-interaction? */
-  if (cj == NULL) {
+#if ICHECK > 0
+  for (int pid = 0; pid < ci->gcount; pid++) {
 
-    /* If the cell is split, recurse. */
-    if (ci->split) {
-
-      /* Split this task into tasks on its progeny. */
-      for (j = 0; j < 8; j++)
-        if (ci->progeny[j] != NULL) {
-          runner_dosub_grav(r, ci->progeny[j], NULL, 0);
-          for (k = j + 1; k < 8; k++)
-            if (ci->progeny[k] != NULL)
-              runner_dosub_grav(r, ci->progeny[j], ci->progeny[k], 0);
-        }
+    /* Get a hold of the ith part in ci. */
+    struct gpart *restrict gp = &ci->gparts[pid];
 
-    }
+    if (gp->id_or_neg_offset == ICHECK)
+      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
+              gp->id_or_neg_offset, ci->loc[0], ci->loc[1], ci->loc[2],
+              ci->width[0], ci->gcount);
+  }
+#endif
 
-    /* Otherwise, just make a pp task out of it. */
-    else
-      runner_doself_grav(r, ci);
+  /* Recover the list of top-level cells */
+  const struct engine *e = r->e;
+  struct cell *cells = e->s->cells;
+  const int nr_cells = e->s->nr_cells;
+  const int ti_current = e->ti_current;
+  const double max_d =
+      const_gravity_a_smooth * const_gravity_r_cut * ci->width[0];
+  const double max_d2 = max_d * max_d;
+  const double pos_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
 
-  }
+  /* Anything to do here? */
+  if (ci->ti_end_min > ti_current) return;
 
-  /* Nope, pair. */
-  else {
+  /* Loop over all the cells and go for a p-m interaction if far enough but not
+   * too far */
+  for (int i = 0; i < nr_cells; ++i) {
 
-    /* Get the opening angle theta. */
-    float dx[3], theta;
-    for (k = 0; k < 3; k++) {
-      dx[k] = fabs(ci->loc[k] - cj->loc[k]);
-      if (periodic && dx[k] > 0.5 * s->dim[k]) dx[k] = -dx[k] + s->dim[k];
-      if (dx[k] > 0.0f) dx[k] -= ci->width[k];
-    }
-    theta = (dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]) /
-            (ci->width[0] * ci->width[0] + ci->width[1] * ci->width[1] +
-             ci->width[2] * ci->width[2]);
-
-    /* Split the interaction? */
-    if (theta < const_theta_max * const_theta_max) {
-
-      /* Are both ci and cj split? */
-      if (ci->split && cj->split) {
-
-        /* Split this task into tasks on its progeny. */
-        for (j = 0; j < 8; j++)
-          if (ci->progeny[j] != NULL) {
-            for (k = 0; k < 8; k++)
-              if (cj->progeny[k] != NULL)
-                runner_dosub_grav(r, ci->progeny[j], cj->progeny[k], 0);
-          }
+    struct cell *cj = &cells[i];
 
-      }
+    if (ci == cj) continue;
 
-      /* Otherwise, make a pp task out of it. */
-      else
-        runner_dopair_grav(r, ci, cj);
+    const double dx[3] = {cj->loc[0] - pos_i[0],   // x
+                          cj->loc[1] - pos_i[1],   // y
+                          cj->loc[2] - pos_i[2]};  // z
+    const double r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
 
-    }
+    if (r2 > max_d2) continue;
 
-    /* Otherwise, mm interaction is fine. */
-    else
-      runner_dograv_mm(r, ci, cj);
+    if (!cell_are_neighbours(ci, cj)) runner_dopair_grav_pm(r, ci, cj);
   }
-
-  if (gettimer) TIMER_TOC(timer_dosub_grav);
 }
+
 #endif /* SWIFT_RUNNER_DOIACT_GRAV_H */
diff --git a/src/scheduler.c b/src/scheduler.c
index 9b389c8b9ee4d4f844c44707d083542b33137ade..6a0d886bd5458028c5c05812f10c204bc8946a1a 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -173,7 +173,8 @@ void scheduler_splittasks(struct scheduler *s) {
       if (ci->split) {
 
         /* Make a sub? */
-        if (scheduler_dosub && ci->count < space_subsize / ci->count) {
+        if (scheduler_dosub && (ci->count * ci->count < space_subsize ||
+                                ci->gcount * ci->gcount < space_subsize)) {
 
           /* convert to a self-subtask. */
           t->type = task_type_sub_self;
@@ -204,11 +205,10 @@ void scheduler_splittasks(struct scheduler *s) {
                                     ci->progeny[j], ci->progeny[k], 0);
         }
       }
-
     }
 
-    /* Pair interaction? */
-    else if (t->type == task_type_pair) {
+    /* Hydro Pair interaction? */
+    else if (t->type == task_type_pair && t->subtype != task_subtype_grav) {
 
       /* Get a handle on the cells involved. */
       struct cell *ci = t->ci;
@@ -234,7 +234,7 @@ void scheduler_splittasks(struct scheduler *s) {
 
         /* Replace by a single sub-task? */
         if (scheduler_dosub &&
-            ci->count * sid_scale[sid] < space_subsize / cj->count &&
+            ci->count * cj->count * sid_scale[sid] < space_subsize &&
             sid != 0 && sid != 2 && sid != 6 && sid != 8) {
 
           /* Make this task a sub task. */
@@ -518,6 +518,17 @@ void scheduler_splittasks(struct scheduler *s) {
 
     } /* pair interaction? */
 
+    /* Long-range gravity interaction ? */
+    else if (t->type == task_type_grav_mm) {
+
+      /* Get a handle on the cells involved. */
+      struct cell *ci = t->ci;
+
+      /* Safety thing */
+      if (ci->gcount == 0) t->type = task_type_none;
+
+    } /* gravity interaction? */
+
   } /* loop over all tasks. */
 }
 
diff --git a/src/space.c b/src/space.c
index ea652f7f9b918e7f4f1ebe0a81e6604765691eb1..8b7e5f81b539fb02da44dac67a3575356c08c45e 100644
--- a/src/space.c
+++ b/src/space.c
@@ -205,6 +205,12 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
         "Must have at least 3 cells in each spatial dimension when periodicity "
         "is switched on.");
 
+  /* Check if we have enough cells for gravity. */
+  if (s->gravity && (cdim[0] < 8 || cdim[1] < 8 || cdim[2] < 8))
+    error(
+        "Must have at least 8 cells in each spatial dimension when gravity "
+        "is switched on.");
+
 /* In MPI-Land, changing the top-level cell size requires that the
  * global partition is recomputed and the particles redistributed.
  * Be prepared to do that. */
@@ -1421,7 +1427,8 @@ struct cell *space_getcell(struct space *s) {
  * @param Npart The number of Gas particles in the space.
  * @param Ngpart The number of Gravity particles in the space.
  * @param periodic flag whether the domain is periodic or not.
- * @param verbose Print messages to stdout or not
+ * @param gravity flag whether we are doing gravity or not.
+ * @param verbose Print messages to stdout or not.
  * @param dry_run If 1, just initialise stuff, don't do anything with the parts.
  *
  * Makes a grid of edge length > r_max and fills the particles
@@ -1432,8 +1439,8 @@ struct cell *space_getcell(struct space *s) {
 
 void space_init(struct space *s, const struct swift_params *params,
                 double dim[3], struct part *parts, struct gpart *gparts,
-                size_t Npart, size_t Ngpart, int periodic, int verbose,
-                int dry_run) {
+                size_t Npart, size_t Ngpart, int periodic, int gravity,
+                int verbose, int dry_run) {
 
   /* Clean-up everything */
   bzero(s, sizeof(struct space));
@@ -1443,6 +1450,7 @@ void space_init(struct space *s, const struct swift_params *params,
   s->dim[1] = dim[1];
   s->dim[2] = dim[2];
   s->periodic = periodic;
+  s->gravity = gravity;
   s->nr_parts = Npart;
   s->size_parts = Npart;
   s->parts = parts;
diff --git a/src/space.h b/src/space.h
index a859ff5cb0497276ba5ae6dcec6a47d0ca5d7398..180759f2af69dd0d77e6e681229353358d554291 100644
--- a/src/space.h
+++ b/src/space.h
@@ -98,6 +98,9 @@ struct space {
   /* Is the space periodic? */
   int periodic;
 
+  /* Are we doing gravity? */
+  int gravity;
+
   /* General-purpose lock for this space. */
   swift_lock_type lock;
 
@@ -141,8 +144,8 @@ int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
                  double *shift);
 void space_init(struct space *s, const struct swift_params *params,
                 double dim[3], struct part *parts, struct gpart *gparts,
-                size_t Npart, size_t Ngpart, int periodic, int verbose,
-                int dry_run);
+                size_t Npart, size_t Ngpart, int periodic, int gravity,
+                int verbose, int dry_run);
 void space_map_cells_pre(struct space *s, int full,
                          void (*fun)(struct cell *c, void *data), void *data);
 void space_map_parts(struct space *s,
diff --git a/src/task.c b/src/task.c
index f3c6a8f711df1a0506567216d1d4021c88c10f58..e9404ab00df4f757f49d6d186f28dc40c49cfa01 100644
--- a/src/task.c
+++ b/src/task.c
@@ -47,20 +47,19 @@
 
 /* Task type names. */
 const char *taskID_names[task_type_count] = {
-    "none",       "sort",      "self",          "pair",      "sub_self",
-    "sub_pair",   "init",      "ghost",         "drift",     "kick",
-    "kick_fixdt", "send",      "recv",          "grav_pp",   "grav_mm",
-    "grav_up",    "grav_down", "grav_external", "part_sort", "gpart_sort",
-    "split_cell", "rewait"};
+    "none",    "sort",          "self",          "pair",       "sub",
+    "init",    "ghost",         "drift",         "kick",       "kick_fixdt",
+    "send",    "recv",          "grav_gather_m", "grav_fft",   "grav_mm",
+    "grav_up", "grav_external", "part_sort",     "gpart_sort", "split_cell",
+    "rewait"};
 
 const char *subtaskID_names[task_type_count] = {"none", "density", "force",
-                                                "grav", "t_end"};
+                                                "grav"};
 
 /**
  * @brief Computes the overlap between the parts array of two given cells.
  */
-
-size_t task_cell_overlap(const struct cell *ci, const struct cell *cj) {
+size_t task_cell_overlap_part(const struct cell *ci, const struct cell *cj) {
   if (ci == NULL || cj == NULL) return 0;
   if (ci->parts <= cj->parts &&
       ci->parts + ci->count >= cj->parts + cj->count) {
@@ -72,6 +71,95 @@ size_t task_cell_overlap(const struct cell *ci, const struct cell *cj) {
   return 0;
 }
 
+/**
+ * @brief Computes the overlap between the gparts array of two given cells.
+ */
+size_t task_cell_overlap_gpart(const struct cell *ci, const struct cell *cj) {
+  if (ci == NULL || cj == NULL) return 0;
+  if (ci->gparts <= cj->gparts &&
+      ci->gparts + ci->gcount >= cj->gparts + cj->gcount) {
+    return cj->gcount;
+  } else if (cj->gparts <= ci->gparts &&
+             cj->gparts + cj->gcount >= ci->gparts + ci->gcount) {
+    return ci->gcount;
+  }
+  return 0;
+}
+
+/**
+ * @brief Returns the #task_actions for a given task.
+ *
+ * @param t The #task.
+ */
+enum task_actions task_acts_on(const struct task *t) {
+
+  switch (t->type) {
+
+    case task_type_none:
+      return task_action_none;
+      break;
+
+    case task_type_sort:
+    case task_type_ghost:
+      return task_action_part;
+      break;
+
+    case task_type_self:
+    case task_type_pair:
+    case task_type_sub_self:
+    case task_type_sub_pair:
+      switch (t->subtype) {
+
+        case task_subtype_density:
+        case task_subtype_force:
+          return task_action_part;
+          break;
+
+        case task_subtype_grav:
+          return task_action_gpart;
+          break;
+
+        default:
+          error("Unknow task_action for task");
+          return task_action_none;
+          break;
+      }
+      break;
+
+    case task_type_init:
+    case task_type_drift:
+    case task_type_kick:
+    case task_type_kick_fixdt:
+    case task_type_send:
+    case task_type_recv:
+      return task_action_all;
+      break;
+
+    case task_type_grav_gather_m:
+    case task_type_grav_fft:
+    case task_type_grav_mm:
+    case task_type_grav_up:
+      return task_action_multipole;
+      break;
+
+    case task_type_grav_external:
+      return task_action_gpart;
+      break;
+
+    case task_type_part_sort:
+    case task_type_gpart_sort:
+    case task_type_split_cell:
+    case task_type_rewait:
+      return task_action_none;
+      break;
+
+    default:
+      error("Unknow task_action for task");
+      return task_action_none;
+      break;
+  }
+}
+
 /**
  * @brief Compute the Jaccard similarity of the data used by two
  *        different tasks.
@@ -79,31 +167,64 @@ size_t task_cell_overlap(const struct cell *ci, const struct cell *cj) {
  * @param ta The first #task.
  * @param tb The second #task.
  */
-
 float task_overlap(const struct task *ta, const struct task *tb) {
+
+  if (ta == NULL || tb == NULL) return 0.f;
+
+  const enum task_actions ta_act = task_acts_on(ta);
+  const enum task_actions tb_act = task_acts_on(tb);
+
   /* First check if any of the two tasks are of a type that don't
      use cells. */
-  if (ta == NULL || tb == NULL || ta->type == task_type_none ||
-      ta->type == task_type_part_sort || ta->type == task_type_gpart_sort ||
-      ta->type == task_type_split_cell || ta->type == task_type_rewait ||
-      tb->type == task_type_none || tb->type == task_type_part_sort ||
-      tb->type == task_type_gpart_sort || tb->type == task_type_split_cell ||
-      tb->type == task_type_rewait)
-    return 0.0f;
-
-  /* Compute the union of the cell data. */
-  size_t size_union = 0;
-  if (ta->ci != NULL) size_union += ta->ci->count;
-  if (ta->cj != NULL) size_union += ta->cj->count;
-  if (tb->ci != NULL) size_union += tb->ci->count;
-  if (tb->cj != NULL) size_union += tb->cj->count;
-
-  /* Compute the intersection of the cell data. */
-  const size_t size_intersect =
-      task_cell_overlap(ta->ci, tb->ci) + task_cell_overlap(ta->ci, tb->cj) +
-      task_cell_overlap(ta->cj, tb->ci) + task_cell_overlap(ta->cj, tb->cj);
-
-  return ((float)size_intersect) / (size_union - size_intersect);
+  if (ta_act == task_action_none || tb_act == task_action_none) return 0.f;
+
+  const int ta_part = (ta_act == task_action_part || ta_act == task_action_all);
+  const int ta_gpart =
+      (ta_act == task_action_gpart || ta_act == task_action_all);
+  const int tb_part = (tb_act == task_action_part || tb_act == task_action_all);
+  const int tb_gpart =
+      (tb_act == task_action_gpart || tb_act == task_action_all);
+
+  /* In the case where both tasks act on parts */
+  if (ta_part && tb_part) {
+
+    /* Compute the union of the cell data. */
+    size_t size_union = 0;
+    if (ta->ci != NULL) size_union += ta->ci->count;
+    if (ta->cj != NULL) size_union += ta->cj->count;
+    if (tb->ci != NULL) size_union += tb->ci->count;
+    if (tb->cj != NULL) size_union += tb->cj->count;
+
+    /* Compute the intersection of the cell data. */
+    const size_t size_intersect = task_cell_overlap_part(ta->ci, tb->ci) +
+                                  task_cell_overlap_part(ta->ci, tb->cj) +
+                                  task_cell_overlap_part(ta->cj, tb->ci) +
+                                  task_cell_overlap_part(ta->cj, tb->cj);
+
+    return ((float)size_intersect) / (size_union - size_intersect);
+  }
+
+  /* In the case where both tasks act on gparts */
+  else if (ta_gpart && tb_gpart) {
+
+    /* Compute the union of the cell data. */
+    size_t size_union = 0;
+    if (ta->ci != NULL) size_union += ta->ci->gcount;
+    if (ta->cj != NULL) size_union += ta->cj->gcount;
+    if (tb->ci != NULL) size_union += tb->ci->gcount;
+    if (tb->cj != NULL) size_union += tb->cj->gcount;
+
+    /* Compute the intersection of the cell data. */
+    const size_t size_intersect = task_cell_overlap_gpart(ta->ci, tb->ci) +
+                                  task_cell_overlap_gpart(ta->ci, tb->cj) +
+                                  task_cell_overlap_gpart(ta->cj, tb->ci) +
+                                  task_cell_overlap_gpart(ta->cj, tb->cj);
+
+    return ((float)size_intersect) / (size_union - size_intersect);
+  }
+
+  /* Else, no overlap */
+  return 0.f;
 }
 
 /**
@@ -111,26 +232,41 @@ float task_overlap(const struct task *ta, const struct task *tb) {
  *
  * @param t The #task.
  */
-
 void task_unlock(struct task *t) {
 
+  const int type = t->type;
+  const int subtype = t->subtype;
+  struct cell *ci = t->ci, *cj = t->cj;
+
   /* Act based on task type. */
-  switch (t->type) {
+  switch (type) {
+
+    case task_type_sort:
+      cell_unlocktree(ci);
+      break;
+
     case task_type_self:
     case task_type_sub_self:
-    case task_type_sort:
-      cell_unlocktree(t->ci);
+      if (subtype == task_subtype_grav) {
+        cell_gunlocktree(ci);
+      } else {
+        cell_unlocktree(ci);
+      }
       break;
+
     case task_type_pair:
     case task_type_sub_pair:
-      cell_unlocktree(t->ci);
-      cell_unlocktree(t->cj);
+      if (subtype == task_subtype_grav) {
+        cell_gunlocktree(ci);
+        cell_gunlocktree(cj);
+      } else {
+        cell_unlocktree(ci);
+        cell_unlocktree(cj);
+      }
       break;
-    case task_type_grav_pp:
+
     case task_type_grav_mm:
-    case task_type_grav_down:
-      cell_gunlocktree(t->ci);
-      if (t->cj != NULL) cell_gunlocktree(t->cj);
+      cell_gunlocktree(ci);
       break;
     default:
       break;
@@ -142,7 +278,6 @@ void task_unlock(struct task *t) {
  *
  * @param t the #task.
  */
-
 int task_lock(struct task *t) {
 
   const int type = t->type;
@@ -205,6 +340,10 @@ int task_lock(struct task *t) {
       }
       break;
 
+    case task_type_grav_mm:
+      cell_glocktree(ci);
+      break;
+
     default:
       break;
   }
diff --git a/src/task.h b/src/task.h
index 8dea503e42cdba9a188112cfa668c464c69f9959..ee49568b143282b9e3025f6d2bc81ded04ffee41 100644
--- a/src/task.h
+++ b/src/task.h
@@ -46,10 +46,10 @@ enum task_types {
   task_type_kick_fixdt,
   task_type_send,
   task_type_recv,
-  task_type_grav_pp,
+  task_type_grav_gather_m,
+  task_type_grav_fft,
   task_type_grav_mm,
   task_type_grav_up,
-  task_type_grav_down,
   task_type_grav_external,
   task_type_part_sort,
   task_type_gpart_sort,
@@ -70,6 +70,16 @@ enum task_subtypes {
   task_subtype_count
 };
 
+/* The kind of action the task perform */
+enum task_actions {
+  task_action_none,
+  task_action_part,
+  task_action_gpart,
+  task_action_all,
+  task_action_multipole,
+  task_action_count
+};
+
 extern const char *subtaskID_names[];
 
 /* Data of a task. */
@@ -102,5 +112,6 @@ int task_lock(struct task *t);
 void task_print_mask(unsigned int mask);
 void task_print_submask(unsigned int submask);
 void task_do_rewait(struct task *t);
+enum task_actions task_acts_on(const struct task *t);
 
 #endif /* SWIFT_TASK_H */
diff --git a/src/timers.h b/src/timers.h
index c48961b39737f23eb936d7283f76651d33892991..aa8455397daf1e88b709a8332e3ae63694991e94 100644
--- a/src/timers.h
+++ b/src/timers.h
@@ -37,10 +37,11 @@ enum {
   timer_dosort,
   timer_doself_density,
   timer_doself_force,
-  timer_doself_grav,
+  timer_doself_grav_pp,
   timer_dopair_density,
   timer_dopair_force,
-  timer_dopair_grav,
+  timer_dopair_grav_pm,
+  timer_dopair_grav_pp,
   timer_dograv_external,
   timer_dosub_self_density,
   timer_dosub_self_force,
diff --git a/src/tools.c b/src/tools.c
index 50ad48af38285062738c7c8ce12e0f4535086dd5..b4a37328ff7df9de76adceebb5ed6801fa417851 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -426,7 +426,7 @@ void pairs_single_grav(double *dim, long long int pid,
       fdx[i] = dx[i];
     }
     r2 = fdx[0] * fdx[0] + fdx[1] * fdx[1] + fdx[2] * fdx[2];
-    runner_iact_grav(r2, fdx, &pi, &pj);
+    runner_iact_grav_pp(0.f, r2, fdx, &pi, &pj);
     a[0] += pi.a_grav[0];
     a[1] += pi.a_grav[1];
     a[2] += pi.a_grav[2];
@@ -600,3 +600,65 @@ void shuffle_particles(struct part *parts, const int count) {
   } else
     error("Array not big enough to shuffle!");
 }
+
+/**
+ * @brief Computes the forces between all g-particles using the N^2 algorithm
+ *
+ * Overwrites the accelerations of the gparts with the values.
+ * Do not use for actual runs.
+ *
+ * @brief gparts The array of particles.
+ * @brief gcount The number of particles.
+ */
+void gravity_n2(struct gpart *gparts, const int gcount,
+                const struct phys_const *constants, float rlr) {
+
+  const float rlr_inv = 1. / rlr;
+  const float max_d = const_gravity_r_cut * rlr;
+  const float max_d2 = max_d * max_d;
+
+  message("rlr_inv= %f", rlr_inv);
+  message("max_d: %f", max_d);
+
+  /* Reset everything */
+  for (int pid = 0; pid < gcount; pid++) {
+    struct gpart *restrict gpi = &gparts[pid];
+    gpi->a_grav[0] = 0.f;
+    gpi->a_grav[1] = 0.f;
+    gpi->a_grav[2] = 0.f;
+  }
+
+  /* Loop over all particles in ci... */
+  for (int pid = 0; pid < gcount; pid++) {
+
+    /* Get a hold of the ith part in ci. */
+    struct gpart *restrict gpi = &gparts[pid];
+
+    for (int pjd = pid + 1; pjd < gcount; pjd++) {
+
+      /* Get a hold of the jth part in ci. */
+      struct gpart *restrict gpj = &gparts[pjd];
+
+      /* Compute the pairwise distance. */
+      const float dx[3] = {gpi->x[0] - gpj->x[0],   // x
+                           gpi->x[1] - gpj->x[1],   // y
+                           gpi->x[2] - gpj->x[2]};  // z
+      const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
+
+      if (r2 < max_d2 || 1) {
+
+        /* Apply the gravitational acceleration. */
+        runner_iact_grav_pp(rlr_inv, r2, dx, gpi, gpj);
+      }
+    }
+  }
+
+  /* Multiply by Newton's constant */
+  const double const_G = constants->const_newton_G;
+  for (int pid = 0; pid < gcount; pid++) {
+    struct gpart *restrict gpi = &gparts[pid];
+    gpi->a_grav[0] *= const_G;
+    gpi->a_grav[1] *= const_G;
+    gpi->a_grav[2] *= const_G;
+  }
+}
diff --git a/src/tools.h b/src/tools.h
index ba5b2f8c7f118edb042c825ff00907ad807b3d55..43ddd946c3e8cdf53139bb917135dffd8a8acd12 100644
--- a/src/tools.h
+++ b/src/tools.h
@@ -22,12 +22,9 @@
 #ifndef SWIFT_TOOL_H
 #define SWIFT_TOOL_H
 
-/* Config parameters. */
-#include "../config.h"
-
-/* Includes. */
 #include "cell.h"
 #include "part.h"
+#include "physical_constants.h"
 #include "runner.h"
 
 void factor(int value, int *f1, int *f2);
@@ -47,5 +44,7 @@ void pairs_n2(double *dim, struct part *restrict parts, int N, int periodic);
 
 double random_uniform(double a, double b);
 void shuffle_particles(struct part *parts, const int count);
+void gravity_n2(struct gpart *gparts, const int gcount,
+                const struct phys_const *constants, float rlr);
 
 #endif /* SWIFT_TOOL_H */
diff --git a/src/version.c b/src/version.c
index ab22c0d9ab8841dedd09aa83aa8803e468e69ce9..68a051fa08c53c68319c1e785c0c6503afe18f4d 100644
--- a/src/version.c
+++ b/src/version.c
@@ -33,6 +33,10 @@
 #include <hdf5.h>
 #endif
 
+#ifdef HAVE_FFTW
+#include <fftw3.h>
+#endif
+
 /* Some standard headers. */
 #include <stdio.h>
 #include <string.h>
@@ -238,6 +242,22 @@ const char *metis_version(void) {
   return version;
 }
 
+/**
+ * @brief return the FFTW version used when SWIFT was built.
+ *
+ * @result description of the FFTW version.
+ */
+const char *fftw3_version(void) {
+
+  static char version[256] = {0};
+#if defined(HAVE_FFTW)
+  sprintf(version, "%s", "3.x (details not available)");
+#else
+  sprintf(version, "Unknown version");
+#endif
+  return version;
+}
+
 /**
  * @brief Prints a greeting message to the standard output containing code
  * version and revision number
@@ -259,6 +279,9 @@ void greetings(void) {
 #ifdef HAVE_HDF5
   printf(" HDF5 library version: %s\n", hdf5_version());
 #endif
+#ifdef HAVE_FFTW
+  printf(" FFTW library version: %s\n", fftw3_version());
+#endif
 #ifdef WITH_MPI
   printf(" MPI library: %s\n", mpi_version());
 #ifdef HAVE_METIS
diff --git a/src/version.h b/src/version.h
index 4b6b38d220b36e5cb340571cd89e6f52a7b21a8e..0d568f312cf775cf932d580a49da7c19c9e14b21 100644
--- a/src/version.h
+++ b/src/version.h
@@ -27,8 +27,9 @@ const char* git_branch(void);
 const char* compiler_name(void);
 const char* compiler_version(void);
 const char* mpi_version(void);
-const char* hdf5_version(void);
 const char* metis_version(void);
+const char* hdf5_version(void);
+const char* fftw3_version(void);
 void greetings(void);
 
 #endif /* SWIFT_VERSION_H */
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 4d6cb65ca6971d30fd6e39f52ce582c59cecb8b8..2adce2c90985b537a668adfccec0d5241113314e 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -18,17 +18,17 @@
 AM_CFLAGS = -I$(top_srcdir)/src $(HDF5_CPPFLAGS) -DTIMER
 
 
-AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS)
+AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(FFTW_LIBS)
 
 # List of programs and scripts to run in the test suite
 TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetry \
         testPair.sh testPairPerturbed.sh test27cells.sh test27cellsPerturbed.sh  \
-        testParser.sh testSPHStep test125cells.sh
+        testParser.sh testSPHStep test125cells.sh testKernelGrav testFFT
 
 # List of test programs to compile
 check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \
 		 testSPHStep testPair test27cells test125cells testParser \
-                 testKernel testInteractions testMaths testSymmetry
+                 testKernel testKernelGrav testFFT testInteractions testMaths testSymmetry
 
 # Sources for the individual programs
 testGreetings_SOURCES = testGreetings.c
@@ -37,8 +37,6 @@ testMaths_SOURCES = testMaths.c
 
 testReading_SOURCES = testReading.c
 
-testKernel_SOURCES = testKernel.c
-
 testSymmetry_SOURCES = testSymmetry.c
 
 testTimeIntegration_SOURCES = testTimeIntegration.c
@@ -55,6 +53,12 @@ test125cells_SOURCES = test125cells.c
 
 testParser_SOURCES = testParser.c
 
+testKernel_SOURCES = testKernel.c
+
+testKernelGrav_SOURCES = testKernelGrav.c
+
+testFFT_SOURCES = testFFT.c
+
 testInteractions_SOURCES = testInteractions.c
 
 # Files necessary for distribution
diff --git a/tests/testFFT.c b/tests/testFFT.c
new file mode 100644
index 0000000000000000000000000000000000000000..7ff29dac3f0e9b7d6aec65cbfc7e4f4b85d08dbb
--- /dev/null
+++ b/tests/testFFT.c
@@ -0,0 +1,189 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* Some standard headers. */
+#include <stdlib.h>
+#include <string.h>
+
+#include <fftw3.h>
+
+/* Includes. */
+#include "swift.h"
+
+const double G = 1.;
+
+const size_t N = 16;
+const size_t PMGRID = 8;
+
+// const double asmth = 2. * M_PI * const_gravity_a_smooth / boxSize;
+// const double asmth2 = asmth * asmth;
+// const double fact = G / (M_PI * boxSize) * (1. / (2. * boxSize / PMGRID));
+
+int main() {
+
+  /* Initialize CPU frequency, this also starts time. */
+  unsigned long long cpufreq = 0;
+  clocks_set_cpufreq(cpufreq);
+
+  /* Simulation properties */
+  const size_t count = N * N * N;
+  const double boxSize = 1.;
+
+  /* Create some particles */
+  struct gpart* gparts = malloc(count * sizeof(struct gpart));
+  bzero(gparts, count * sizeof(struct gpart));
+  for (size_t i = 0; i < N; ++i) {
+    for (size_t j = 0; j < N; ++j) {
+      for (size_t k = 0; k < N; ++k) {
+
+        struct gpart* gp = &gparts[i * N * N + j * N + k];
+
+        gp->x[0] = i * boxSize / N + boxSize / (2 * N);
+        gp->x[1] = j * boxSize / N + boxSize / (2 * N);
+        gp->x[2] = k * boxSize / N + boxSize / (2 * N);
+
+        gp->mass = 1. / count;
+
+        gp->id_or_neg_offset = i * N * N + j * N + k;
+      }
+    }
+  }
+
+  /* Properties of the mesh */
+  const size_t meshmin[3] = {0, 0, 0};
+  const size_t meshmax[3] = {PMGRID - 1, PMGRID - 1, PMGRID - 1};
+
+  const size_t dimx = meshmax[0] - meshmin[0] + 2;
+  const size_t dimy = meshmax[1] - meshmin[1] + 2;
+  const size_t dimz = meshmax[2] - meshmin[2] + 2;
+
+  const double fac = PMGRID / boxSize;
+  const size_t PMGRID2 = 2 * (PMGRID / 2 + 1);
+
+  /* message("dimx=%zd dimy=%zd dimz=%zd", dimx, dimy, dimz); */
+
+  /* Allocate and empty the workspace mesh */
+  const size_t workspace_size = (dimx + 4) * (dimy + 4) * (dimz + 4);
+  double* workspace = fftw_malloc(workspace_size * sizeof(double));
+  bzero(workspace, workspace_size * sizeof(double));
+
+  /* Do CIC with the particles */
+  for (size_t pid = 0; pid < count; ++pid) {
+
+    const struct gpart* const gp = &gparts[pid];
+
+    const size_t slab_x =
+        (fac * gp->x[0] >= PMGRID) ? PMGRID - 1 : fac * gp->x[0];
+    const size_t slab_y =
+        (fac * gp->x[1] >= PMGRID) ? PMGRID - 1 : fac * gp->x[1];
+    const size_t slab_z =
+        (fac * gp->x[2] >= PMGRID) ? PMGRID - 1 : fac * gp->x[2];
+
+    const double dx = fac * gp->x[0] - (double)slab_x;
+    const double dy = fac * gp->x[1] - (double)slab_y;
+    const double dz = fac * gp->x[2] - (double)slab_z;
+
+    const size_t slab_xx = slab_x + 1;
+    const size_t slab_yy = slab_y + 1;
+    const size_t slab_zz = slab_z + 1;
+
+    workspace[(slab_x * dimy + slab_y) * dimz + slab_z] +=
+        gp->mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
+    workspace[(slab_x * dimy + slab_yy) * dimz + slab_z] +=
+        gp->mass * (1.0 - dx) * dy * (1.0 - dz);
+    workspace[(slab_x * dimy + slab_y) * dimz + slab_zz] +=
+        gp->mass * (1.0 - dx) * (1.0 - dy) * dz;
+    workspace[(slab_x * dimy + slab_yy) * dimz + slab_zz] +=
+        gp->mass * (1.0 - dx) * dy * dz;
+    workspace[(slab_xx * dimy + slab_y) * dimz + slab_z] +=
+        gp->mass * (dx) * (1.0 - dy) * (1.0 - dz);
+    workspace[(slab_xx * dimy + slab_yy) * dimz + slab_z] +=
+        gp->mass * (dx)*dy * (1.0 - dz);
+    workspace[(slab_xx * dimy + slab_y) * dimz + slab_zz] +=
+        gp->mass * (dx) * (1.0 - dy) * dz;
+    workspace[(slab_xx * dimy + slab_yy) * dimz + slab_zz] +=
+        gp->mass * (dx)*dy * dz;
+  }
+
+  /* for(size_t i = 0 ; i < dimx*dimy*dimz; ++i) */
+  /*   message("workspace[%zd] = %f", i, workspace[i]); */
+
+  /* Prepare the force grid */
+  const size_t fft_size = workspace_size;
+  double* forcegrid = fftw_malloc(fft_size * sizeof(double));
+  bzero(forcegrid, fft_size * sizeof(double));
+
+  const size_t sendmin = 0, recvmin = 0;
+  const size_t sendmax = PMGRID, recvmax = PMGRID;
+
+  memcpy(forcegrid, workspace + (sendmin - meshmin[0]) * dimy * dimz,
+         (sendmax - sendmin + 1) * dimy * dimz * sizeof(double));
+
+  /* for (size_t i = 0; i < fft_size; ++i) */
+  /*   if (forcegrid[i] != workspace[i]) error("wrong"); */
+
+  /* Prepare the density grid */
+  double* rhogrid = fftw_malloc(fft_size * sizeof(double));
+  bzero(rhogrid, fft_size * sizeof(double));
+
+  /* Now get the density */
+  for (size_t slab_x = recvmin; slab_x <= recvmax; slab_x++) {
+
+    const size_t slab_xx = slab_x % PMGRID;
+
+    for (size_t slab_y = recvmin; slab_y <= recvmax; slab_y++) {
+
+      const size_t slab_yy = slab_y % PMGRID;
+
+      for (size_t slab_z = recvmin; slab_z <= recvmax; slab_z++) {
+
+        const size_t slab_zz = slab_z % PMGRID;
+
+        rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz] +=
+            forcegrid[((slab_x - recvmin) * dimy + (slab_y - recvmin)) * dimz +
+                      (slab_z - recvmin)];
+      }
+    }
+  }
+
+  /* for (size_t i = 0; i < 640; i++) { */
+  /*   if (rhogrid[i] != workspace[i]) { */
+  /*     message("rhogrid[%zd]= %f workspace[%zd]= %f forcegrid[%zd]= %f", i, */
+  /*             rhogrid[i], i, workspace[i], i, forcegrid[i]); */
+  /*   } */
+  /* } */
+
+  /* FFT of the density field */
+  fftw_complex* fftgrid = fftw_malloc(fft_size * sizeof(fftw_complex));
+  fftw_plan plan_forward = fftw_plan_dft_r2c_3d(PMGRID, PMGRID, PMGRID, rhogrid,
+                                                fftgrid, FFTW_ESTIMATE);
+  fftw_execute(plan_forward);
+
+  for (size_t i = 0; i < 640; i++) {
+    message("workspace[%zd]= %f", i, fftgrid[i][0]);
+  }
+
+  /* Clean-up */
+  fftw_destroy_plan(plan_forward);
+  fftw_free(forcegrid);
+  fftw_free(rhogrid);
+  fftw_free(workspace);
+  free(gparts);
+  return 0;
+}
diff --git a/tests/testKernelGrav.c b/tests/testKernelGrav.c
new file mode 100644
index 0000000000000000000000000000000000000000..42ce6066b605c8cc9ac78a6e565dd59a500112f4
--- /dev/null
+++ b/tests/testKernelGrav.c
@@ -0,0 +1,112 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (C) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *                    James Willis (james.s.willis@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#include "const.h"
+#include "kernel_gravity.h"
+#include "kernel_long_gravity.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#define numPoints (1 << 7)
+
+/**
+ * @brief The Gadget-2 gravity kernel function
+ *
+ * @param r The distance between particles
+ * @param h The cut-off distance of the kernel
+ */
+float gadget(float r, float h) {
+  float fac;
+  const float r2 = r * r;
+  if (r >= h)
+    fac = 1.0f / (r2 * r);
+  else {
+    const float h_inv = 1. / h;
+    const float h_inv3 = h_inv * h_inv * h_inv;
+    const float u = r * h_inv;
+    if (u < 0.5)
+      fac = h_inv3 * (10.666666666667 + u * u * (32.0 * u - 38.4));
+    else
+      fac =
+          h_inv3 * (21.333333333333 - 48.0 * u + 38.4 * u * u -
+                    10.666666666667 * u * u * u - 0.066666666667 / (u * u * u));
+  }
+  return fac;
+}
+
+int main() {
+
+  const float h = 3.f;
+  const float r_max = 6.f;
+
+  for (int k = 1; k < numPoints; ++k) {
+
+    const float r = (r_max * k) / numPoints;
+
+    const float u = r / h;
+
+    const float gadget_w = gadget(r, h);
+
+    float swift_w;
+    if (u < 1.) {
+      kernel_grav_eval(u, &swift_w);
+      swift_w *= (1 / (h * h * h));
+    } else {
+      swift_w = 1 / (r * r * r);
+    }
+
+    printf("%2d: r= %f h= %f u= %f Wg(r,h)= %f Ws(r,h)= %f\n", k, r, h, u,
+           gadget_w, swift_w);
+
+    if (fabsf(gadget_w - swift_w) > 2e-7) {
+      printf("Invalid value ! Gadget= %e, SWIFT= %e\n", gadget_w, swift_w);
+      return 1;
+    }
+  }
+
+  printf("\nAll values are consistent\n");
+
+  /* Now test the long range function */
+  const float a_smooth = const_gravity_a_smooth;
+
+  for (int k = 1; k < numPoints; ++k) {
+
+    const float r = (r_max * k) / numPoints;
+
+    const float u = r / a_smooth;
+
+    float swift_w;
+    kernel_long_grav_eval(u, &swift_w);
+
+    float gadget_w = erfc(u / 2) + u * exp(-u * u / 4) / sqrt(M_PI);
+
+    printf("%2d: r= %f r_lr= %f u= %f Ws(r)= %f Wg(r)= %f\n", k, r, a_smooth, u,
+           swift_w, gadget_w);
+
+    if (fabsf(gadget_w - swift_w) > 2e-7) {
+      printf("Invalid value ! Gadget= %e, SWIFT= %e\n", gadget_w, swift_w);
+      return 1;
+    }
+  }
+
+  return 0;
+}