diff --git a/.gitignore b/.gitignore
index 2a16a4a681b86d30940001fb2ce86dfb9d27a558..4e2f82309eb0dbf56f8eab50d2df14b65865d858 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ examples/*/*/*.hdf5
 examples/*/*/*.png
 examples/*/*/*.txt
 examples/*/*/used_parameters.yml
+examples/*/gravity_checks_*.dat
 
 tests/testPair
 tests/brute_force_standard.dat
@@ -88,6 +89,8 @@ theory/SPH/Flavours/sph_flavours.pdf
 theory/SPH/EoS/eos.pdf
 theory/SPH/*.pdf
 theory/paper_pasc/pasc_paper.pdf
+theory/Multipoles/fmm.pdf
+theory/Multipoles/fmm_standalone.pdf
 
 m4/libtool.m4
 m4/ltoptions.m4
diff --git a/README b/README
index bec538bfc35239945d60487d66ed2e02b5d363a2..6e2b2cb2304aaa1790646de5316b7c08362779bf 100644
--- a/README
+++ b/README
@@ -31,6 +31,7 @@ Valid options are:
   -s          Run with hydrodynamics.
   -S          Run with stars.
   -t    {int} The number of threads to use on each MPI rank. Defaults to 1 if not specified.
+  -T          Print timers every time-step.
   -v     [12] Increase the level of verbosity
   	      1: MPI-rank 0 writes
 	      2: All MPI-ranks write
diff --git a/examples/EAGLE_12/eagle_12.yml b/examples/EAGLE_12/eagle_12.yml
index 67ce7c530263f7905a0d5147832dfc39148d753d..e4eb43672a2e80da2f7ee558cebbebe92a28e269 100644
--- a/examples/EAGLE_12/eagle_12.yml
+++ b/examples/EAGLE_12/eagle_12.yml
@@ -28,7 +28,8 @@ Statistics:
 
 # Parameters for the self-gravity scheme
 Gravity:
-  eta:                   0.025    # Constant dimensionless multiplier for time integration. 
+  eta:                   0.025    # Constant dimensionless multiplier for time integration.
+  theta:                 0.7      # Opening angle (Multipole acceptance criterion)
   epsilon:               0.0001   # Softening length (in internal units).
   a_smooth:              1000.
   r_cut:                 4.
diff --git a/examples/UniformDMBox/uniformBox.yml b/examples/UniformDMBox/uniformBox.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8d9ec300164a7bf8f3df257c34ee44d4f77fe94e
--- /dev/null
+++ b/examples/UniformDMBox/uniformBox.yml
@@ -0,0 +1,38 @@
+# Define the system of units to use internally. 
+InternalUnitSystem:
+  UnitMass_in_cgs:     1   # Grams
+  UnitLength_in_cgs:   1   # Centimeters
+  UnitVelocity_in_cgs: 1   # Centimeters per second
+  UnitCurrent_in_cgs:  1   # Amperes
+  UnitTemp_in_cgs:     1   # Kelvin
+
+# Parameters governing the time integration
+TimeIntegration:
+  time_begin: 0.    # The starting time of the simulation (in internal units).
+  time_end:   1.    # The end time of the simulation (in internal units).
+  dt_min:     1e-6  # The minimal time-step size of the simulation (in internal units).
+  dt_max:     1e-3  # The maximal time-step size of the simulation (in internal units).
+
+Scheduler:
+  max_top_level_cells: 8
+  cell_split_size:     50
+  
+# Parameters governing the snapshots
+Snapshots:
+  basename:            uniformDMBox # Common part of the name of output files
+  time_first:          0.           # Time of the first output (in internal units)
+  delta_time:          0.01         # Time difference between consecutive outputs (in internal units)
+
+# Parameters for the self-gravity scheme
+Gravity:
+  eta:                   0.025    # Constant dimensionless multiplier for time integration. 
+  theta:                 0.7      # Opening angle (Multipole acceptance criterion)
+  epsilon:               0.00001  # Softening length (in internal units).
+  
+# Parameters governing the conserved quantities statistics
+Statistics:
+  delta_time:          1e-2 # Time between statistics output
+
+# Parameters related to the initial conditions
+InitialConditions:
+  file_name:  ./uniformDMBox_100.hdf5     # The file to read
diff --git a/examples/main.c b/examples/main.c
index 9cde1062a5e0d1d190fd3139400f82db5f445545..5f42f3e06ea46d0c3b73363956f470016e5d1498 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -90,6 +90,7 @@ void print_help_message() {
   printf("  %2s %8s %s\n", "-t", "{int}",
          "The number of threads to use on each MPI rank. Defaults to 1 if not "
          "specified.");
+  printf("  %2s %8s %s\n", "-T", "", "Print timers every time-step.");
   printf("  %2s %8s %s\n", "-v", "[12]", "Increase the level of verbosity.");
   printf("  %2s %8s %s\n", "", "", "1: MPI-rank 0 writes ");
   printf("  %2s %8s %s\n", "", "", "2: All MPI-ranks write");
@@ -604,6 +605,7 @@ int main(int argc, char *argv[]) {
     printf("# %6s %14s %14s %10s %10s %10s %16s [%s]\n", "Step", "Time",
            "Time-step", "Updates", "g-Updates", "s-Updates", "Wall-clock time",
            clocks_getunit());
+
     if (with_verbose_timers) {
       printf("timers: ");
       for (int k = 0; k < timer_count; k++) printf("%s\t", timers_names[k]);
@@ -626,6 +628,7 @@ int main(int argc, char *argv[]) {
       for (int k = 0; k < timer_count; k++)
         printf("%.3f\t", clocks_from_ticks(timers[k]));
       printf("\n");
+      timers_reset(0xFFFFFFFFllu);
     }
 
 #ifdef SWIFT_DEBUG_TASKS
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 501863a7047df33592fe77b523fdb211dfb7d16b..14bc60bc1e1c05ecdc66fb7ac828102b1d5748bf 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -52,6 +52,7 @@ SPH:
 # Parameters for the self-gravity scheme
 Gravity:
   eta:                   0.025    # Constant dimensionless multiplier for time integration.
+  theta:                 0.7      # Opening angle (Multipole acceptance criterion)
   epsilon:               0.1      # Softening length (in internal units).
   a_smooth:              1.25     # (Optional) Smoothing scale in top-level cell sizes to smooth the long-range forces over (this is the default value).
   r_cut:                 4.5      # (Optional) Cut-off in number of top-level cells beyond which no FMM forces are computed (this is the default value).
diff --git a/examples/plot_gravity_checks.py b/examples/plot_gravity_checks.py
new file mode 100644
index 0000000000000000000000000000000000000000..41d9d629899f5c34cbb0264b38547a3439c04bf3
--- /dev/null
+++ b/examples/plot_gravity_checks.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python
+
+import sys
+import glob
+import re
+import numpy as np
+import matplotlib.pyplot as plt
+
+params = {'axes.labelsize': 14,
+'axes.titlesize': 18,
+'font.size': 12,
+'legend.fontsize': 12,
+'xtick.labelsize': 14,
+'ytick.labelsize': 14,
+'text.usetex': True,
+'figure.figsize': (10, 10),
+'figure.subplot.left'    : 0.06,
+'figure.subplot.right'   : 0.99  ,
+'figure.subplot.bottom'  : 0.06  ,
+'figure.subplot.top'     : 0.99  ,
+'figure.subplot.wspace'  : 0.14  ,
+'figure.subplot.hspace'  : 0.14  ,
+'lines.markersize' : 6,
+'lines.linewidth' : 3.,
+'text.latex.unicode': True
+}
+plt.rcParams.update(params)
+plt.rc('font',**{'family':'sans-serif','sans-serif':['Times']})
+
+min_error = 1e-7
+max_error = 3e-1
+num_bins = 64
+
+# Construct the bins
+bin_edges = np.linspace(np.log10(min_error), np.log10(max_error), num_bins + 1)
+bin_size = (np.log10(max_error) - np.log10(min_error)) / num_bins
+bins = 0.5*(bin_edges[1:] + bin_edges[:-1])
+bin_edges = 10**bin_edges
+bins = 10**bins
+
+# Colours
+cols = ['#332288', '#88CCEE', '#117733', '#DDCC77', '#CC6677']
+
+# Time-step to plot
+step = int(sys.argv[1])
+
+# Find the files for the different expansion orders
+order_list = glob.glob("gravity_checks_swift_step%d_order*.dat"%step)
+num_order = len(order_list)
+
+# Get the multipole orders
+order = np.zeros(num_order)
+for i in range(num_order):
+    order[i] = int(order_list[i][32])
+order = sorted(order)
+order_list = sorted(order_list)
+
+# Read the exact accelerations first
+data = np.loadtxt('gravity_checks_exact_step%d.dat'%step)
+exact_ids = data[:,0]
+exact_pos = data[:,1:4]
+exact_a = data[:,4:7]
+# Sort stuff
+sort_index = np.argsort(exact_ids)
+exact_ids = exact_ids[sort_index]
+exact_pos = exact_pos[sort_index, :]
+exact_a = exact_a[sort_index, :]        
+exact_a_norm = np.sqrt(exact_a[:,0]**2 + exact_a[:,1]**2 + exact_a[:,2]**2)
+    
+# Start the plot
+plt.figure()
+
+count = 0
+
+# Get the Gadget-2 data if existing
+gadget2_file_list = glob.glob("forcetest_gadget2.txt")
+if len(gadget2_file_list) != 0:
+
+    gadget2_data = np.loadtxt(gadget2_file_list[0])
+    gadget2_ids = gadget2_data[:,0]
+    gadget2_pos = gadget2_data[:,1:4]
+    gadget2_a_exact = gadget2_data[:,4:7]
+    gadget2_a_grav = gadget2_data[:, 7:10]
+
+    # Sort stuff
+    sort_index = np.argsort(gadget2_ids)
+    gadget2_ids = gadget2_ids[sort_index]
+    gadget2_pos = gadget2_pos[sort_index, :]
+    gadget2_a_exact = gadget2_a_exact[sort_index, :]
+    gadget2_a_grav = gadget2_a_grav[sort_index, :]
+
+    # Cross-checks
+    if not np.array_equal(exact_ids, gadget2_ids):
+        print "Comparing different IDs !"
+
+    if np.max(np.abs(exact_pos - gadget2_pos)/np.abs(gadget2_pos)) > 1e-6:
+        print "Comparing different positions ! max difference:"
+        index = np.argmax(exact_pos[:,0]**2 + exact_pos[:,1]**2 + exact_pos[:,2]**2 - gadget2_pos[:,0]**2 - gadget2_pos[:,1]**2 - gadget2_pos[:,2]**2)
+        print "Gadget2 (id=%d):"%gadget2_ids[index], gadget2_pos[index,:], "exact (id=%d):"%exact_ids[index], exact_pos[index,:], "\n"
+
+    if np.max(np.abs(exact_a - gadget2_a_exact) / np.abs(gadget2_a_exact)) > 2e-6:
+        print "Comparing different exact accelerations ! max difference:"
+        index = np.argmax(exact_a[:,0]**2 + exact_a[:,1]**2 + exact_a[:,2]**2 - gadget2_a_exact[:,0]**2 - gadget2_a_exact[:,1]**2 - gadget2_a_exact[:,2]**2)
+        print "a_exact --- Gadget2:", gadget2_a_exact[index,:], "exact:", exact_a[index,:]
+        print "pos ---     Gadget2: (id=%d):"%gadget2_ids[index], gadget2_pos[index,:], "exact (id=%d):"%ids[index], pos[index,:],"\n"
+
+    
+    # Compute the error norm
+    diff = gadget2_a_exact - gadget2_a_grav
+
+    norm_diff = np.sqrt(diff[:,0]**2 + diff[:,1]**2 + diff[:,2]**2)
+    norm_a = np.sqrt(gadget2_a_exact[:,0]**2 + gadget2_a_exact[:,1]**2 + gadget2_a_exact[:,2]**2)
+
+    norm_error = norm_diff / norm_a
+    error_x = abs(diff[:,0]) / norm_a
+    error_y = abs(diff[:,1]) / norm_a
+    error_z = abs(diff[:,2]) / norm_a
+    
+    # Bin the error
+    norm_error_hist,_ = np.histogram(norm_error, bins=bin_edges, density=False) / (np.size(norm_error) * bin_size)
+    error_x_hist,_ = np.histogram(error_x, bins=bin_edges, density=False) / (np.size(norm_error) * bin_size)
+    error_y_hist,_ = np.histogram(error_y, bins=bin_edges, density=False) / (np.size(norm_error) * bin_size)
+    error_z_hist,_ = np.histogram(error_z, bins=bin_edges, density=False) / (np.size(norm_error) * bin_size)
+    
+    norm_median = np.median(norm_error)
+    median_x = np.median(error_x)
+    median_y = np.median(error_y)
+    median_z = np.median(error_z)
+
+    norm_per99 = np.percentile(norm_error,99)
+    per99_x = np.percentile(error_x,99)
+    per99_y = np.percentile(error_y,99)
+    per99_z = np.percentile(error_z,99)
+
+    norm_max = np.max(norm_error)
+    max_x = np.max(error_x)
+    max_y = np.max(error_y)
+    max_z = np.max(error_z)
+
+    print "Gadget-2 ---- "
+    print "Norm: median= %f 99%%= %f max= %f"%(norm_median, norm_per99, norm_max)
+    print "X   : median= %f 99%%= %f max= %f"%(median_x, per99_x, max_x)
+    print "Y   : median= %f 99%%= %f max= %f"%(median_y, per99_y, max_y)
+    print "Z   : median= %f 99%%= %f max= %f"%(median_z, per99_z, max_z)
+    print ""
+
+    plt.subplot(221)    
+    plt.text(min_error * 1.5, 1.55, "$50\\%%\\rightarrow%.4f~~ 99\\%%\\rightarrow%.4f$"%(norm_median, norm_per99), ha="left", va="top", alpha=0.8)
+    plt.semilogx(bins, norm_error_hist, 'k--', label="Gadget-2", alpha=0.8)
+    plt.subplot(222)
+    plt.semilogx(bins, error_x_hist, 'k--', label="Gadget-2", alpha=0.8)
+    plt.text(min_error * 1.5, 1.55, "$50\\%%\\rightarrow%.4f~~ 99\\%%\\rightarrow%.4f$"%(median_x, per99_x), ha="left", va="top", alpha=0.8)
+    plt.subplot(223)    
+    plt.semilogx(bins, error_y_hist, 'k--', label="Gadget-2", alpha=0.8)
+    plt.text(min_error * 1.5, 1.55, "$50\\%%\\rightarrow%.4f~~ 99\\%%\\rightarrow%.4f$"%(median_y, per99_y), ha="left", va="top", alpha=0.8)
+    plt.subplot(224)    
+    plt.semilogx(bins, error_z_hist, 'k--', label="Gadget-2", alpha=0.8)
+    plt.text(min_error * 1.5, 1.55, "$50\\%%\\rightarrow%.4f~~ 99\\%%\\rightarrow%.4f$"%(median_z, per99_z), ha="left", va="top", alpha=0.8)
+    
+    count += 1
+
+
+# Plot the different histograms
+for i in range(num_order):
+    data = np.loadtxt(order_list[i])
+    ids = data[:,0]
+    pos = data[:,1:4]
+    a_grav = data[:, 4:7]
+
+    # Sort stuff
+    sort_index = np.argsort(ids)
+    ids = ids[sort_index]
+    pos = pos[sort_index, :]
+    a_grav = a_grav[sort_index, :]        
+
+    # Cross-checks
+    if not np.array_equal(exact_ids, ids):
+        print "Comparing different IDs !"
+
+    if np.max(np.abs(exact_pos - pos)/np.abs(pos)) > 1e-6:
+        print "Comparing different positions ! max difference:"
+        index = np.argmax(exact_pos[:,0]**2 + exact_pos[:,1]**2 + exact_pos[:,2]**2 - pos[:,0]**2 - pos[:,1]**2 - pos[:,2]**2)
+        print "SWIFT (id=%d):"%ids[index], pos[index,:], "exact (id=%d):"%exact_ids[index], exact_pos[index,:], "\n"
+
+    
+    # Compute the error norm
+    diff = exact_a - a_grav
+
+    norm_diff = np.sqrt(diff[:,0]**2 + diff[:,1]**2 + diff[:,2]**2)
+
+    norm_error = norm_diff / exact_a_norm
+    error_x = abs(diff[:,0]) / exact_a_norm
+    error_y = abs(diff[:,1]) / exact_a_norm
+    error_z = abs(diff[:,2]) / exact_a_norm
+    
+    # Bin the error
+    norm_error_hist,_ = np.histogram(norm_error, bins=bin_edges, density=False) / (np.size(norm_error) * bin_size)
+    error_x_hist,_ = np.histogram(error_x, bins=bin_edges, density=False) / (np.size(norm_error) * bin_size)
+    error_y_hist,_ = np.histogram(error_y, bins=bin_edges, density=False) / (np.size(norm_error) * bin_size)
+    error_z_hist,_ = np.histogram(error_z, bins=bin_edges, density=False) / (np.size(norm_error) * bin_size)
+
+    norm_median = np.median(norm_error)
+    median_x = np.median(error_x)
+    median_y = np.median(error_y)
+    median_z = np.median(error_z)
+
+    norm_per99 = np.percentile(norm_error,99)
+    per99_x = np.percentile(error_x,99)
+    per99_y = np.percentile(error_y,99)
+    per99_z = np.percentile(error_z,99)
+
+    norm_max = np.max(norm_error)
+    max_x = np.max(error_x)
+    max_y = np.max(error_y)
+    max_z = np.max(error_z)
+
+    print "Order %d ---- "%order[i]
+    print "Norm: median= %f 99%%= %f max= %f"%(norm_median, norm_per99, norm_max)
+    print "X   : median= %f 99%%= %f max= %f"%(median_x, per99_x, max_x)
+    print "Y   : median= %f 99%%= %f max= %f"%(median_y, per99_y, max_y)
+    print "Z   : median= %f 99%%= %f max= %f"%(median_z, per99_z, max_z)
+    print ""
+    
+    plt.subplot(221)
+    plt.semilogx(bins, norm_error_hist, color=cols[i],label="SWIFT m-poles order %d"%order[i])
+    plt.text(min_error * 1.5, 1.5 - count/10., "$50\\%%\\rightarrow%.4f~~ 99\\%%\\rightarrow%.4f$"%(norm_median, norm_per99), ha="left", va="top", color=cols[i])
+    plt.subplot(222)    
+    plt.semilogx(bins, error_x_hist, color=cols[i],label="SWIFT m-poles order %d"%order[i])
+    plt.text(min_error * 1.5, 1.5 - count/10., "$50\\%%\\rightarrow%.4f~~ 99\\%%\\rightarrow%.4f$"%(median_x, per99_x), ha="left", va="top", color=cols[i])
+    plt.subplot(223)    
+    plt.semilogx(bins, error_y_hist, color=cols[i],label="SWIFT m-poles order %d"%order[i])
+    plt.text(min_error * 1.5, 1.5 - count/10., "$50\\%%\\rightarrow%.4f~~ 99\\%%\\rightarrow%.4f$"%(median_y, per99_y), ha="left", va="top", color=cols[i])
+    plt.subplot(224)    
+    plt.semilogx(bins, error_z_hist, color=cols[i],label="SWIFT m-poles order %d"%order[i])
+    plt.text(min_error * 1.5, 1.5 - count/10., "$50\\%%\\rightarrow%.4f~~ 99\\%%\\rightarrow%.4f$"%(median_z, per99_z), ha="left", va="top", color=cols[i])
+
+    count += 1
+
+plt.subplot(221)
+plt.xlabel("$|\delta \overrightarrow{a}|/|\overrightarrow{a}_{exact}|$")
+#plt.ylabel("Density")
+plt.xlim(min_error, max_error)
+plt.ylim(0,2.5)
+plt.legend(loc="upper left")
+plt.subplot(222)    
+plt.xlabel("$\delta a_x/|\overrightarrow{a}_{exact}|$")
+#plt.ylabel("Density")
+plt.xlim(min_error, max_error)
+plt.ylim(0,1.75)
+#plt.legend(loc="center left")
+plt.subplot(223)    
+plt.xlabel("$\delta a_y/|\overrightarrow{a}_{exact}|$")
+#plt.ylabel("Density")
+plt.xlim(min_error, max_error)
+plt.ylim(0,1.75)
+#plt.legend(loc="center left")
+plt.subplot(224)    
+plt.xlabel("$\delta a_z/|\overrightarrow{a}_{exact}|$")
+#plt.ylabel("Density")
+plt.xlim(min_error, max_error)
+plt.ylim(0,1.75)
+#plt.legend(loc="center left")
+
+
+
+plt.savefig("gravity_checks_step%d.png"%step, dpi=200)
+plt.savefig("gravity_checks_step%d.pdf"%step, dpi=200)
diff --git a/src/Makefile.am b/src/Makefile.am
index 14e435f663f01d8faa5f12720398b58633300093..1da7a0d955e488c0b96ee209080b5438356a36bc 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -46,7 +46,7 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
     hydro_properties.h riemann.h threadpool.h cooling.h cooling_struct.h sourceterms.h \
     sourceterms_struct.h statistics.h memswap.h cache.h runner_doiact_vec.h profiler.h \
     dump.h logger.h active.h timeline.h xmf.h gravity_properties.h gravity_derivatives.h \
-    vector_power.h hydro_space.h sort_part.h
+    vector_power.h collectgroup.h hydro_space.h sort_part.h
 
 # Common source files
 AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
@@ -57,7 +57,7 @@ AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
     runner_doiact_fft.c threadpool.c cooling.c sourceterms.c \
     statistics.c runner_doiact_vec.c profiler.c dump.c logger.c \
     part_type.c xmf.c gravity_properties.c gravity.c \
-    hydro_space.c
+    collectgroup.c hydro_space.c
 
 # Include files for distribution, not installation.
 nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \
diff --git a/src/cell.c b/src/cell.c
index a9c3cc9cb94c3a06ce92abc3cb7904100840f630..95e8619d0c392f753aaa3c646868af4e360ef138 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -1018,14 +1018,15 @@ void cell_clean_links(struct cell *c, void *data) {
 }
 
 /**
- * @brief Checks that a cell is at the current point in time
+ * @brief Checks that the particles in a cell are at the
+ * current point in time
  *
  * Calls error() if the cell is not at the current time.
  *
  * @param c Cell to act upon
  * @param data The current time on the integer time-line
  */
-void cell_check_drift_point(struct cell *c, void *data) {
+void cell_check_particle_drift_point(struct cell *c, void *data) {
 
 #ifdef SWIFT_DEBUG_CHECKS
 
@@ -1057,6 +1058,34 @@ void cell_check_drift_point(struct cell *c, void *data) {
 #endif
 }
 
+/**
+ * @brief Checks that the multipole of a cell is at the current point in time
+ *
+ * Calls error() if the cell is not at the current time.
+ *
+ * @param c Cell to act upon
+ * @param data The current time on the integer time-line
+ */
+void cell_check_multipole_drift_point(struct cell *c, void *data) {
+
+#ifdef SWIFT_DEBUG_CHECKS
+
+  const integertime_t ti_drift = *(integertime_t *)data;
+
+  /* Only check local cells */
+  if (c->nodeID != engine_rank) return;
+
+  if (c->ti_old_multipole != ti_drift)
+    error(
+        "Cell multipole in an incorrect time-zone! c->ti_old_multipole=%lld "
+        "ti_drift=%lld (depth=%d)",
+        c->ti_old_multipole, ti_drift, c->depth);
+
+#else
+  error("Calling debugging code without debugging flag activated.");
+#endif
+}
+
 /**
  * @brief Resets all the individual cell task counters to 0.
  *
@@ -1129,13 +1158,24 @@ void cell_check_multipole(struct cell *c, void *data) {
 
     /* Now  compare the multipole expansion */
     if (!gravity_multipole_equal(&ma, c->multipole, tolerance)) {
-      message("Multipoles are not equal at depth=%d!", c->depth);
+      message("Multipoles are not equal at depth=%d! tol=%f", c->depth,
+              tolerance);
       message("Correct answer:");
       gravity_multipole_print(&ma.m_pole);
       message("Recursive multipole:");
       gravity_multipole_print(&c->multipole->m_pole);
       error("Aborting");
     }
+
+    /* Check that the upper limit of r_max is good enough */
+    if (!(c->multipole->r_max >= ma.r_max)) {
+      error("Upper-limit r_max=%e too small. Should be >=%e.",
+            c->multipole->r_max, ma.r_max);
+    } else if (c->multipole->r_max * c->multipole->r_max >
+               3. * c->width[0] * c->width[0]) {
+      error("r_max=%e larger than cell diagonal %e.", c->multipole->r_max,
+            sqrt(3. * c->width[0] * c->width[0]));
+    }
   }
 #else
   error("Calling debugging code without debugging flag activated.");
@@ -1529,17 +1569,15 @@ void cell_drift_all_multipoles(struct cell *c, const struct engine *e) {
   /* Check that we are actually going to move forward. */
   if (ti_current < ti_old_multipole) error("Attempt to drift to the past");
 
+  /* Drift the multipole */
+  if (ti_current > ti_old_multipole) gravity_drift(c->multipole, dt);
+
   /* Are we not in a leaf ? */
   if (c->split) {
 
-    /* Loop over the progeny and drift the multipoles. */
+    /* Loop over the progeny and recurse. */
     for (int k = 0; k < 8; k++)
-      if (c->progeny[k] != NULL) cell_drift_particles(c->progeny[k], e);
-
-  } else if (ti_current > ti_old_multipole) {
-
-    /* Drift the multipole */
-    gravity_drift(c->multipole, dt);
+      if (c->progeny[k] != NULL) cell_drift_all_multipoles(c->progeny[k], e);
   }
 
   /* Update the time of the last drift */
diff --git a/src/cell.h b/src/cell.h
index 54a1dde5776c3b6274dc0486135643e76889c225..64b874095e38d9a3919bbcdf00e48572a976a469 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -355,7 +355,8 @@ int cell_are_neighbours(const struct cell *restrict ci,
                         const struct cell *restrict cj);
 void cell_check_multipole(struct cell *c, void *data);
 void cell_clean(struct cell *c);
-void cell_check_drift_point(struct cell *c, void *data);
+void cell_check_particle_drift_point(struct cell *c, void *data);
+void cell_check_multipole_drift_point(struct cell *c, void *data);
 void cell_reset_task_counters(struct cell *c);
 int cell_is_drift_needed(struct cell *c, const struct engine *e);
 int cell_unskip_tasks(struct cell *c, struct scheduler *s);
diff --git a/src/collectgroup.c b/src/collectgroup.c
new file mode 100644
index 0000000000000000000000000000000000000000..0b4ddc405772a45a1e444ef48b65fcb7d37a248f
--- /dev/null
+++ b/src/collectgroup.c
@@ -0,0 +1,197 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2017 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* MPI headers. */
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+/* This object's header. */
+#include "collectgroup.h"
+
+/* Local headers. */
+#include "engine.h"
+#include "error.h"
+
+#ifdef WITH_MPI
+
+/* Local collections for MPI reduces. */
+struct mpicollectgroup1 {
+  size_t updates, g_updates, s_updates;
+  integertime_t ti_end_min;
+  int forcerebuild;
+};
+
+/* Forward declarations. */
+static void mpicollect_create_MPI_type();
+
+/**
+ * @brief MPI datatype for the #mpicollectgroup1 structure.
+ */
+static MPI_Datatype mpicollectgroup1_type;
+
+/**
+ * @brief MPI operator to reduce #mpicollectgroup1 structures.
+ */
+static MPI_Op mpicollectgroup1_reduce_op;
+
+#endif
+
+/**
+ * @brief Perform any once only initialisations. Must be called once.
+ */
+void collectgroup_init() {
+
+#ifdef WITH_MPI
+  /* Initialise the MPI types. */
+  mpicollect_create_MPI_type();
+#endif
+}
+
+/**
+ * @brief Apply the collectgroup1 values to the engine by copying all the
+ * values to the engine fields.
+ *
+ * @param grp1 The #collectgroup1
+ * @param e The #engine
+ */
+void collectgroup1_apply(struct collectgroup1 *grp1, struct engine *e) {
+  e->ti_end_min = grp1->ti_end_min;
+  e->ti_end_max = grp1->ti_end_max;
+  e->ti_beg_max = grp1->ti_beg_max;
+  e->updates = grp1->updates;
+  e->g_updates = grp1->g_updates;
+  e->s_updates = grp1->s_updates;
+  e->forcerebuild = grp1->forcerebuild;
+}
+
+/**
+ * @brief Initialises a collectgroup1 struct ready for processing.
+ *
+ * @param grp1 The #collectgroup1 to initialise
+ * @param updates the number of updated hydro particles on this node this step.
+ * @param g_updates the number of updated gravity particles on this node this
+ * step.
+ * @param s_updates the number of updated star particles on this node this step.
+ * @param ti_end_min the minimum end time for next time step after this step.
+ * @param ti_end_max the maximum end time for next time step after this step.
+ * @param ti_beg_max the maximum begin time for next time step after this step.
+ * @param forcerebuild whether a rebuild is required after this step.
+ */
+void collectgroup1_init(struct collectgroup1 *grp1, size_t updates,
+                        size_t g_updates, size_t s_updates,
+                        integertime_t ti_end_min, integertime_t ti_end_max,
+                        integertime_t ti_beg_max, int forcerebuild) {
+  grp1->updates = updates;
+  grp1->g_updates = g_updates;
+  grp1->s_updates = s_updates;
+  grp1->ti_end_min = ti_end_min;
+  grp1->ti_end_max = ti_end_max;
+  grp1->ti_beg_max = ti_beg_max;
+  grp1->forcerebuild = forcerebuild;
+}
+
+/**
+ * @brief Do any processing necessary to the group before it can be used.
+ *
+ * This may involve an MPI reduction across all nodes.
+ *
+ * @param grp1 the #collectgroup1 struct already initialised by a call
+ *             to collectgroup1_init.
+ */
+void collectgroup1_reduce(struct collectgroup1 *grp1) {
+
+#ifdef WITH_MPI
+
+  /* Populate an MPI group struct and reduce this across all nodes. */
+  struct mpicollectgroup1 mpigrp11;
+  mpigrp11.updates = grp1->updates;
+  mpigrp11.g_updates = grp1->g_updates;
+  mpigrp11.s_updates = grp1->s_updates;
+  mpigrp11.ti_end_min = grp1->ti_end_min;
+  mpigrp11.forcerebuild = grp1->forcerebuild;
+
+  struct mpicollectgroup1 mpigrp12;
+  if (MPI_Allreduce(&mpigrp11, &mpigrp12, 1, mpicollectgroup1_type,
+                    mpicollectgroup1_reduce_op, MPI_COMM_WORLD) != MPI_SUCCESS)
+    error("Failed to reduce mpicollection1.");
+
+  /* And update. */
+  grp1->updates = mpigrp12.updates;
+  grp1->g_updates = mpigrp12.g_updates;
+  grp1->s_updates = mpigrp12.s_updates;
+  grp1->ti_end_min = mpigrp12.ti_end_min;
+  grp1->forcerebuild = mpigrp12.forcerebuild;
+
+#endif
+}
+
+#ifdef WITH_MPI
+/**
+ * @brief Do the reduction of two structs.
+ *
+ * @param mpigrp11 the first struct, this is updated on exit.
+ * @param mpigrp12 the second struct
+ */
+static void doreduce1(struct mpicollectgroup1 *mpigrp11,
+                      const struct mpicollectgroup1 *mpigrp12) {
+
+  /* Do what is needed for each part of the collection. */
+  /* Sum of updates. */
+  mpigrp11->updates += mpigrp12->updates;
+  mpigrp11->g_updates += mpigrp12->g_updates;
+  mpigrp11->s_updates += mpigrp12->s_updates;
+
+  /* Minimum end time. */
+  mpigrp11->ti_end_min = min(mpigrp11->ti_end_min, mpigrp12->ti_end_min);
+
+  /* Everyone must agree to not rebuild. */
+  if (mpigrp11->forcerebuild || mpigrp12->forcerebuild)
+    mpigrp11->forcerebuild = 1;
+}
+
+/**
+ * @brief MPI reduce operator for #mpicollectgroup structures.
+ */
+static void mpicollectgroup1_reduce(void *in, void *inout, int *len,
+                                    MPI_Datatype *datatype) {
+
+  for (int i = 0; i < *len; ++i)
+    doreduce1(&((struct mpicollectgroup1 *)inout)[0],
+              &((const struct mpicollectgroup1 *)in)[i]);
+}
+
+/**
+ * @brief Registers any MPI collection types and reduction functions.
+ */
+static void mpicollect_create_MPI_type() {
+
+  if (MPI_Type_contiguous(sizeof(struct mpicollectgroup1), MPI_BYTE,
+                          &mpicollectgroup1_type) != MPI_SUCCESS ||
+      MPI_Type_commit(&mpicollectgroup1_type) != MPI_SUCCESS) {
+    error("Failed to create MPI type for mpicollection1.");
+  }
+
+  /* Create the reduction operation */
+  MPI_Op_create(mpicollectgroup1_reduce, 1, &mpicollectgroup1_reduce_op);
+}
+#endif
diff --git a/src/collectgroup.h b/src/collectgroup.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0cbf0763d94dec65ea1aae2c521c1c4b9f7bf83
--- /dev/null
+++ b/src/collectgroup.h
@@ -0,0 +1,55 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2017 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_COLLECTGROUP_H
+#define SWIFT_COLLECTGROUP_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Standard headers. */
+#include <stddef.h>
+
+/* Local headers. */
+#include "timeline.h"
+
+/* Forward declaration of engine struct (to avoid cyclic include). */
+struct engine;
+
+/* A collection of global quantities that can be processed at the same time. */
+struct collectgroup1 {
+
+  /* Number of particles updated */
+  size_t updates, g_updates, s_updates;
+
+  /* Times for the time-step */
+  integertime_t ti_end_min, ti_end_max, ti_beg_max;
+
+  /* Force the engine to rebuild? */
+  int forcerebuild;
+};
+
+void collectgroup_init();
+void collectgroup1_apply(struct collectgroup1 *grp1, struct engine *e);
+void collectgroup1_init(struct collectgroup1 *grp1, size_t updates,
+                        size_t g_updates, size_t s_updates,
+                        integertime_t ti_end_min, integertime_t ti_end_max,
+                        integertime_t ti_beg_max, int forcerebuild);
+void collectgroup1_reduce(struct collectgroup1 *grp1);
+
+#endif /* SWIFT_COLLECTGROUP_H */
diff --git a/src/engine.c b/src/engine.c
index 496e1c4467ef2ba5cd857ca339165ec7b6d51af9..c0b2bde3643c3378b040f6fc95e48a388356b325 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -836,7 +836,8 @@ void engine_repartition(struct engine *e) {
   fflush(stdout);
 
   /* Check that all cells have been drifted to the current time */
-  space_check_drift_point(e->s, e->ti_old);
+  space_check_drift_point(e->s, e->ti_old,
+                          e->policy & engine_policy_self_gravity);
 #endif
 
   /* Clear the repartition flag. */
@@ -1635,9 +1636,9 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
  * @brief Constructs the top-level tasks for the short-range gravity
  * interactions.
  *
- * All top-cells get a self task.
- * All neighbouring pairs get a pair task.
- * All non-neighbouring pairs within a range of 6 cells get a M-M task.
+ * - All top-cells get a self task.
+ * - All pairs within range according to the multipole acceptance
+ *   criterion get a pair task.
  *
  * @param e The #engine.
  */
@@ -1646,6 +1647,7 @@ void engine_make_self_gravity_tasks(struct engine *e) {
   struct space *s = e->s;
   struct scheduler *sched = &e->sched;
   const int nodeID = e->nodeID;
+  const double theta_crit_inv = e->gravity_properties->theta_crit_inv;
   struct cell *cells = s->cells_top;
   const int nr_cells = s->nr_cells;
 
@@ -1656,13 +1658,14 @@ void engine_make_self_gravity_tasks(struct engine *e) {
     /* Skip cells without gravity particles */
     if (ci->gcount == 0) continue;
 
-    /* Is that neighbour local ? */
+    /* Is that cell local ? */
     if (ci->nodeID != nodeID) continue;
 
     /* If the cells is local build a self-interaction */
     scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL,
                       0);
 
+    /* Loop over every other cell */
     for (int cjd = cid + 1; cjd < nr_cells; ++cjd) {
 
       struct cell *cj = &cells[cjd];
@@ -1671,9 +1674,11 @@ void engine_make_self_gravity_tasks(struct engine *e) {
       if (cj->gcount == 0) continue;
 
       /* Is that neighbour local ? */
-      if (cj->nodeID != nodeID) continue;
+      if (cj->nodeID != nodeID) continue;  // MATTHIEU
 
-      if (cell_are_neighbours(ci, cj))
+      /* Are the cells to close for a MM interaction ? */
+      if (!gravity_multipole_accept(ci->multipole, cj->multipole,
+                                    theta_crit_inv))
         scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0, 0, ci,
                           cj, 1);
     }
@@ -2711,7 +2716,8 @@ void engine_rebuild(struct engine *e) {
   /* Check that all cells have been drifted to the current time.
    * That can include cells that have not
    * previously been active on this rank. */
-  space_check_drift_point(e->s, e->ti_old);
+  space_check_drift_point(e->s, e->ti_old,
+                          e->policy & engine_policy_self_gravity);
 #endif
 
   if (e->verbose)
@@ -2733,7 +2739,8 @@ void engine_prepare(struct engine *e) {
     /* Check that all cells have been drifted to the current time.
      * That can include cells that have not
      * previously been active on this rank. */
-    space_check_drift_point(e->s, e->ti_old);
+    space_check_drift_point(e->s, e->ti_old,
+                            e->policy & engine_policy_self_gravity);
   }
 #endif
 
@@ -2849,12 +2856,23 @@ void engine_collect_kick(struct cell *c) {
 }
 
 /**
- * @brief Collects the next time-step by making each super-cell recurse
- * to collect the minimal of ti_end and the number of updated particles.
+ * @brief Collects the next time-step and rebuild flag.
+ *
+ * The next time-step is determined by making each super-cell recurse to
+ * collect the minimal of ti_end and the number of updated particles.  When in
+ * MPI mode this routines reduces these across all nodes and also collects the
+ * forcerebuild flag -- this is so that we only use a single collective MPI
+ * call per step for all these values.
+ *
+ * Note that the results are stored in e->collect_group1 struct not in the
+ * engine fields, unless apply is true. These can be applied field-by-field
+ * or all at once using collectgroup1_copy();
  *
  * @param e The #engine.
+ * @param apply whether to apply the results to the engine or just keep in the
+ *              group1 struct.
  */
-void engine_collect_timestep(struct engine *e) {
+void engine_collect_timestep_and_rebuild(struct engine *e, int apply) {
 
   const ticks tic = getticks();
   int updates = 0, g_updates = 0, s_updates = 0;
@@ -2884,37 +2902,59 @@ void engine_collect_timestep(struct engine *e) {
     }
   }
 
-/* Aggregate the data from the different nodes. */
+  /* Store these in the temporary collection group. */
+  collectgroup1_init(&e->collect_group1, updates, g_updates, s_updates,
+                     ti_end_min, ti_end_max, ti_beg_max, e->forcerebuild);
+
+/* Aggregate collective data from the different nodes for this step. */
 #ifdef WITH_MPI
+  collectgroup1_reduce(&e->collect_group1);
+
+#ifdef SWIFT_DEBUG_CHECKS
   {
+    /* Check the above using the original MPI calls. */
     integertime_t in_i[1], out_i[1];
     in_i[0] = 0;
     out_i[0] = ti_end_min;
     if (MPI_Allreduce(out_i, in_i, 1, MPI_LONG_LONG_INT, MPI_MIN,
                       MPI_COMM_WORLD) != MPI_SUCCESS)
-      error("Failed to aggregate t_end_min.");
-    ti_end_min = in_i[0];
-  }
-  {
+      error("Failed to aggregate ti_end_min.");
+    if (in_i[0] != (long long)e->collect_group1.ti_end_min)
+      error("Failed to get same ti_end_min, is %lld, should be %lld", in_i[0],
+            e->collect_group1.ti_end_min);
+
     long long in_ll[3], out_ll[3];
     out_ll[0] = updates;
     out_ll[1] = g_updates;
     out_ll[2] = s_updates;
     if (MPI_Allreduce(out_ll, in_ll, 3, MPI_LONG_LONG_INT, MPI_SUM,
                       MPI_COMM_WORLD) != MPI_SUCCESS)
-      error("Failed to aggregate energies.");
-    updates = in_ll[0];
-    g_updates = in_ll[1];
-    s_updates = in_ll[2];
+      error("Failed to aggregate particle counts.");
+    if (in_ll[0] != (long long)e->collect_group1.updates)
+      error("Failed to get same updates, is %lld, should be %ld", in_ll[0],
+            e->collect_group1.updates);
+    if (in_ll[1] != (long long)e->collect_group1.g_updates)
+      error("Failed to get same g_updates, is %lld, should be %ld", in_ll[1],
+            e->collect_group1.g_updates);
+    if (in_ll[2] != (long long)e->collect_group1.s_updates)
+      error("Failed to get same s_updates, is %lld, should be %ld", in_ll[2],
+            e->collect_group1.s_updates);
+
+    int buff = 0;
+    if (MPI_Allreduce(&e->forcerebuild, &buff, 1, MPI_INT, MPI_MAX,
+                      MPI_COMM_WORLD) != MPI_SUCCESS)
+      error("Failed to aggregate the rebuild flag across nodes.");
+    if (!!buff != !!e->collect_group1.forcerebuild)
+      error(
+          "Failed to get same rebuild flag from all nodes, is %d,"
+          "should be %d",
+          buff, e->collect_group1.forcerebuild);
   }
+#endif
 #endif
 
-  e->ti_end_min = ti_end_min;
-  e->ti_end_max = ti_end_max;
-  e->ti_beg_max = ti_beg_max;
-  e->updates = updates;
-  e->g_updates = g_updates;
-  e->s_updates = s_updates;
+  /* Apply to the engine, if requested. */
+  if (apply) collectgroup1_apply(&e->collect_group1, e);
 
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -2934,7 +2974,8 @@ void engine_print_stats(struct engine *e) {
   /* Check that all cells have been drifted to the current time.
    * That can include cells that have not
    * previously been active on this rank. */
-  space_check_drift_point(e->s, e->ti_current);
+  space_check_drift_point(e->s, e->ti_current,
+                          e->policy & engine_policy_self_gravity);
 
   /* Be verbose about this */
   message("Saving statistics at t=%e.", e->time);
@@ -3165,7 +3206,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
 #endif
 
   /* Recover the (integer) end of the next time-step */
-  engine_collect_timestep(e);
+  engine_collect_timestep_and_rebuild(e, 1);
 
   clocks_gettime(&time2);
 
@@ -3243,7 +3284,10 @@ void engine_step(struct engine *e) {
     for (int i = 0; i < e->s->nr_cells; ++i)
       num_gpart_mpole += e->s->cells_top[i].multipole->m_pole.num_gpart;
     if (num_gpart_mpole != e->s->nr_gparts)
-      error("Multipoles don't contain the total number of gpart");
+      error(
+          "Multipoles don't contain the total number of gpart mpoles=%zd "
+          "ngparts=%zd",
+          num_gpart_mpole, e->s->nr_gparts);
   }
 #endif
 
@@ -3252,6 +3296,9 @@ void engine_step(struct engine *e) {
   gravity_exact_force_compute(e->s, e);
 #endif
 
+  /* Do we need to drift the top-level multipoles ? */
+  if (e->policy & engine_policy_self_gravity) engine_drift_top_multipoles(e);
+
   /* Start all the tasks. */
   TIMER_TIC;
   engine_launch(e, e->nr_threads);
@@ -3262,14 +3309,16 @@ void engine_step(struct engine *e) {
   gravity_exact_force_check(e->s, e, 1e-1);
 #endif
 
-/* Collect the values of rebuild from all nodes. */
-#ifdef WITH_MPI
-  int buff = 0;
-  if (MPI_Allreduce(&e->forcerebuild, &buff, 1, MPI_INT, MPI_MAX,
-                    MPI_COMM_WORLD) != MPI_SUCCESS)
-    error("Failed to aggregate the rebuild flag across nodes.");
-  e->forcerebuild = buff;
-#endif
+  if (!(e->policy & engine_policy_hydro) &&
+      (e->policy & engine_policy_self_gravity) && e->step % 20 == 0)
+    e->forcerebuild = 1;
+
+  /* Collect the values of rebuild from all nodes and recover the (integer)
+   * end of the next time-step. Do these together to reduce the collective MPI
+   * calls per step, but some of the gathered information is not applied just
+   * yet (in case we save a snapshot or drift). */
+  engine_collect_timestep_and_rebuild(e, 0);
+  e->forcerebuild = e->collect_group1.forcerebuild;
 
   /* Save some statistics ? */
   if (e->time - e->timeLastStatistics >= e->deltaTimeStatistics) {
@@ -3305,8 +3354,8 @@ void engine_step(struct engine *e) {
     e->timeLastStatistics += e->deltaTimeStatistics;
   }
 
-  /* Recover the (integer) end of the next time-step */
-  engine_collect_timestep(e);
+  /* Now apply all the collected time step updates and particle counts. */
+  collectgroup1_apply(&e->collect_group1, e);
 
   TIMER_TOC2(timer_step);
 
@@ -3343,8 +3392,8 @@ void engine_unskip(struct engine *e) {
 }
 
 /**
- * @brief Mapper function to drift ALL particle types and multipoles forward in
- * time.
+ * @brief Mapper function to drift *all* particle types and multipoles forward
+ * in time.
  *
  * @param map_data An array of #cell%s.
  * @param num_elements Chunk size.
@@ -3362,7 +3411,7 @@ void engine_do_drift_all_mapper(void *map_data, int num_elements,
       /* Drift all the particles */
       cell_drift_particles(c, e);
 
-      /* Drift the multipole */
+      /* Drift the multipoles */
       if (e->policy & engine_policy_self_gravity)
         cell_drift_all_multipoles(c, e);
     }
@@ -3370,7 +3419,8 @@ void engine_do_drift_all_mapper(void *map_data, int num_elements,
 }
 
 /**
- * @brief Drift *all* particles forward to the current time.
+ * @brief Drift *all* particles and multipoles at all levels
+ * forward to the current time.
  *
  * @param e The #engine.
  */
@@ -3387,7 +3437,54 @@ void engine_drift_all(struct engine *e) {
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Check that all cells have been drifted to the current time. */
-  space_check_drift_point(e->s, e->ti_current);
+  space_check_drift_point(e->s, e->ti_current,
+                          e->policy & engine_policy_self_gravity);
+#endif
+
+  if (e->verbose)
+    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
+            clocks_getunit());
+}
+
+/**
+ * @brief Mapper function to drift *all* top-level multipoles forward in
+ * time.
+ *
+ * @param map_data An array of #cell%s.
+ * @param num_elements Chunk size.
+ * @param extra_data Pointer to an #engine.
+ */
+void engine_do_drift_top_multipoles_mapper(void *map_data, int num_elements,
+                                           void *extra_data) {
+
+  struct engine *e = (struct engine *)extra_data;
+  struct cell *cells = (struct cell *)map_data;
+
+  for (int ind = 0; ind < num_elements; ind++) {
+    struct cell *c = &cells[ind];
+    if (c != NULL && c->nodeID == e->nodeID) {
+
+      /* Drift the multipole at this level only */
+      cell_drift_multipole(c, e);
+    }
+  }
+}
+
+/**
+ * @brief Drift *all* top-level multipoles forward to the current time.
+ *
+ * @param e The #engine.
+ */
+void engine_drift_top_multipoles(struct engine *e) {
+
+  const ticks tic = getticks();
+
+  threadpool_map(&e->threadpool, engine_do_drift_top_multipoles_mapper,
+                 e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 10, e);
+
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that all cells have been drifted to the current time. */
+  space_check_top_multipoles_drift_point(e->s, e->ti_current);
 #endif
 
   if (e->verbose)
@@ -3603,7 +3700,8 @@ void engine_dump_snapshot(struct engine *e) {
   /* Check that all cells have been drifted to the current time.
    * That can include cells that have not
    * previously been active on this rank. */
-  space_check_drift_point(e->s, e->ti_current);
+  space_check_drift_point(e->s, e->ti_current,
+                          e->policy & engine_policy_self_gravity);
 
   /* Be verbose about this */
   message("writing snapshot at t=%e.", e->time);
@@ -4048,6 +4146,9 @@ void engine_init(struct engine *e, struct space *s,
   stats_create_MPI_type();
 #endif
 
+  /* Initialise the collection group. */
+  collectgroup_init();
+
   /* Initialize the threadpool. */
   threadpool_init(&e->threadpool, e->nr_threads);
 
diff --git a/src/engine.h b/src/engine.h
index a0e32ad15b79c364d13d19589f8462ff8705ee29..22ee122bb082895131584942bde50509952b98ff 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -38,6 +38,7 @@
 
 /* Includes. */
 #include "clocks.h"
+#include "collectgroup.h"
 #include "cooling_struct.h"
 #include "gravity_properties.h"
 #include "parser.h"
@@ -243,6 +244,10 @@ struct engine {
 
   /* The (parsed) parameter file */
   const struct swift_params *parameter_file;
+
+  /* Temporary struct to hold a group of deferable properties (in MPI mode
+   * these are reduced together, but may not be required just yet). */
+  struct collectgroup1 collect_group1;
 };
 
 /* Function prototypes. */
@@ -250,6 +255,7 @@ void engine_barrier(struct engine *e, int tid);
 void engine_compute_next_snapshot_time(struct engine *e);
 void engine_unskip(struct engine *e);
 void engine_drift_all(struct engine *e);
+void engine_drift_top_multipoles(struct engine *e);
 void engine_dump_snapshot(struct engine *e);
 void engine_init(struct engine *e, struct space *s,
                  const struct swift_params *params, int nr_nodes, int nodeID,
diff --git a/src/gravity.c b/src/gravity.c
index 369c6b1b0ab0458f7c6ab5057261a7cade97a64c..405e52cc116416c27ec236340a0358443e255384 100644
--- a/src/gravity.c
+++ b/src/gravity.c
@@ -22,6 +22,7 @@
 
 /* Some standard headers. */
 #include <stdio.h>
+#include <unistd.h>
 
 /* This object's header. */
 #include "gravity.h"
@@ -29,6 +30,57 @@
 /* Local headers. */
 #include "active.h"
 #include "error.h"
+#include "version.h"
+
+/**
+ * @brief Checks whether the file containing the exact accelerations for
+ * the current choice of parameters already exists.
+ *
+ * @param e The #engine.
+ */
+int gravity_exact_force_file_exits(const struct engine *e) {
+
+#ifdef SWIFT_GRAVITY_FORCE_CHECKS
+
+  /* File name */
+  char file_name[100];
+  sprintf(file_name, "gravity_checks_exact_step%d.dat", e->step);
+
+  /* Does the file exist ? */
+  if (access(file_name, R_OK | W_OK) == 0) {
+
+    /* Let's check whether the header matches the parameters of this run */
+    FILE *file = fopen(file_name, "r");
+    if (!file) error("Problem reading gravity_check file");
+
+    char line[100];
+    char dummy1[10], dummy2[10];
+    double epsilon, newton_G;
+    int N;
+    /* Reads file header */
+    if (fgets(line, 100, file) != line) error("Problem reading title");
+    if (fgets(line, 100, file) != line) error("Problem reading G");
+    sscanf(line, "%s %s %le", dummy1, dummy2, &newton_G);
+    if (fgets(line, 100, file) != line) error("Problem reading N");
+    sscanf(line, "%s %s %d", dummy1, dummy2, &N);
+    if (fgets(line, 100, file) != line) error("Problem reading epsilon");
+    sscanf(line, "%s %s %le", dummy1, dummy2, &epsilon);
+    fclose(file);
+
+    /* Check whether it matches the current parameters */
+    if (N == SWIFT_GRAVITY_FORCE_CHECKS &&
+        (fabs(epsilon - e->gravity_properties->epsilon) / epsilon < 1e-5) &&
+        (fabs(newton_G - e->physical_constants->const_newton_G) / newton_G <
+         1e-5)) {
+      return 1;
+    }
+  }
+  return 0;
+#else
+  error("Gravity checking function called without the corresponding flag.");
+  return 0;
+#endif
+}
 
 /**
  * @brief Run a brute-force gravity calculation for a subset of particles.
@@ -47,6 +99,13 @@ void gravity_exact_force_compute(struct space *s, const struct engine *e) {
   const double const_G = e->physical_constants->const_newton_G;
   int counter = 0;
 
+  /* Let's start by checking whether we already computed these forces */
+  if (gravity_exact_force_file_exits(e)) {
+    message("Exact accelerations already computed. Skipping calculation.");
+    return;
+  }
+
+  /* No matching file present ? Do it then */
   for (size_t i = 0; i < s->nr_gparts; ++i) {
 
     struct gpart *gpi = &s->gparts[i];
@@ -72,8 +131,8 @@ void gravity_exact_force_compute(struct space *s, const struct engine *e) {
                               gpi->x[2] - gpj->x[2]};  // z
         const double r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
 
-        const double r = sqrtf(r2);
-        const double ir = 1.f / r;
+        const double r = sqrt(r2);
+        const double ir = 1. / r;
         const double mj = gpj->mass;
         const double hi = gpi->epsilon;
         double f;
@@ -86,15 +145,17 @@ void gravity_exact_force_compute(struct space *s, const struct engine *e) {
 
         } else {
 
-          const double hi_inv = 1.f / hi;
+          const double hi_inv = 1. / hi;
           const double hi_inv3 = hi_inv * hi_inv * hi_inv;
           const double ui = r * hi_inv;
-          float W;
+          double W;
 
-          kernel_grav_eval(ui, &W);
+          kernel_grav_eval_double(ui, &W);
 
           /* Get softened gravity */
           f = mj * hi_inv3 * W * f_lr;
+
+          // printf("r=%e hi=%e W=%e fac=%e\n", r, hi, W, f);
         }
 
         const double fdx[3] = {f * dx[0], f * dx[1], f * dx[2]};
@@ -138,25 +199,24 @@ void gravity_exact_force_check(struct space *s, const struct engine *e,
 
 #ifdef SWIFT_GRAVITY_FORCE_CHECKS
 
-  int counter = 0;
-
-  /* Some accumulators */
-  float err_rel_max = 0.f;
-  float err_rel_min = FLT_MAX;
-  float err_rel_mean = 0.f;
-  float err_rel_mean2 = 0.f;
-  float err_rel_std = 0.f;
-
-  char file_name[100];
-  sprintf(file_name, "gravity_checks_step%d_order%d.dat", e->step,
+  /* File name */
+  char file_name_swift[100];
+  sprintf(file_name_swift, "gravity_checks_swift_step%d_order%d.dat", e->step,
           SELF_GRAVITY_MULTIPOLE_ORDER);
-  FILE *file = fopen(file_name, "w");
-  fprintf(file, "# Gravity accuracy test G = %16.8e\n",
-          e->physical_constants->const_newton_G);
-  fprintf(file, "# %16s %16s %16s %16s %16s %16s %16s %16s %16s %16s\n", "id",
-          "pos[0]", "pos[1]", "pos[2]", "a_exact[0]", "a_exact[1]",
-          "a_exact[2]", "a_grav[0]", "a_grav[1]", "a_grav[2]");
 
+  /* Creare files and write header */
+  FILE *file_swift = fopen(file_name_swift, "w");
+  fprintf(file_swift, "# Gravity accuracy test - SWIFT FORCES\n");
+  fprintf(file_swift, "# G= %16.8e\n", e->physical_constants->const_newton_G);
+  fprintf(file_swift, "# N= %d\n", SWIFT_GRAVITY_FORCE_CHECKS);
+  fprintf(file_swift, "# epsilon=%16.8e\n", e->gravity_properties->epsilon);
+  fprintf(file_swift, "# theta=%16.8e\n", e->gravity_properties->theta_crit);
+  fprintf(file_swift, "# Git Branch: %s\n", git_branch());
+  fprintf(file_swift, "# Git Revision: %s\n", git_revision());
+  fprintf(file_swift, "# %16s %16s %16s %16s %16s %16s %16s\n", "id", "pos[0]",
+          "pos[1]", "pos[2]", "a_swift[0]", "a_swift[1]", "a_swift[2]");
+
+  /* Output particle SWIFT accelerations  */
   for (size_t i = 0; i < s->nr_gparts; ++i) {
 
     struct gpart *gpi = &s->gparts[i];
@@ -165,63 +225,53 @@ void gravity_exact_force_check(struct space *s, const struct engine *e,
     if (gpi->id_or_neg_offset % SWIFT_GRAVITY_FORCE_CHECKS == 0 &&
         gpart_is_starting(gpi, e)) {
 
-      fprintf(file,
-              "%18lld %16.8e %16.8e %16.8e %16.8e %16.8e %16.8e %16.8e %16.8e "
-              "%16.8e \n",
+      fprintf(file_swift, "%18lld %16.8e %16.8e %16.8e %16.8e %16.8e %16.8e \n",
               gpi->id_or_neg_offset, gpi->x[0], gpi->x[1], gpi->x[2],
-              gpi->a_grav_exact[0], gpi->a_grav_exact[1], gpi->a_grav_exact[2],
               gpi->a_grav[0], gpi->a_grav[1], gpi->a_grav[2]);
-
-      const float diff[3] = {gpi->a_grav[0] - gpi->a_grav_exact[0],
-                             gpi->a_grav[1] - gpi->a_grav_exact[1],
-                             gpi->a_grav[2] - gpi->a_grav_exact[2]};
-
-      const float diff_norm =
-          sqrtf(diff[0] * diff[0] + diff[1] * diff[1] + diff[2] * diff[2]);
-      const float a_norm = sqrtf(gpi->a_grav_exact[0] * gpi->a_grav_exact[0] +
-                                 gpi->a_grav_exact[1] * gpi->a_grav_exact[1] +
-                                 gpi->a_grav_exact[2] * gpi->a_grav_exact[2]);
-
-      /* Compute relative error */
-      const float err_rel = diff_norm / a_norm;
-
-      /* Check that we are not above tolerance */
-      if (err_rel > rel_tol) {
-        message(
-            "Error too large ! gp->a_grav=[%3.6e %3.6e %3.6e] "
-            "gp->a_exact=[%3.6e %3.6e %3.6e], "
-            "gp->num_interacted=%lld, err=%f",
-            gpi->a_grav[0], gpi->a_grav[1], gpi->a_grav[2],
-            gpi->a_grav_exact[0], gpi->a_grav_exact[1], gpi->a_grav_exact[2],
-            gpi->num_interacted, err_rel);
-
-        continue;
-      }
-
-      /* Construct some statistics */
-      err_rel_max = max(err_rel_max, fabsf(err_rel));
-      err_rel_min = min(err_rel_min, fabsf(err_rel));
-      err_rel_mean += err_rel;
-      err_rel_mean2 += err_rel * err_rel;
-      counter++;
     }
   }
 
+  message("Written SWIFT accelerations in file '%s'.", file_name_swift);
+
   /* Be nice */
-  fclose(file);
+  fclose(file_swift);
 
-  /* Final operation on the stats */
-  if (counter > 0) {
-    err_rel_mean /= counter;
-    err_rel_mean2 /= counter;
-    err_rel_std = sqrtf(err_rel_mean2 - err_rel_mean * err_rel_mean);
-  }
+  if (!gravity_exact_force_file_exits(e)) {
 
-  /* Report on the findings */
-  message("Checked gravity for %d gparts.", counter);
-  message("Error on |a_grav|: min=%e max=%e mean=%e std=%e", err_rel_min,
-          err_rel_max, err_rel_mean, err_rel_std);
+    char file_name_exact[100];
+    sprintf(file_name_exact, "gravity_checks_exact_step%d.dat", e->step);
 
+    FILE *file_exact = fopen(file_name_exact, "w");
+    fprintf(file_exact, "# Gravity accuracy test - EXACT FORCES\n");
+    fprintf(file_exact, "# G= %16.8e\n", e->physical_constants->const_newton_G);
+    fprintf(file_exact, "# N= %d\n", SWIFT_GRAVITY_FORCE_CHECKS);
+    fprintf(file_exact, "# epsilon=%16.8e\n", e->gravity_properties->epsilon);
+    fprintf(file_exact, "# theta=%16.8e\n", e->gravity_properties->theta_crit);
+    fprintf(file_exact, "# %16s %16s %16s %16s %16s %16s %16s\n", "id",
+            "pos[0]", "pos[1]", "pos[2]", "a_exact[0]", "a_exact[1]",
+            "a_exact[2]");
+
+    /* Output particle exact accelerations  */
+    for (size_t i = 0; i < s->nr_gparts; ++i) {
+
+      struct gpart *gpi = &s->gparts[i];
+
+      /* Is the particle was active and part of the subset to be tested ? */
+      if (gpi->id_or_neg_offset % SWIFT_GRAVITY_FORCE_CHECKS == 0 &&
+          gpart_is_starting(gpi, e)) {
+
+        fprintf(
+            file_exact, "%18lld %16.8e %16.8e %16.8e %16.8e %16.8e %16.8e \n",
+            gpi->id_or_neg_offset, gpi->x[0], gpi->x[1], gpi->x[2],
+            gpi->a_grav_exact[0], gpi->a_grav_exact[1], gpi->a_grav_exact[2]);
+      }
+    }
+
+    message("Written exact accelerations in file '%s'.", file_name_exact);
+
+    /* Be nice */
+    fclose(file_exact);
+  }
 #else
   error("Gravity checking function called without the corresponding flag.");
 #endif
diff --git a/src/gravity_properties.c b/src/gravity_properties.c
index 6c1ec1f6d9e80d46278892cc7a2081b2cadad923..f52029fa1543ad1f8d0121c8c4e6d362227f4c53 100644
--- a/src/gravity_properties.c
+++ b/src/gravity_properties.c
@@ -47,8 +47,14 @@ void gravity_props_init(struct gravity_props *p,
   /* Time integration */
   p->eta = parser_get_param_float(params, "Gravity:eta");
 
+  /* Opening angle */
+  p->theta_crit = parser_get_param_double(params, "Gravity:theta");
+  p->theta_crit_inv = 1. / p->theta_crit;
+
   /* Softening lengths */
-  p->epsilon = parser_get_param_float(params, "Gravity:epsilon");
+  p->epsilon = parser_get_param_double(params, "Gravity:epsilon");
+  p->epsilon2 = p->epsilon * p->epsilon;
+  p->epsilon_inv = 1. / p->epsilon;
 }
 
 void gravity_props_print(const struct gravity_props *p) {
@@ -57,7 +63,9 @@ void gravity_props_print(const struct gravity_props *p) {
 
   message("Self-gravity time integration: eta=%.4f", p->eta);
 
-  message("Self-gravity softening: epsilon=%.4f", p->epsilon);
+  message("Self-gravity opening angle:  theta=%.4f", p->theta_crit);
+
+  message("Self-gravity softening:    epsilon=%.4f", p->epsilon);
 
   if (p->a_smooth != gravity_props_default_a_smooth)
     message("Self-gravity smoothing-scale: a_smooth=%f", p->a_smooth);
@@ -71,7 +79,8 @@ void gravity_props_print_snapshot(hid_t h_grpgrav,
                                   const struct gravity_props *p) {
 
   io_write_attribute_f(h_grpgrav, "Time integration eta", p->eta);
-  io_write_attribute_f(h_grpgrav, "Softening", p->epsilon);
+  io_write_attribute_f(h_grpgrav, "Softening length", p->epsilon);
+  io_write_attribute_f(h_grpgrav, "Opening angle", p->theta_crit);
   io_write_attribute_f(h_grpgrav, "MM a_smooth", p->a_smooth);
   io_write_attribute_f(h_grpgrav, "MM r_cut", p->r_cut);
 }
diff --git a/src/gravity_properties.h b/src/gravity_properties.h
index 6fde91b1d8f2d32a27ec278d5e42f91e5fe930a6..be26f0d1d23b8cec71fa3cbbeedac9f61f337b2c 100644
--- a/src/gravity_properties.h
+++ b/src/gravity_properties.h
@@ -38,11 +38,23 @@ struct gravity_props {
   float a_smooth;
   float r_cut;
 
-  /* Time integration parameters */
+  /*! Time integration dimensionless multiplier */
   float eta;
 
-  /* Softening lengths */
-  float epsilon;
+  /*! Tree opening angle (Multipole acceptance criterion) */
+  double theta_crit;
+
+  /*! Inverse of opening angle */
+  double theta_crit_inv;
+
+  /*! Softening length */
+  double epsilon;
+
+  /*! Square of softening length */
+  double epsilon2;
+
+  /*! Inverse of softening length */
+  double epsilon_inv;
 };
 
 void gravity_props_print(const struct gravity_props *p);
diff --git a/src/kernel_gravity.h b/src/kernel_gravity.h
index a9425c7dda8f864a1d6b9c2a658b308c7e7e0624..4a7588d312c381ef60fb97c43f8fadb1e03dfead 100644
--- a/src/kernel_gravity.h
+++ b/src/kernel_gravity.h
@@ -87,4 +87,49 @@ __attribute__((always_inline)) INLINE static void kernel_grav_eval(
   *W = w / (u * u * u);
 }
 
-#endif  // SWIFT_KERNEL_GRAVITY_H
+#ifdef SWIFT_GRAVITY_FORCE_CHECKS
+
+__attribute__((always_inline)) INLINE static void kernel_grav_eval_double(
+    double u, double *const W) {
+
+  static const double kernel_grav_coeffs_double[(kernel_grav_degree + 1) *
+                                                (kernel_grav_ivals + 1)]
+      __attribute__((aligned(16))) = {32.,
+                                      -38.4,
+                                      0.,
+                                      10.66666667,
+                                      0.,
+                                      0.,
+                                      0., /* 0 < u < 0.5 */
+                                      -10.66666667,
+                                      38.4,
+                                      -48.,
+                                      21.3333333,
+                                      0.,
+                                      0.,
+                                      -0.066666667, /* 0.5 < u < 1 */
+                                      0.,
+                                      0.,
+                                      0.,
+                                      0.,
+                                      0.,
+                                      0.,
+                                      0.}; /* 1 < u */
+
+  /* Pick the correct branch of the kernel */
+  const int ind = (int)min(u * (double)kernel_grav_ivals, kernel_grav_ivals);
+  const double *const coeffs =
+      &kernel_grav_coeffs_double[ind * (kernel_grav_degree + 1)];
+
+  /* First two terms of the polynomial ... */
+  double w = coeffs[0] * u + coeffs[1];
+
+  /* ... and the rest of them */
+  for (int k = 2; k <= kernel_grav_degree; k++) w = u * w + coeffs[k];
+
+  /* Return everything */
+  *W = w / (u * u * u);
+}
+#endif /* SWIFT_GRAVITY_FORCE_CHECKS */
+
+#endif /* SWIFT_KERNEL_GRAVITY_H */
diff --git a/src/multipole.h b/src/multipole.h
index 0daef6de24039afe63f529ee53f0afda761bc6c3..a633554490fe6226f8fe42befb7cb4eaf1e5135f 100644
--- a/src/multipole.h
+++ b/src/multipole.h
@@ -33,6 +33,7 @@
 #include "const.h"
 #include "error.h"
 #include "gravity_derivatives.h"
+#include "gravity_properties.h"
 #include "inline.h"
 #include "kernel_gravity.h"
 #include "minmax.h"
@@ -164,20 +165,26 @@ struct multipole {
  */
 struct gravity_tensors {
 
-  /*! Linking pointer for "memory management". */
-  struct gravity_tensors *next;
+  union {
 
-  /*! Centre of mass of the matter dsitribution */
-  double CoM[3];
+    /*! Linking pointer for "memory management". */
+    struct gravity_tensors *next;
 
-  /*! The actual content */
-  struct {
+    /*! The actual content */
+    struct {
 
-    /*! Multipole mass */
-    struct multipole m_pole;
+      /*! Centre of mass of the matter dsitribution */
+      double CoM[3];
 
-    /*! Field tensor for the potential */
-    struct grav_tensor pot;
+      /*! Upper limit of the CoM<->gpart distance */
+      double r_max;
+
+      /*! Multipole mass */
+      struct multipole m_pole;
+
+      /*! Field tensor for the potential */
+      struct grav_tensor pot;
+    };
   };
 } SWIFT_STRUCT_ALIGN;
 
@@ -200,12 +207,25 @@ INLINE static void gravity_reset(struct gravity_tensors *m) {
  */
 INLINE static void gravity_drift(struct gravity_tensors *m, double dt) {
 
+  const double dx = m->m_pole.vel[0] * dt;
+  const double dy = m->m_pole.vel[1] * dt;
+  const double dz = m->m_pole.vel[2] * dt;
+
   /* Move the whole thing according to bulk motion */
-  m->CoM[0] += m->m_pole.vel[0] * dt;
-  m->CoM[1] += m->m_pole.vel[1] * dt;
-  m->CoM[2] += m->m_pole.vel[2] * dt;
+  m->CoM[0] += dx;
+  m->CoM[1] += dy;
+  m->CoM[2] += dz;
+
+  /* Conservative change in maximal radius containing all gpart */
+  /* MATTHIEU: Use gpart->x_diff here ? */
+  m->r_max += sqrt(dx * dx + dy * dy + dz * dz);
 }
 
+/**
+ * @brief Zeroes all the fields of a field tensor
+ *
+ * @param l The field tensor.
+ */
 INLINE static void gravity_field_tensors_init(struct grav_tensor *l) {
 
   bzero(l, sizeof(struct grav_tensor));
@@ -354,6 +374,16 @@ INLINE static void gravity_field_tensors_print(const struct grav_tensor *l) {
 #endif
 }
 
+/**
+ * @brief Zeroes all the fields of a multipole.
+ *
+ * @param m The multipole
+ */
+INLINE static void gravity_multipole_init(struct multipole *m) {
+
+  bzero(m, sizeof(struct multipole));
+}
+
 /**
  * @brief Prints the content of a #multipole to stdout.
  *
@@ -948,9 +978,9 @@ INLINE static void gravity_P2M(struct gravity_tensors *m,
   double com[3] = {0.0, 0.0, 0.0};
   double vel[3] = {0.f, 0.f, 0.f};
 
-  /* Collect the particle data. */
+  /* Collect the particle data for CoM. */
   for (int k = 0; k < gcount; k++) {
-    const float m = gparts[k].mass;
+    const double m = gparts[k].mass;
 
     mass += m;
     com[0] += gparts[k].x[0] * m;
@@ -970,7 +1000,8 @@ INLINE static void gravity_P2M(struct gravity_tensors *m,
   vel[1] *= imass;
   vel[2] *= imass;
 
-/* Prepare some local counters */
+  /* Prepare some local counters */
+  double r_max2 = 0.;
 #if SELF_GRAVITY_MULTIPOLE_ORDER > 0
   double M_100 = 0., M_010 = 0., M_001 = 0.;
 #endif
@@ -1007,11 +1038,15 @@ INLINE static void gravity_P2M(struct gravity_tensors *m,
   /* Construce the higher order terms */
   for (int k = 0; k < gcount; k++) {
 
-#if SELF_GRAVITY_MULTIPOLE_ORDER > 0
-    const double m = gparts[k].mass;
     const double dx[3] = {gparts[k].x[0] - com[0], gparts[k].x[1] - com[1],
                           gparts[k].x[2] - com[2]};
 
+    /* Maximal distance CoM<->gpart */
+    r_max2 = max(r_max2, dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]);
+
+#if SELF_GRAVITY_MULTIPOLE_ORDER > 0
+    const double m = gparts[k].mass;
+
     /* 1st order terms */
     M_100 += -m * X_100(dx);
     M_010 += -m * X_010(dx);
@@ -1096,6 +1131,7 @@ INLINE static void gravity_P2M(struct gravity_tensors *m,
 
   /* Store the data on the multipole. */
   m->m_pole.M_000 = mass;
+  m->r_max = sqrt(r_max2);
   m->CoM[0] = com[0];
   m->CoM[1] = com[1];
   m->CoM[2] = com[2];
@@ -1456,549 +1492,557 @@ INLINE static void gravity_M2M(struct multipole *m_a,
  * @param m_a The multipole creating the field.
  * @param pos_b The position of the field tensor.
  * @param pos_a The position of the multipole.
+ * @param props The #gravity_props of this calculation.
  * @param periodic Is the calculation periodic ?
  */
 INLINE static void gravity_M2L(struct grav_tensor *l_b,
                                const struct multipole *m_a,
                                const double pos_b[3], const double pos_a[3],
+                               const struct gravity_props *props,
                                int periodic) {
 
-  double dx, dy, dz;
-  if (periodic) {
-    dx = box_wrap(pos_b[0] - pos_a[0], 0., 1.);
-    dy = box_wrap(pos_b[1] - pos_a[1], 0., 1.);
-    dz = box_wrap(pos_b[2] - pos_a[2], 0., 1.);
-  } else {
-    dx = pos_b[0] - pos_a[0];
-    dy = pos_b[1] - pos_a[1];
-    dz = pos_b[2] - pos_a[2];
-  }
-  const double r2 = dx * dx + dy * dy + dz * dz;
+  /* Compute distance vector */
+  const double dx =
+      periodic ? box_wrap(pos_b[0] - pos_a[0], 0., 1.) : pos_b[0] - pos_a[0];
+  const double dy =
+      periodic ? box_wrap(pos_b[1] - pos_a[1], 0., 1.) : pos_b[1] - pos_a[1];
+  const double dz =
+      periodic ? box_wrap(pos_b[2] - pos_a[2], 0., 1.) : pos_b[2] - pos_a[2];
 
+  /* Compute distance */
+  const double r2 = dx * dx + dy * dy + dz * dz;
   const double r_inv = 1. / sqrt(r2);
 
-  /*  0th order term */
-  l_b->F_000 += m_a->M_000 * D_000(dx, dy, dz, r_inv);
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Count interactions */
+  l_b->num_interacted += m_a->num_gpart;
+#endif
+
+  /* Un-softened case */
+  if (r2 > props->epsilon2) {
+
+    /*  0th order term */
+    l_b->F_000 += m_a->M_000 * D_000(dx, dy, dz, r_inv);
 
 #if SELF_GRAVITY_MULTIPOLE_ORDER > 0
-  /*  1st order multipole term (addition to rank 0)*/
-  l_b->F_000 += m_a->M_100 * D_100(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_010(dx, dy, dz, r_inv) +
-                m_a->M_001 * D_001(dx, dy, dz, r_inv);
+    /*  1st order multipole term (addition to rank 0)*/
+    l_b->F_000 += m_a->M_100 * D_100(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_010(dx, dy, dz, r_inv) +
+                  m_a->M_001 * D_001(dx, dy, dz, r_inv);
 
-  /*  1st order multipole term (addition to rank 1)*/
-  l_b->F_100 += m_a->M_000 * D_100(dx, dy, dz, r_inv);
-  l_b->F_010 += m_a->M_000 * D_010(dx, dy, dz, r_inv);
-  l_b->F_001 += m_a->M_000 * D_001(dx, dy, dz, r_inv);
+    /*  1st order multipole term (addition to rank 1)*/
+    l_b->F_100 += m_a->M_000 * D_100(dx, dy, dz, r_inv);
+    l_b->F_010 += m_a->M_000 * D_010(dx, dy, dz, r_inv);
+    l_b->F_001 += m_a->M_000 * D_001(dx, dy, dz, r_inv);
 #endif
 #if SELF_GRAVITY_MULTIPOLE_ORDER > 1
 
-  /*  2nd order multipole term (addition to rank 0)*/
-  l_b->F_000 += m_a->M_200 * D_200(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_020(dx, dy, dz, r_inv) +
-                m_a->M_002 * D_002(dx, dy, dz, r_inv);
-  l_b->F_000 += m_a->M_110 * D_110(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_101(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_011(dx, dy, dz, r_inv);
-
-  /*  2nd order multipole term (addition to rank 1)*/
-  l_b->F_100 += m_a->M_100 * D_200(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_110(dx, dy, dz, r_inv) +
-                m_a->M_001 * D_101(dx, dy, dz, r_inv);
-  l_b->F_010 += m_a->M_100 * D_110(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_020(dx, dy, dz, r_inv) +
-                m_a->M_001 * D_011(dx, dy, dz, r_inv);
-  l_b->F_001 += m_a->M_100 * D_101(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_011(dx, dy, dz, r_inv) +
-                m_a->M_001 * D_002(dx, dy, dz, r_inv);
-
-  /*  2nd order multipole term (addition to rank 2)*/
-  l_b->F_200 += m_a->M_000 * D_200(dx, dy, dz, r_inv);
-  l_b->F_020 += m_a->M_000 * D_020(dx, dy, dz, r_inv);
-  l_b->F_002 += m_a->M_000 * D_002(dx, dy, dz, r_inv);
-  l_b->F_110 += m_a->M_000 * D_110(dx, dy, dz, r_inv);
-  l_b->F_101 += m_a->M_000 * D_101(dx, dy, dz, r_inv);
-  l_b->F_011 += m_a->M_000 * D_011(dx, dy, dz, r_inv);
+    /*  2nd order multipole term (addition to rank 0)*/
+    l_b->F_000 += m_a->M_200 * D_200(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_020(dx, dy, dz, r_inv) +
+                  m_a->M_002 * D_002(dx, dy, dz, r_inv);
+    l_b->F_000 += m_a->M_110 * D_110(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_101(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_011(dx, dy, dz, r_inv);
+
+    /*  2nd order multipole term (addition to rank 1)*/
+    l_b->F_100 += m_a->M_100 * D_200(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_110(dx, dy, dz, r_inv) +
+                  m_a->M_001 * D_101(dx, dy, dz, r_inv);
+    l_b->F_010 += m_a->M_100 * D_110(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_020(dx, dy, dz, r_inv) +
+                  m_a->M_001 * D_011(dx, dy, dz, r_inv);
+    l_b->F_001 += m_a->M_100 * D_101(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_011(dx, dy, dz, r_inv) +
+                  m_a->M_001 * D_002(dx, dy, dz, r_inv);
+
+    /*  2nd order multipole term (addition to rank 2)*/
+    l_b->F_200 += m_a->M_000 * D_200(dx, dy, dz, r_inv);
+    l_b->F_020 += m_a->M_000 * D_020(dx, dy, dz, r_inv);
+    l_b->F_002 += m_a->M_000 * D_002(dx, dy, dz, r_inv);
+    l_b->F_110 += m_a->M_000 * D_110(dx, dy, dz, r_inv);
+    l_b->F_101 += m_a->M_000 * D_101(dx, dy, dz, r_inv);
+    l_b->F_011 += m_a->M_000 * D_011(dx, dy, dz, r_inv);
 #endif
 #if SELF_GRAVITY_MULTIPOLE_ORDER > 2
 
-  /*  3rd order multipole term (addition to rank 0)*/
-  l_b->F_000 += m_a->M_300 * D_300(dx, dy, dz, r_inv) +
-                m_a->M_030 * D_030(dx, dy, dz, r_inv) +
-                m_a->M_003 * D_003(dx, dy, dz, r_inv);
-  l_b->F_000 += m_a->M_210 * D_210(dx, dy, dz, r_inv) +
-                m_a->M_201 * D_201(dx, dy, dz, r_inv) +
-                m_a->M_120 * D_120(dx, dy, dz, r_inv);
-  l_b->F_000 += m_a->M_021 * D_021(dx, dy, dz, r_inv) +
-                m_a->M_102 * D_102(dx, dy, dz, r_inv) +
-                m_a->M_012 * D_012(dx, dy, dz, r_inv);
-  l_b->F_000 += m_a->M_111 * D_111(dx, dy, dz, r_inv);
-
-  /*  3rd order multipole term (addition to rank 1)*/
-  l_b->F_100 += m_a->M_200 * D_300(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_120(dx, dy, dz, r_inv) +
-                m_a->M_002 * D_102(dx, dy, dz, r_inv);
-  l_b->F_100 += m_a->M_110 * D_210(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_201(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_111(dx, dy, dz, r_inv);
-  l_b->F_010 += m_a->M_200 * D_210(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_030(dx, dy, dz, r_inv) +
-                m_a->M_002 * D_012(dx, dy, dz, r_inv);
-  l_b->F_010 += m_a->M_110 * D_120(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_111(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_021(dx, dy, dz, r_inv);
-  l_b->F_001 += m_a->M_200 * D_201(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_021(dx, dy, dz, r_inv) +
-                m_a->M_002 * D_003(dx, dy, dz, r_inv);
-  l_b->F_001 += m_a->M_110 * D_111(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_102(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_012(dx, dy, dz, r_inv);
-
-  /*  3rd order multipole term (addition to rank 2)*/
-  l_b->F_200 += m_a->M_100 * D_300(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_210(dx, dy, dz, r_inv) +
-                m_a->M_001 * D_201(dx, dy, dz, r_inv);
-  l_b->F_020 += m_a->M_100 * D_120(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_030(dx, dy, dz, r_inv) +
-                m_a->M_001 * D_021(dx, dy, dz, r_inv);
-  l_b->F_002 += m_a->M_100 * D_102(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_012(dx, dy, dz, r_inv) +
-                m_a->M_001 * D_003(dx, dy, dz, r_inv);
-  l_b->F_110 += m_a->M_100 * D_210(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_120(dx, dy, dz, r_inv) +
-                m_a->M_001 * D_111(dx, dy, dz, r_inv);
-  l_b->F_101 += m_a->M_100 * D_201(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_111(dx, dy, dz, r_inv) +
-                m_a->M_001 * D_102(dx, dy, dz, r_inv);
-  l_b->F_011 += m_a->M_100 * D_111(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_021(dx, dy, dz, r_inv) +
-                m_a->M_001 * D_012(dx, dy, dz, r_inv);
-
-  /*  3rd order multipole term (addition to rank 2)*/
-  l_b->F_300 += m_a->M_000 * D_300(dx, dy, dz, r_inv);
-  l_b->F_030 += m_a->M_000 * D_030(dx, dy, dz, r_inv);
-  l_b->F_003 += m_a->M_000 * D_003(dx, dy, dz, r_inv);
-  l_b->F_210 += m_a->M_000 * D_210(dx, dy, dz, r_inv);
-  l_b->F_201 += m_a->M_000 * D_201(dx, dy, dz, r_inv);
-  l_b->F_120 += m_a->M_000 * D_120(dx, dy, dz, r_inv);
-  l_b->F_021 += m_a->M_000 * D_021(dx, dy, dz, r_inv);
-  l_b->F_102 += m_a->M_000 * D_102(dx, dy, dz, r_inv);
-  l_b->F_012 += m_a->M_000 * D_012(dx, dy, dz, r_inv);
-  l_b->F_111 += m_a->M_000 * D_111(dx, dy, dz, r_inv);
+    /*  3rd order multipole term (addition to rank 0)*/
+    l_b->F_000 += m_a->M_300 * D_300(dx, dy, dz, r_inv) +
+                  m_a->M_030 * D_030(dx, dy, dz, r_inv) +
+                  m_a->M_003 * D_003(dx, dy, dz, r_inv);
+    l_b->F_000 += m_a->M_210 * D_210(dx, dy, dz, r_inv) +
+                  m_a->M_201 * D_201(dx, dy, dz, r_inv) +
+                  m_a->M_120 * D_120(dx, dy, dz, r_inv);
+    l_b->F_000 += m_a->M_021 * D_021(dx, dy, dz, r_inv) +
+                  m_a->M_102 * D_102(dx, dy, dz, r_inv) +
+                  m_a->M_012 * D_012(dx, dy, dz, r_inv);
+    l_b->F_000 += m_a->M_111 * D_111(dx, dy, dz, r_inv);
+
+    /*  3rd order multipole term (addition to rank 1)*/
+    l_b->F_100 += m_a->M_200 * D_300(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_120(dx, dy, dz, r_inv) +
+                  m_a->M_002 * D_102(dx, dy, dz, r_inv);
+    l_b->F_100 += m_a->M_110 * D_210(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_201(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_111(dx, dy, dz, r_inv);
+    l_b->F_010 += m_a->M_200 * D_210(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_030(dx, dy, dz, r_inv) +
+                  m_a->M_002 * D_012(dx, dy, dz, r_inv);
+    l_b->F_010 += m_a->M_110 * D_120(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_111(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_021(dx, dy, dz, r_inv);
+    l_b->F_001 += m_a->M_200 * D_201(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_021(dx, dy, dz, r_inv) +
+                  m_a->M_002 * D_003(dx, dy, dz, r_inv);
+    l_b->F_001 += m_a->M_110 * D_111(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_102(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_012(dx, dy, dz, r_inv);
+
+    /*  3rd order multipole term (addition to rank 2)*/
+    l_b->F_200 += m_a->M_100 * D_300(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_210(dx, dy, dz, r_inv) +
+                  m_a->M_001 * D_201(dx, dy, dz, r_inv);
+    l_b->F_020 += m_a->M_100 * D_120(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_030(dx, dy, dz, r_inv) +
+                  m_a->M_001 * D_021(dx, dy, dz, r_inv);
+    l_b->F_002 += m_a->M_100 * D_102(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_012(dx, dy, dz, r_inv) +
+                  m_a->M_001 * D_003(dx, dy, dz, r_inv);
+    l_b->F_110 += m_a->M_100 * D_210(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_120(dx, dy, dz, r_inv) +
+                  m_a->M_001 * D_111(dx, dy, dz, r_inv);
+    l_b->F_101 += m_a->M_100 * D_201(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_111(dx, dy, dz, r_inv) +
+                  m_a->M_001 * D_102(dx, dy, dz, r_inv);
+    l_b->F_011 += m_a->M_100 * D_111(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_021(dx, dy, dz, r_inv) +
+                  m_a->M_001 * D_012(dx, dy, dz, r_inv);
+
+    /*  3rd order multipole term (addition to rank 2)*/
+    l_b->F_300 += m_a->M_000 * D_300(dx, dy, dz, r_inv);
+    l_b->F_030 += m_a->M_000 * D_030(dx, dy, dz, r_inv);
+    l_b->F_003 += m_a->M_000 * D_003(dx, dy, dz, r_inv);
+    l_b->F_210 += m_a->M_000 * D_210(dx, dy, dz, r_inv);
+    l_b->F_201 += m_a->M_000 * D_201(dx, dy, dz, r_inv);
+    l_b->F_120 += m_a->M_000 * D_120(dx, dy, dz, r_inv);
+    l_b->F_021 += m_a->M_000 * D_021(dx, dy, dz, r_inv);
+    l_b->F_102 += m_a->M_000 * D_102(dx, dy, dz, r_inv);
+    l_b->F_012 += m_a->M_000 * D_012(dx, dy, dz, r_inv);
+    l_b->F_111 += m_a->M_000 * D_111(dx, dy, dz, r_inv);
 #endif
 #if SELF_GRAVITY_MULTIPOLE_ORDER > 3
-  /* Compute 4th order field tensor terms (addition to rank 0) */
-  l_b->F_000 += m_a->M_004 * D_004(dx, dy, dz, r_inv) +
-                m_a->M_013 * D_013(dx, dy, dz, r_inv) +
-                m_a->M_022 * D_022(dx, dy, dz, r_inv) +
-                m_a->M_031 * D_031(dx, dy, dz, r_inv) +
-                m_a->M_040 * D_040(dx, dy, dz, r_inv) +
-                m_a->M_103 * D_103(dx, dy, dz, r_inv) +
-                m_a->M_112 * D_112(dx, dy, dz, r_inv) +
-                m_a->M_121 * D_121(dx, dy, dz, r_inv) +
-                m_a->M_130 * D_130(dx, dy, dz, r_inv) +
-                m_a->M_202 * D_202(dx, dy, dz, r_inv) +
-                m_a->M_211 * D_211(dx, dy, dz, r_inv) +
-                m_a->M_220 * D_220(dx, dy, dz, r_inv) +
-                m_a->M_301 * D_301(dx, dy, dz, r_inv) +
-                m_a->M_310 * D_310(dx, dy, dz, r_inv) +
-                m_a->M_400 * D_400(dx, dy, dz, r_inv);
-
-  /* Compute 4th order field tensor terms (addition to rank 1) */
-  l_b->F_001 += m_a->M_003 * D_004(dx, dy, dz, r_inv) +
-                m_a->M_012 * D_013(dx, dy, dz, r_inv) +
-                m_a->M_021 * D_022(dx, dy, dz, r_inv) +
-                m_a->M_030 * D_031(dx, dy, dz, r_inv) +
-                m_a->M_102 * D_103(dx, dy, dz, r_inv) +
-                m_a->M_111 * D_112(dx, dy, dz, r_inv) +
-                m_a->M_120 * D_121(dx, dy, dz, r_inv) +
-                m_a->M_201 * D_202(dx, dy, dz, r_inv) +
-                m_a->M_210 * D_211(dx, dy, dz, r_inv) +
-                m_a->M_300 * D_301(dx, dy, dz, r_inv);
-  l_b->F_010 += m_a->M_003 * D_013(dx, dy, dz, r_inv) +
-                m_a->M_012 * D_022(dx, dy, dz, r_inv) +
-                m_a->M_021 * D_031(dx, dy, dz, r_inv) +
-                m_a->M_030 * D_040(dx, dy, dz, r_inv) +
-                m_a->M_102 * D_112(dx, dy, dz, r_inv) +
-                m_a->M_111 * D_121(dx, dy, dz, r_inv) +
-                m_a->M_120 * D_130(dx, dy, dz, r_inv) +
-                m_a->M_201 * D_211(dx, dy, dz, r_inv) +
-                m_a->M_210 * D_220(dx, dy, dz, r_inv) +
-                m_a->M_300 * D_310(dx, dy, dz, r_inv);
-  l_b->F_100 += m_a->M_003 * D_103(dx, dy, dz, r_inv) +
-                m_a->M_012 * D_112(dx, dy, dz, r_inv) +
-                m_a->M_021 * D_121(dx, dy, dz, r_inv) +
-                m_a->M_030 * D_130(dx, dy, dz, r_inv) +
-                m_a->M_102 * D_202(dx, dy, dz, r_inv) +
-                m_a->M_111 * D_211(dx, dy, dz, r_inv) +
-                m_a->M_120 * D_220(dx, dy, dz, r_inv) +
-                m_a->M_201 * D_301(dx, dy, dz, r_inv) +
-                m_a->M_210 * D_310(dx, dy, dz, r_inv) +
-                m_a->M_300 * D_400(dx, dy, dz, r_inv);
-
-  /* Compute 4th order field tensor terms (addition to rank 2) */
-  l_b->F_002 += m_a->M_002 * D_004(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_013(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_022(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_103(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_112(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_202(dx, dy, dz, r_inv);
-  l_b->F_011 += m_a->M_002 * D_013(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_022(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_031(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_112(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_121(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_211(dx, dy, dz, r_inv);
-  l_b->F_020 += m_a->M_002 * D_022(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_031(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_040(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_121(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_130(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_220(dx, dy, dz, r_inv);
-  l_b->F_101 += m_a->M_002 * D_103(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_112(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_121(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_202(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_211(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_301(dx, dy, dz, r_inv);
-  l_b->F_110 += m_a->M_002 * D_112(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_121(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_130(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_211(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_220(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_310(dx, dy, dz, r_inv);
-  l_b->F_200 += m_a->M_002 * D_202(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_211(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_220(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_301(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_310(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_400(dx, dy, dz, r_inv);
-
-  /* Compute 4th order field tensor terms (addition to rank 3) */
-  l_b->F_003 += m_a->M_001 * D_004(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_013(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_103(dx, dy, dz, r_inv);
-  l_b->F_012 += m_a->M_001 * D_013(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_022(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_112(dx, dy, dz, r_inv);
-  l_b->F_021 += m_a->M_001 * D_022(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_031(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_121(dx, dy, dz, r_inv);
-  l_b->F_030 += m_a->M_001 * D_031(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_040(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_130(dx, dy, dz, r_inv);
-  l_b->F_102 += m_a->M_001 * D_103(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_112(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_202(dx, dy, dz, r_inv);
-  l_b->F_111 += m_a->M_001 * D_112(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_121(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_211(dx, dy, dz, r_inv);
-  l_b->F_120 += m_a->M_001 * D_121(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_130(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_220(dx, dy, dz, r_inv);
-  l_b->F_201 += m_a->M_001 * D_202(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_211(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_301(dx, dy, dz, r_inv);
-  l_b->F_210 += m_a->M_001 * D_211(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_220(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_310(dx, dy, dz, r_inv);
-  l_b->F_300 += m_a->M_001 * D_301(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_310(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_400(dx, dy, dz, r_inv);
-
-  /* Compute 4th order field tensor terms (addition to rank 4) */
-  l_b->F_004 += m_a->M_000 * D_004(dx, dy, dz, r_inv);
-  l_b->F_013 += m_a->M_000 * D_013(dx, dy, dz, r_inv);
-  l_b->F_022 += m_a->M_000 * D_022(dx, dy, dz, r_inv);
-  l_b->F_031 += m_a->M_000 * D_031(dx, dy, dz, r_inv);
-  l_b->F_040 += m_a->M_000 * D_040(dx, dy, dz, r_inv);
-  l_b->F_103 += m_a->M_000 * D_103(dx, dy, dz, r_inv);
-  l_b->F_112 += m_a->M_000 * D_112(dx, dy, dz, r_inv);
-  l_b->F_121 += m_a->M_000 * D_121(dx, dy, dz, r_inv);
-  l_b->F_130 += m_a->M_000 * D_130(dx, dy, dz, r_inv);
-  l_b->F_202 += m_a->M_000 * D_202(dx, dy, dz, r_inv);
-  l_b->F_211 += m_a->M_000 * D_211(dx, dy, dz, r_inv);
-  l_b->F_220 += m_a->M_000 * D_220(dx, dy, dz, r_inv);
-  l_b->F_301 += m_a->M_000 * D_301(dx, dy, dz, r_inv);
-  l_b->F_310 += m_a->M_000 * D_310(dx, dy, dz, r_inv);
-  l_b->F_400 += m_a->M_000 * D_400(dx, dy, dz, r_inv);
+    /* Compute 4th order field tensor terms (addition to rank 0) */
+    l_b->F_000 += m_a->M_004 * D_004(dx, dy, dz, r_inv) +
+                  m_a->M_013 * D_013(dx, dy, dz, r_inv) +
+                  m_a->M_022 * D_022(dx, dy, dz, r_inv) +
+                  m_a->M_031 * D_031(dx, dy, dz, r_inv) +
+                  m_a->M_040 * D_040(dx, dy, dz, r_inv) +
+                  m_a->M_103 * D_103(dx, dy, dz, r_inv) +
+                  m_a->M_112 * D_112(dx, dy, dz, r_inv) +
+                  m_a->M_121 * D_121(dx, dy, dz, r_inv) +
+                  m_a->M_130 * D_130(dx, dy, dz, r_inv) +
+                  m_a->M_202 * D_202(dx, dy, dz, r_inv) +
+                  m_a->M_211 * D_211(dx, dy, dz, r_inv) +
+                  m_a->M_220 * D_220(dx, dy, dz, r_inv) +
+                  m_a->M_301 * D_301(dx, dy, dz, r_inv) +
+                  m_a->M_310 * D_310(dx, dy, dz, r_inv) +
+                  m_a->M_400 * D_400(dx, dy, dz, r_inv);
+
+    /* Compute 4th order field tensor terms (addition to rank 1) */
+    l_b->F_001 += m_a->M_003 * D_004(dx, dy, dz, r_inv) +
+                  m_a->M_012 * D_013(dx, dy, dz, r_inv) +
+                  m_a->M_021 * D_022(dx, dy, dz, r_inv) +
+                  m_a->M_030 * D_031(dx, dy, dz, r_inv) +
+                  m_a->M_102 * D_103(dx, dy, dz, r_inv) +
+                  m_a->M_111 * D_112(dx, dy, dz, r_inv) +
+                  m_a->M_120 * D_121(dx, dy, dz, r_inv) +
+                  m_a->M_201 * D_202(dx, dy, dz, r_inv) +
+                  m_a->M_210 * D_211(dx, dy, dz, r_inv) +
+                  m_a->M_300 * D_301(dx, dy, dz, r_inv);
+    l_b->F_010 += m_a->M_003 * D_013(dx, dy, dz, r_inv) +
+                  m_a->M_012 * D_022(dx, dy, dz, r_inv) +
+                  m_a->M_021 * D_031(dx, dy, dz, r_inv) +
+                  m_a->M_030 * D_040(dx, dy, dz, r_inv) +
+                  m_a->M_102 * D_112(dx, dy, dz, r_inv) +
+                  m_a->M_111 * D_121(dx, dy, dz, r_inv) +
+                  m_a->M_120 * D_130(dx, dy, dz, r_inv) +
+                  m_a->M_201 * D_211(dx, dy, dz, r_inv) +
+                  m_a->M_210 * D_220(dx, dy, dz, r_inv) +
+                  m_a->M_300 * D_310(dx, dy, dz, r_inv);
+    l_b->F_100 += m_a->M_003 * D_103(dx, dy, dz, r_inv) +
+                  m_a->M_012 * D_112(dx, dy, dz, r_inv) +
+                  m_a->M_021 * D_121(dx, dy, dz, r_inv) +
+                  m_a->M_030 * D_130(dx, dy, dz, r_inv) +
+                  m_a->M_102 * D_202(dx, dy, dz, r_inv) +
+                  m_a->M_111 * D_211(dx, dy, dz, r_inv) +
+                  m_a->M_120 * D_220(dx, dy, dz, r_inv) +
+                  m_a->M_201 * D_301(dx, dy, dz, r_inv) +
+                  m_a->M_210 * D_310(dx, dy, dz, r_inv) +
+                  m_a->M_300 * D_400(dx, dy, dz, r_inv);
+
+    /* Compute 4th order field tensor terms (addition to rank 2) */
+    l_b->F_002 += m_a->M_002 * D_004(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_013(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_022(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_103(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_112(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_202(dx, dy, dz, r_inv);
+    l_b->F_011 += m_a->M_002 * D_013(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_022(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_031(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_112(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_121(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_211(dx, dy, dz, r_inv);
+    l_b->F_020 += m_a->M_002 * D_022(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_031(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_040(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_121(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_130(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_220(dx, dy, dz, r_inv);
+    l_b->F_101 += m_a->M_002 * D_103(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_112(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_121(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_202(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_211(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_301(dx, dy, dz, r_inv);
+    l_b->F_110 += m_a->M_002 * D_112(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_121(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_130(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_211(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_220(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_310(dx, dy, dz, r_inv);
+    l_b->F_200 += m_a->M_002 * D_202(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_211(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_220(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_301(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_310(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_400(dx, dy, dz, r_inv);
+
+    /* Compute 4th order field tensor terms (addition to rank 3) */
+    l_b->F_003 += m_a->M_001 * D_004(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_013(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_103(dx, dy, dz, r_inv);
+    l_b->F_012 += m_a->M_001 * D_013(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_022(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_112(dx, dy, dz, r_inv);
+    l_b->F_021 += m_a->M_001 * D_022(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_031(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_121(dx, dy, dz, r_inv);
+    l_b->F_030 += m_a->M_001 * D_031(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_040(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_130(dx, dy, dz, r_inv);
+    l_b->F_102 += m_a->M_001 * D_103(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_112(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_202(dx, dy, dz, r_inv);
+    l_b->F_111 += m_a->M_001 * D_112(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_121(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_211(dx, dy, dz, r_inv);
+    l_b->F_120 += m_a->M_001 * D_121(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_130(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_220(dx, dy, dz, r_inv);
+    l_b->F_201 += m_a->M_001 * D_202(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_211(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_301(dx, dy, dz, r_inv);
+    l_b->F_210 += m_a->M_001 * D_211(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_220(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_310(dx, dy, dz, r_inv);
+    l_b->F_300 += m_a->M_001 * D_301(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_310(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_400(dx, dy, dz, r_inv);
+
+    /* Compute 4th order field tensor terms (addition to rank 4) */
+    l_b->F_004 += m_a->M_000 * D_004(dx, dy, dz, r_inv);
+    l_b->F_013 += m_a->M_000 * D_013(dx, dy, dz, r_inv);
+    l_b->F_022 += m_a->M_000 * D_022(dx, dy, dz, r_inv);
+    l_b->F_031 += m_a->M_000 * D_031(dx, dy, dz, r_inv);
+    l_b->F_040 += m_a->M_000 * D_040(dx, dy, dz, r_inv);
+    l_b->F_103 += m_a->M_000 * D_103(dx, dy, dz, r_inv);
+    l_b->F_112 += m_a->M_000 * D_112(dx, dy, dz, r_inv);
+    l_b->F_121 += m_a->M_000 * D_121(dx, dy, dz, r_inv);
+    l_b->F_130 += m_a->M_000 * D_130(dx, dy, dz, r_inv);
+    l_b->F_202 += m_a->M_000 * D_202(dx, dy, dz, r_inv);
+    l_b->F_211 += m_a->M_000 * D_211(dx, dy, dz, r_inv);
+    l_b->F_220 += m_a->M_000 * D_220(dx, dy, dz, r_inv);
+    l_b->F_301 += m_a->M_000 * D_301(dx, dy, dz, r_inv);
+    l_b->F_310 += m_a->M_000 * D_310(dx, dy, dz, r_inv);
+    l_b->F_400 += m_a->M_000 * D_400(dx, dy, dz, r_inv);
 
 #endif
 #if SELF_GRAVITY_MULTIPOLE_ORDER > 4
 
-  /* Compute 5th order field tensor terms (addition to rank 0) */
-  l_b->F_000 += m_a->M_005 * D_005(dx, dy, dz, r_inv) +
-                m_a->M_014 * D_014(dx, dy, dz, r_inv) +
-                m_a->M_023 * D_023(dx, dy, dz, r_inv) +
-                m_a->M_032 * D_032(dx, dy, dz, r_inv) +
-                m_a->M_041 * D_041(dx, dy, dz, r_inv) +
-                m_a->M_050 * D_050(dx, dy, dz, r_inv) +
-                m_a->M_104 * D_104(dx, dy, dz, r_inv) +
-                m_a->M_113 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_122 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_131 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_140 * D_140(dx, dy, dz, r_inv) +
-                m_a->M_203 * D_203(dx, dy, dz, r_inv) +
-                m_a->M_212 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_221 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_230 * D_230(dx, dy, dz, r_inv) +
-                m_a->M_302 * D_302(dx, dy, dz, r_inv) +
-                m_a->M_311 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_320 * D_320(dx, dy, dz, r_inv) +
-                m_a->M_401 * D_401(dx, dy, dz, r_inv) +
-                m_a->M_410 * D_410(dx, dy, dz, r_inv) +
-                m_a->M_500 * D_500(dx, dy, dz, r_inv);
-
-  /* Compute 5th order field tensor terms (addition to rank 1) */
-  l_b->F_001 += m_a->M_004 * D_005(dx, dy, dz, r_inv) +
-                m_a->M_013 * D_014(dx, dy, dz, r_inv) +
-                m_a->M_022 * D_023(dx, dy, dz, r_inv) +
-                m_a->M_031 * D_032(dx, dy, dz, r_inv) +
-                m_a->M_040 * D_041(dx, dy, dz, r_inv) +
-                m_a->M_103 * D_104(dx, dy, dz, r_inv) +
-                m_a->M_112 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_121 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_130 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_202 * D_203(dx, dy, dz, r_inv) +
-                m_a->M_211 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_220 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_301 * D_302(dx, dy, dz, r_inv) +
-                m_a->M_310 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_400 * D_401(dx, dy, dz, r_inv);
-  l_b->F_010 += m_a->M_004 * D_014(dx, dy, dz, r_inv) +
-                m_a->M_013 * D_023(dx, dy, dz, r_inv) +
-                m_a->M_022 * D_032(dx, dy, dz, r_inv) +
-                m_a->M_031 * D_041(dx, dy, dz, r_inv) +
-                m_a->M_040 * D_050(dx, dy, dz, r_inv) +
-                m_a->M_103 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_112 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_121 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_130 * D_140(dx, dy, dz, r_inv) +
-                m_a->M_202 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_211 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_220 * D_230(dx, dy, dz, r_inv) +
-                m_a->M_301 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_310 * D_320(dx, dy, dz, r_inv) +
-                m_a->M_400 * D_410(dx, dy, dz, r_inv);
-  l_b->F_100 += m_a->M_004 * D_104(dx, dy, dz, r_inv) +
-                m_a->M_013 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_022 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_031 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_040 * D_140(dx, dy, dz, r_inv) +
-                m_a->M_103 * D_203(dx, dy, dz, r_inv) +
-                m_a->M_112 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_121 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_130 * D_230(dx, dy, dz, r_inv) +
-                m_a->M_202 * D_302(dx, dy, dz, r_inv) +
-                m_a->M_211 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_220 * D_320(dx, dy, dz, r_inv) +
-                m_a->M_301 * D_401(dx, dy, dz, r_inv) +
-                m_a->M_310 * D_410(dx, dy, dz, r_inv) +
-                m_a->M_400 * D_500(dx, dy, dz, r_inv);
-
-  /* Compute 5th order field tensor terms (addition to rank 2) */
-  l_b->F_002 += m_a->M_003 * D_005(dx, dy, dz, r_inv) +
-                m_a->M_012 * D_014(dx, dy, dz, r_inv) +
-                m_a->M_021 * D_023(dx, dy, dz, r_inv) +
-                m_a->M_030 * D_032(dx, dy, dz, r_inv) +
-                m_a->M_102 * D_104(dx, dy, dz, r_inv) +
-                m_a->M_111 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_120 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_201 * D_203(dx, dy, dz, r_inv) +
-                m_a->M_210 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_300 * D_302(dx, dy, dz, r_inv);
-  l_b->F_011 += m_a->M_003 * D_014(dx, dy, dz, r_inv) +
-                m_a->M_012 * D_023(dx, dy, dz, r_inv) +
-                m_a->M_021 * D_032(dx, dy, dz, r_inv) +
-                m_a->M_030 * D_041(dx, dy, dz, r_inv) +
-                m_a->M_102 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_111 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_120 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_201 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_210 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_300 * D_311(dx, dy, dz, r_inv);
-  l_b->F_020 += m_a->M_003 * D_023(dx, dy, dz, r_inv) +
-                m_a->M_012 * D_032(dx, dy, dz, r_inv) +
-                m_a->M_021 * D_041(dx, dy, dz, r_inv) +
-                m_a->M_030 * D_050(dx, dy, dz, r_inv) +
-                m_a->M_102 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_111 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_120 * D_140(dx, dy, dz, r_inv) +
-                m_a->M_201 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_210 * D_230(dx, dy, dz, r_inv) +
-                m_a->M_300 * D_320(dx, dy, dz, r_inv);
-  l_b->F_101 += m_a->M_003 * D_104(dx, dy, dz, r_inv) +
-                m_a->M_012 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_021 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_030 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_102 * D_203(dx, dy, dz, r_inv) +
-                m_a->M_111 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_120 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_201 * D_302(dx, dy, dz, r_inv) +
-                m_a->M_210 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_300 * D_401(dx, dy, dz, r_inv);
-  l_b->F_110 += m_a->M_003 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_012 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_021 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_030 * D_140(dx, dy, dz, r_inv) +
-                m_a->M_102 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_111 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_120 * D_230(dx, dy, dz, r_inv) +
-                m_a->M_201 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_210 * D_320(dx, dy, dz, r_inv) +
-                m_a->M_300 * D_410(dx, dy, dz, r_inv);
-  l_b->F_200 += m_a->M_003 * D_203(dx, dy, dz, r_inv) +
-                m_a->M_012 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_021 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_030 * D_230(dx, dy, dz, r_inv) +
-                m_a->M_102 * D_302(dx, dy, dz, r_inv) +
-                m_a->M_111 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_120 * D_320(dx, dy, dz, r_inv) +
-                m_a->M_201 * D_401(dx, dy, dz, r_inv) +
-                m_a->M_210 * D_410(dx, dy, dz, r_inv) +
-                m_a->M_300 * D_500(dx, dy, dz, r_inv);
-
-  /* Compute 5th order field tensor terms (addition to rank 3) */
-  l_b->F_003 += m_a->M_002 * D_005(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_014(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_023(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_104(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_203(dx, dy, dz, r_inv);
-  l_b->F_012 += m_a->M_002 * D_014(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_023(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_032(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_212(dx, dy, dz, r_inv);
-  l_b->F_021 += m_a->M_002 * D_023(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_032(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_041(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_221(dx, dy, dz, r_inv);
-  l_b->F_030 += m_a->M_002 * D_032(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_041(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_050(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_140(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_230(dx, dy, dz, r_inv);
-  l_b->F_102 += m_a->M_002 * D_104(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_203(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_302(dx, dy, dz, r_inv);
-  l_b->F_111 += m_a->M_002 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_311(dx, dy, dz, r_inv);
-  l_b->F_120 += m_a->M_002 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_140(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_230(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_320(dx, dy, dz, r_inv);
-  l_b->F_201 += m_a->M_002 * D_203(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_302(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_401(dx, dy, dz, r_inv);
-  l_b->F_210 += m_a->M_002 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_230(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_320(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_410(dx, dy, dz, r_inv);
-  l_b->F_300 += m_a->M_002 * D_302(dx, dy, dz, r_inv) +
-                m_a->M_011 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_020 * D_320(dx, dy, dz, r_inv) +
-                m_a->M_101 * D_401(dx, dy, dz, r_inv) +
-                m_a->M_110 * D_410(dx, dy, dz, r_inv) +
-                m_a->M_200 * D_500(dx, dy, dz, r_inv);
-
-  /* Compute 5th order field tensor terms (addition to rank 4) */
-  l_b->F_004 += m_a->M_001 * D_005(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_014(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_104(dx, dy, dz, r_inv);
-  l_b->F_013 += m_a->M_001 * D_014(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_023(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_113(dx, dy, dz, r_inv);
-  l_b->F_022 += m_a->M_001 * D_023(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_032(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_122(dx, dy, dz, r_inv);
-  l_b->F_031 += m_a->M_001 * D_032(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_041(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_131(dx, dy, dz, r_inv);
-  l_b->F_040 += m_a->M_001 * D_041(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_050(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_140(dx, dy, dz, r_inv);
-  l_b->F_103 += m_a->M_001 * D_104(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_203(dx, dy, dz, r_inv);
-  l_b->F_112 += m_a->M_001 * D_113(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_212(dx, dy, dz, r_inv);
-  l_b->F_121 += m_a->M_001 * D_122(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_221(dx, dy, dz, r_inv);
-  l_b->F_130 += m_a->M_001 * D_131(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_140(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_230(dx, dy, dz, r_inv);
-  l_b->F_202 += m_a->M_001 * D_203(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_302(dx, dy, dz, r_inv);
-  l_b->F_211 += m_a->M_001 * D_212(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_311(dx, dy, dz, r_inv);
-  l_b->F_220 += m_a->M_001 * D_221(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_230(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_320(dx, dy, dz, r_inv);
-  l_b->F_301 += m_a->M_001 * D_302(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_401(dx, dy, dz, r_inv);
-  l_b->F_310 += m_a->M_001 * D_311(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_320(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_410(dx, dy, dz, r_inv);
-  l_b->F_400 += m_a->M_001 * D_401(dx, dy, dz, r_inv) +
-                m_a->M_010 * D_410(dx, dy, dz, r_inv) +
-                m_a->M_100 * D_500(dx, dy, dz, r_inv);
-
-  /* Compute 5th order field tensor terms (addition to rank 5) */
-  l_b->F_005 += m_a->M_000 * D_005(dx, dy, dz, r_inv);
-  l_b->F_014 += m_a->M_000 * D_014(dx, dy, dz, r_inv);
-  l_b->F_023 += m_a->M_000 * D_023(dx, dy, dz, r_inv);
-  l_b->F_032 += m_a->M_000 * D_032(dx, dy, dz, r_inv);
-  l_b->F_041 += m_a->M_000 * D_041(dx, dy, dz, r_inv);
-  l_b->F_050 += m_a->M_000 * D_050(dx, dy, dz, r_inv);
-  l_b->F_104 += m_a->M_000 * D_104(dx, dy, dz, r_inv);
-  l_b->F_113 += m_a->M_000 * D_113(dx, dy, dz, r_inv);
-  l_b->F_122 += m_a->M_000 * D_122(dx, dy, dz, r_inv);
-  l_b->F_131 += m_a->M_000 * D_131(dx, dy, dz, r_inv);
-  l_b->F_140 += m_a->M_000 * D_140(dx, dy, dz, r_inv);
-  l_b->F_203 += m_a->M_000 * D_203(dx, dy, dz, r_inv);
-  l_b->F_212 += m_a->M_000 * D_212(dx, dy, dz, r_inv);
-  l_b->F_221 += m_a->M_000 * D_221(dx, dy, dz, r_inv);
-  l_b->F_230 += m_a->M_000 * D_230(dx, dy, dz, r_inv);
-  l_b->F_302 += m_a->M_000 * D_302(dx, dy, dz, r_inv);
-  l_b->F_311 += m_a->M_000 * D_311(dx, dy, dz, r_inv);
-  l_b->F_320 += m_a->M_000 * D_320(dx, dy, dz, r_inv);
-  l_b->F_401 += m_a->M_000 * D_401(dx, dy, dz, r_inv);
-  l_b->F_410 += m_a->M_000 * D_410(dx, dy, dz, r_inv);
-  l_b->F_500 += m_a->M_000 * D_500(dx, dy, dz, r_inv);
+    /* Compute 5th order field tensor terms (addition to rank 0) */
+    l_b->F_000 += m_a->M_005 * D_005(dx, dy, dz, r_inv) +
+                  m_a->M_014 * D_014(dx, dy, dz, r_inv) +
+                  m_a->M_023 * D_023(dx, dy, dz, r_inv) +
+                  m_a->M_032 * D_032(dx, dy, dz, r_inv) +
+                  m_a->M_041 * D_041(dx, dy, dz, r_inv) +
+                  m_a->M_050 * D_050(dx, dy, dz, r_inv) +
+                  m_a->M_104 * D_104(dx, dy, dz, r_inv) +
+                  m_a->M_113 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_122 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_131 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_140 * D_140(dx, dy, dz, r_inv) +
+                  m_a->M_203 * D_203(dx, dy, dz, r_inv) +
+                  m_a->M_212 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_221 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_230 * D_230(dx, dy, dz, r_inv) +
+                  m_a->M_302 * D_302(dx, dy, dz, r_inv) +
+                  m_a->M_311 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_320 * D_320(dx, dy, dz, r_inv) +
+                  m_a->M_401 * D_401(dx, dy, dz, r_inv) +
+                  m_a->M_410 * D_410(dx, dy, dz, r_inv) +
+                  m_a->M_500 * D_500(dx, dy, dz, r_inv);
+
+    /* Compute 5th order field tensor terms (addition to rank 1) */
+    l_b->F_001 += m_a->M_004 * D_005(dx, dy, dz, r_inv) +
+                  m_a->M_013 * D_014(dx, dy, dz, r_inv) +
+                  m_a->M_022 * D_023(dx, dy, dz, r_inv) +
+                  m_a->M_031 * D_032(dx, dy, dz, r_inv) +
+                  m_a->M_040 * D_041(dx, dy, dz, r_inv) +
+                  m_a->M_103 * D_104(dx, dy, dz, r_inv) +
+                  m_a->M_112 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_121 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_130 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_202 * D_203(dx, dy, dz, r_inv) +
+                  m_a->M_211 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_220 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_301 * D_302(dx, dy, dz, r_inv) +
+                  m_a->M_310 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_400 * D_401(dx, dy, dz, r_inv);
+    l_b->F_010 += m_a->M_004 * D_014(dx, dy, dz, r_inv) +
+                  m_a->M_013 * D_023(dx, dy, dz, r_inv) +
+                  m_a->M_022 * D_032(dx, dy, dz, r_inv) +
+                  m_a->M_031 * D_041(dx, dy, dz, r_inv) +
+                  m_a->M_040 * D_050(dx, dy, dz, r_inv) +
+                  m_a->M_103 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_112 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_121 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_130 * D_140(dx, dy, dz, r_inv) +
+                  m_a->M_202 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_211 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_220 * D_230(dx, dy, dz, r_inv) +
+                  m_a->M_301 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_310 * D_320(dx, dy, dz, r_inv) +
+                  m_a->M_400 * D_410(dx, dy, dz, r_inv);
+    l_b->F_100 += m_a->M_004 * D_104(dx, dy, dz, r_inv) +
+                  m_a->M_013 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_022 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_031 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_040 * D_140(dx, dy, dz, r_inv) +
+                  m_a->M_103 * D_203(dx, dy, dz, r_inv) +
+                  m_a->M_112 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_121 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_130 * D_230(dx, dy, dz, r_inv) +
+                  m_a->M_202 * D_302(dx, dy, dz, r_inv) +
+                  m_a->M_211 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_220 * D_320(dx, dy, dz, r_inv) +
+                  m_a->M_301 * D_401(dx, dy, dz, r_inv) +
+                  m_a->M_310 * D_410(dx, dy, dz, r_inv) +
+                  m_a->M_400 * D_500(dx, dy, dz, r_inv);
+
+    /* Compute 5th order field tensor terms (addition to rank 2) */
+    l_b->F_002 += m_a->M_003 * D_005(dx, dy, dz, r_inv) +
+                  m_a->M_012 * D_014(dx, dy, dz, r_inv) +
+                  m_a->M_021 * D_023(dx, dy, dz, r_inv) +
+                  m_a->M_030 * D_032(dx, dy, dz, r_inv) +
+                  m_a->M_102 * D_104(dx, dy, dz, r_inv) +
+                  m_a->M_111 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_120 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_201 * D_203(dx, dy, dz, r_inv) +
+                  m_a->M_210 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_300 * D_302(dx, dy, dz, r_inv);
+    l_b->F_011 += m_a->M_003 * D_014(dx, dy, dz, r_inv) +
+                  m_a->M_012 * D_023(dx, dy, dz, r_inv) +
+                  m_a->M_021 * D_032(dx, dy, dz, r_inv) +
+                  m_a->M_030 * D_041(dx, dy, dz, r_inv) +
+                  m_a->M_102 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_111 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_120 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_201 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_210 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_300 * D_311(dx, dy, dz, r_inv);
+    l_b->F_020 += m_a->M_003 * D_023(dx, dy, dz, r_inv) +
+                  m_a->M_012 * D_032(dx, dy, dz, r_inv) +
+                  m_a->M_021 * D_041(dx, dy, dz, r_inv) +
+                  m_a->M_030 * D_050(dx, dy, dz, r_inv) +
+                  m_a->M_102 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_111 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_120 * D_140(dx, dy, dz, r_inv) +
+                  m_a->M_201 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_210 * D_230(dx, dy, dz, r_inv) +
+                  m_a->M_300 * D_320(dx, dy, dz, r_inv);
+    l_b->F_101 += m_a->M_003 * D_104(dx, dy, dz, r_inv) +
+                  m_a->M_012 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_021 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_030 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_102 * D_203(dx, dy, dz, r_inv) +
+                  m_a->M_111 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_120 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_201 * D_302(dx, dy, dz, r_inv) +
+                  m_a->M_210 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_300 * D_401(dx, dy, dz, r_inv);
+    l_b->F_110 += m_a->M_003 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_012 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_021 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_030 * D_140(dx, dy, dz, r_inv) +
+                  m_a->M_102 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_111 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_120 * D_230(dx, dy, dz, r_inv) +
+                  m_a->M_201 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_210 * D_320(dx, dy, dz, r_inv) +
+                  m_a->M_300 * D_410(dx, dy, dz, r_inv);
+    l_b->F_200 += m_a->M_003 * D_203(dx, dy, dz, r_inv) +
+                  m_a->M_012 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_021 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_030 * D_230(dx, dy, dz, r_inv) +
+                  m_a->M_102 * D_302(dx, dy, dz, r_inv) +
+                  m_a->M_111 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_120 * D_320(dx, dy, dz, r_inv) +
+                  m_a->M_201 * D_401(dx, dy, dz, r_inv) +
+                  m_a->M_210 * D_410(dx, dy, dz, r_inv) +
+                  m_a->M_300 * D_500(dx, dy, dz, r_inv);
+
+    /* Compute 5th order field tensor terms (addition to rank 3) */
+    l_b->F_003 += m_a->M_002 * D_005(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_014(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_023(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_104(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_203(dx, dy, dz, r_inv);
+    l_b->F_012 += m_a->M_002 * D_014(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_023(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_032(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_212(dx, dy, dz, r_inv);
+    l_b->F_021 += m_a->M_002 * D_023(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_032(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_041(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_221(dx, dy, dz, r_inv);
+    l_b->F_030 += m_a->M_002 * D_032(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_041(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_050(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_140(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_230(dx, dy, dz, r_inv);
+    l_b->F_102 += m_a->M_002 * D_104(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_203(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_302(dx, dy, dz, r_inv);
+    l_b->F_111 += m_a->M_002 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_311(dx, dy, dz, r_inv);
+    l_b->F_120 += m_a->M_002 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_140(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_230(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_320(dx, dy, dz, r_inv);
+    l_b->F_201 += m_a->M_002 * D_203(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_302(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_401(dx, dy, dz, r_inv);
+    l_b->F_210 += m_a->M_002 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_230(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_320(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_410(dx, dy, dz, r_inv);
+    l_b->F_300 += m_a->M_002 * D_302(dx, dy, dz, r_inv) +
+                  m_a->M_011 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_020 * D_320(dx, dy, dz, r_inv) +
+                  m_a->M_101 * D_401(dx, dy, dz, r_inv) +
+                  m_a->M_110 * D_410(dx, dy, dz, r_inv) +
+                  m_a->M_200 * D_500(dx, dy, dz, r_inv);
+
+    /* Compute 5th order field tensor terms (addition to rank 4) */
+    l_b->F_004 += m_a->M_001 * D_005(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_014(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_104(dx, dy, dz, r_inv);
+    l_b->F_013 += m_a->M_001 * D_014(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_023(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_113(dx, dy, dz, r_inv);
+    l_b->F_022 += m_a->M_001 * D_023(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_032(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_122(dx, dy, dz, r_inv);
+    l_b->F_031 += m_a->M_001 * D_032(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_041(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_131(dx, dy, dz, r_inv);
+    l_b->F_040 += m_a->M_001 * D_041(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_050(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_140(dx, dy, dz, r_inv);
+    l_b->F_103 += m_a->M_001 * D_104(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_203(dx, dy, dz, r_inv);
+    l_b->F_112 += m_a->M_001 * D_113(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_212(dx, dy, dz, r_inv);
+    l_b->F_121 += m_a->M_001 * D_122(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_221(dx, dy, dz, r_inv);
+    l_b->F_130 += m_a->M_001 * D_131(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_140(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_230(dx, dy, dz, r_inv);
+    l_b->F_202 += m_a->M_001 * D_203(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_302(dx, dy, dz, r_inv);
+    l_b->F_211 += m_a->M_001 * D_212(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_311(dx, dy, dz, r_inv);
+    l_b->F_220 += m_a->M_001 * D_221(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_230(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_320(dx, dy, dz, r_inv);
+    l_b->F_301 += m_a->M_001 * D_302(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_401(dx, dy, dz, r_inv);
+    l_b->F_310 += m_a->M_001 * D_311(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_320(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_410(dx, dy, dz, r_inv);
+    l_b->F_400 += m_a->M_001 * D_401(dx, dy, dz, r_inv) +
+                  m_a->M_010 * D_410(dx, dy, dz, r_inv) +
+                  m_a->M_100 * D_500(dx, dy, dz, r_inv);
+
+    /* Compute 5th order field tensor terms (addition to rank 5) */
+    l_b->F_005 += m_a->M_000 * D_005(dx, dy, dz, r_inv);
+    l_b->F_014 += m_a->M_000 * D_014(dx, dy, dz, r_inv);
+    l_b->F_023 += m_a->M_000 * D_023(dx, dy, dz, r_inv);
+    l_b->F_032 += m_a->M_000 * D_032(dx, dy, dz, r_inv);
+    l_b->F_041 += m_a->M_000 * D_041(dx, dy, dz, r_inv);
+    l_b->F_050 += m_a->M_000 * D_050(dx, dy, dz, r_inv);
+    l_b->F_104 += m_a->M_000 * D_104(dx, dy, dz, r_inv);
+    l_b->F_113 += m_a->M_000 * D_113(dx, dy, dz, r_inv);
+    l_b->F_122 += m_a->M_000 * D_122(dx, dy, dz, r_inv);
+    l_b->F_131 += m_a->M_000 * D_131(dx, dy, dz, r_inv);
+    l_b->F_140 += m_a->M_000 * D_140(dx, dy, dz, r_inv);
+    l_b->F_203 += m_a->M_000 * D_203(dx, dy, dz, r_inv);
+    l_b->F_212 += m_a->M_000 * D_212(dx, dy, dz, r_inv);
+    l_b->F_221 += m_a->M_000 * D_221(dx, dy, dz, r_inv);
+    l_b->F_230 += m_a->M_000 * D_230(dx, dy, dz, r_inv);
+    l_b->F_302 += m_a->M_000 * D_302(dx, dy, dz, r_inv);
+    l_b->F_311 += m_a->M_000 * D_311(dx, dy, dz, r_inv);
+    l_b->F_320 += m_a->M_000 * D_320(dx, dy, dz, r_inv);
+    l_b->F_401 += m_a->M_000 * D_401(dx, dy, dz, r_inv);
+    l_b->F_410 += m_a->M_000 * D_410(dx, dy, dz, r_inv);
+    l_b->F_500 += m_a->M_000 * D_500(dx, dy, dz, r_inv);
 
 #endif
 #if SELF_GRAVITY_MULTIPOLE_ORDER > 5
 #error "Missing implementation for order >5"
 #endif
 
-#ifdef SWIFT_DEBUG_CHECKS
-
-  l_b->num_interacted += m_a->num_gpart;
-#endif
+    /* Softened case */
+  } else {
+    message("Un-supported softened M2L interaction.");
+  }
 }
 
 /**
@@ -2470,4 +2514,29 @@ INLINE static void gravity_L2P(const struct grav_tensor *lb,
   gp->a_grav[2] += a_grav[2];
 }
 
+/**
+ * @brief Checks whether a cell-cell interaction can be appromixated by a M-M
+ * interaction.
+ *
+ * @param ma The #multipole of the first #cell.
+ * @param mb The #multipole of the second #cell.
+ * @param theta_crit_inv The inverse of the critical opening angle.
+ */
+__attribute__((always_inline)) INLINE static int gravity_multipole_accept(
+    const struct gravity_tensors *ma, const struct gravity_tensors *mb,
+    double theta_crit_inv) {
+
+  const double r_crit_a = ma->r_max * theta_crit_inv;
+  const double r_crit_b = mb->r_max * theta_crit_inv;
+
+  const double dx = ma->CoM[0] - mb->CoM[0];
+  const double dy = ma->CoM[1] - mb->CoM[1];
+  const double dz = ma->CoM[2] - mb->CoM[2];
+
+  const double r2 = dx * dx + dy * dy + dz * dz;
+
+  /* Multipole acceptance criterion (Dehnen 2002, eq.10) */
+  return (r2 > (r_crit_a + r_crit_b) * (r_crit_a + r_crit_b));
+}
+
 #endif /* SWIFT_MULTIPOLE_H */
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index f938fc54d4f62a2e6360d85ed24ee6b659220c03..d41470f68d9a1050edd44c0dd46ba077c5dfa327 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -2028,6 +2028,7 @@ void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
 
   /* Should we even bother? */
   if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
+  if (ci->count == 0 || cj->count == 0) return;
 
   /* Get the cell dimensions. */
   const float h = min(ci->width[0], min(ci->width[1], ci->width[2]));
@@ -2275,7 +2276,7 @@ void DOSUB_SELF1(struct runner *r, struct cell *ci, int gettimer) {
   TIMER_TIC;
 
   /* Should we even bother? */
-  if (!cell_is_active(ci, r->e)) return;
+  if (ci->count == 0 || !cell_is_active(ci, r->e)) return;
 
   /* Recurse? */
   if (ci->split) {
@@ -2329,6 +2330,7 @@ void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
 
   /* Should we even bother? */
   if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
+  if (ci->count == 0 || cj->count == 0) return;
 
   /* Get the cell dimensions. */
   const float h = min(ci->width[0], min(ci->width[1], ci->width[2]));
@@ -2576,7 +2578,7 @@ void DOSUB_SELF2(struct runner *r, struct cell *ci, int gettimer) {
   TIMER_TIC;
 
   /* Should we even bother? */
-  if (!cell_is_active(ci, r->e)) return;
+  if (ci->count == 0 || !cell_is_active(ci, r->e)) return;
 
   /* Recurse? */
   if (ci->split) {
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index edd368cf8eb48e18119aaa184a8bb29336016483..065e99dba8ee08a6a4fb91245a9d53ffdc7caccc 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -23,6 +23,7 @@
 /* Includes. */
 #include "cell.h"
 #include "gravity.h"
+#include "inline.h"
 #include "part.h"
 
 /**
@@ -93,6 +94,7 @@ void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci,
                            struct cell *restrict cj) {
 
   const struct engine *e = r->e;
+  const struct gravity_props *props = e->gravity_properties;
   const int periodic = e->s->periodic;
   const struct multipole *multi_j = &cj->multipole->m_pole;
   // const float a_smooth = e->gravity_properties->a_smooth;
@@ -114,7 +116,7 @@ void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci,
 
   /* Let's interact at this level */
   gravity_M2L(&ci->multipole->pot, multi_j, ci->multipole->CoM,
-              cj->multipole->CoM, periodic * 0);
+              cj->multipole->CoM, props, periodic * 0);
 
   TIMER_TOC(timer_dopair_grav_mm);
 }
@@ -156,12 +158,6 @@ void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) {
 
   TIMER_TIC;
 
-#ifdef SWIFT_DEBUG_CHECKS
-  if (ci->width[0] != cj->width[0])
-    error("Non matching cell sizes !! h_i=%f h_j=%f", ci->width[0],
-          cj->width[0]);
-#endif
-
   /* Anything to do here? */
   if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
 
@@ -189,6 +185,8 @@ void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) {
   }
 #endif
 
+  /* MATTHIEU: Should we use local DP accumulators ? */
+
   /* Loop over all particles in ci... */
   for (int pid = 0; pid < gcount_i; pid++) {
 
@@ -288,6 +286,8 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) {
   }
 #endif
 
+  /* MATTHIEU: Should we use local DP accumulators ? */
+
   /* Loop over all particles in ci... */
   for (int pid = 0; pid < gcount; pid++) {
 
@@ -358,6 +358,11 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) {
 void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
                         int gettimer) {
 
+  /* Some constants */
+  const struct engine *e = r->e;
+  const struct gravity_props *props = e->gravity_properties;
+  const double theta_crit_inv = props->theta_crit_inv;
+
 #ifdef SWIFT_DEBUG_CHECKS
 
   const int gcount_i = ci->gcount;
@@ -367,26 +372,8 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
   if (gcount_i == 0 || gcount_j == 0)
     error("Doing pair gravity on an empty cell !");
 
-  /* Bad stuff will happen if cell sizes are different */
-  if (ci->width[0] != cj->width[0])
-    error("Non matching cell sizes !! h_i=%f h_j=%f", ci->width[0],
-          cj->width[0]);
-
   /* Sanity check */
-  if (ci == cj)
-    error(
-        "The impossible has happened: pair interaction between a cell and "
-        "itself.");
-
-  /* Are the cells direct neighbours? */
-  if (!cell_are_neighbours(ci, cj))
-    error(
-        "Non-neighbouring cells ! ci->x=[%f %f %f] ci->width=%f cj->loc=[%f %f "
-        "%f] "
-        "cj->width=%f",
-        ci->loc[0], ci->loc[1], ci->loc[2], ci->width[0], cj->loc[0],
-        cj->loc[1], cj->loc[2], cj->width[0]);
-
+  if (ci == cj) error("Pair interaction between a cell and itself.");
 #endif
 
 #if ICHECK > 0
@@ -415,34 +402,70 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
 
   TIMER_TIC;
 
-  /* Are both cells split ? */
-  if (ci->split && cj->split) {
+  /* Can we use M-M interactions ? */
+  if (gravity_multipole_accept(ci->multipole, cj->multipole, theta_crit_inv)) {
+    /* MATTHIEU: make a symmetric M-M interaction function ! */
+    runner_dopair_grav_mm(r, ci, cj);
+    runner_dopair_grav_mm(r, cj, ci);
+  }
+  /* We have two leaves. Go P-P. */
+  else if (!ci->split && !cj->split) {
+    runner_dopair_grav_pp(r, ci, cj);
+  }
+  /* Alright, we'll have to split and recurse. */
+  else {
+
+    const double ri_max = ci->multipole->r_max;
+    const double rj_max = cj->multipole->r_max;
 
-    for (int j = 0; j < 8; j++) {
-      if (ci->progeny[j] != NULL) {
+    /* Split the larger of the two cells and start over again */
+    if (ri_max > rj_max) {
+
+      /* Can we actually split that interaction ? */
+      if (ci->split) {
 
+        /* Loop over ci's children */
         for (int k = 0; k < 8; k++) {
-          if (cj->progeny[k] != NULL) {
+          if (ci->progeny[k] != NULL)
+            runner_dopair_grav(r, ci->progeny[k], cj, 0);
+        }
 
-            if (cell_are_neighbours(ci->progeny[j], cj->progeny[k])) {
+      } else if (cj->split) {
+        /* MATTHIEU: This could maybe be replaced by P-M interactions ?  */
 
-              /* Recurse */
-              runner_dopair_grav(r, ci->progeny[j], cj->progeny[k], 0);
+        /* Loop over cj's children */
+        for (int k = 0; k < 8; k++) {
+          if (cj->progeny[k] != NULL)
+            runner_dopair_grav(r, ci, cj->progeny[k], 0);
+        }
 
-            } else {
+      } else {
+        error("Fundamental error in the logic");
+      }
+    } else {
 
-              /* Ok, here we can go for multipole-multipole interactions */
-              runner_dopair_grav_mm(r, ci->progeny[j], cj->progeny[k]);
-              runner_dopair_grav_mm(r, cj->progeny[k], ci->progeny[j]);
-            }
-          }
+      /* Can we actually split that interaction ? */
+      if (cj->split) {
+
+        /* Loop over cj's children */
+        for (int k = 0; k < 8; k++) {
+          if (cj->progeny[k] != NULL)
+            runner_dopair_grav(r, ci, cj->progeny[k], 0);
+        }
+
+      } else if (ci->split) {
+        /* MATTHIEU: This could maybe be replaced by P-M interactions ?  */
+
+        /* Loop over ci's children */
+        for (int k = 0; k < 8; k++) {
+          if (ci->progeny[k] != NULL)
+            runner_dopair_grav(r, ci->progeny[k], cj, 0);
         }
+
+      } else {
+        error("Fundamental error in the logic");
       }
     }
-  } else { /* Not split */
-
-    /* Compute the interactions at this level directly. */
-    runner_dopair_grav_pp(r, ci, cj);
   }
 
   if (gettimer) TIMER_TOC(timer_dosub_pair_grav);
@@ -460,7 +483,6 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
 void runner_doself_grav(struct runner *r, struct cell *c, int gettimer) {
 
 #ifdef SWIFT_DEBUG_CHECKS
-
   /* Early abort? */
   if (c->gcount == 0) error("Doing self gravity on an empty cell !");
 #endif
@@ -514,6 +536,14 @@ void runner_dosub_grav(struct runner *r, struct cell *ci, struct cell *cj,
   }
 }
 
+/**
+ * @brief Performs all M-M interactions between a given top-level cell and all
+ * the other top-levels that are far enough.
+ *
+ * @param r The thread #runner.
+ * @param ci The #cell of interest.
+ * @param timer Are we timing this ?
+ */
 void runner_do_grav_long_range(struct runner *r, struct cell *ci, int timer) {
 
 #if ICHECK > 0
@@ -529,39 +559,40 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, int timer) {
   }
 #endif
 
+  /* Some constants */
+  const struct engine *e = r->e;
+  const struct gravity_props *props = e->gravity_properties;
+  const double theta_crit_inv = props->theta_crit_inv;
+
   TIMER_TIC;
 
   /* Recover the list of top-level cells */
-  const struct engine *e = r->e;
   struct cell *cells = e->s->cells_top;
   const int nr_cells = e->s->nr_cells;
-  /* const double max_d = */
-  /*     const_gravity_a_smooth * const_gravity_r_cut * ci->width[0]; */
-  /* const double max_d2 = max_d * max_d; */
-  // const double pos_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
 
   /* Anything to do here? */
-  if (!cell_is_active(ci, e)) return;
+  if (!cell_is_active(ci, e)) return;  // MATTHIEU (should never happen)
 
-  /* Drift our own multipole if need be */
-  if (ci->ti_old_multipole != e->ti_current) cell_drift_multipole(ci, e);
+  /* Check multipole has been drifted */
+  if (ci->ti_old_multipole != e->ti_current)
+    error("Interacting un-drifted multipole");
 
-  /* Loop over all the cells and go for a p-m interaction if far enough but not
-   * too far */
+  /* Loop over all the top-level cells and go for a M-M interaction if
+   * well-separated */
   for (int i = 0; i < nr_cells; ++i) {
 
+    /* Handle on the top-level cell */
     struct cell *cj = &cells[i];
 
-    if (ci == cj) continue;
-    if (cj->gcount == 0) continue;
+    /* Avoid stupid cases */
+    if (ci == cj || cj->gcount == 0) continue;
 
-    /* const double dx[3] = {cj->loc[0] - pos_i[0],   // x */
-    /*                       cj->loc[1] - pos_i[1],   // y */
-    /*                       cj->loc[2] - pos_i[2]};  // z */
-    /* const double r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; */
-    /* if (r2 > max_d2) continue; */
-
-    if (!cell_are_neighbours(ci, cj)) runner_dopair_grav_mm(r, ci, cj);
+    /* Check the multipole acceptance criterion */
+    if (gravity_multipole_accept(ci->multipole, cj->multipole,
+                                 theta_crit_inv)) {
+      /* Go for a (non-symmetric) M-M calculation */
+      runner_dopair_grav_mm(r, ci, cj);
+    }
   }
 
   if (timer) TIMER_TOC(timer_dograv_long_range);
diff --git a/src/scheduler.c b/src/scheduler.c
index 4d9ab07f6ecac25d0756b20ef761f7d01e2efa92..298dcc2b3d3cb36457d5b76853fbbea12392beed 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -163,10 +163,10 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
       }
 
       /* Is this cell even split? */
-      if (ci->split && ci->h_max * kernel_gamma * space_stretch < hi / 2) {
+      if (ci->split && 2.f * kernel_gamma * ci->h_max * space_stretch < hi) {
 
         /* Make a sub? */
-        if (scheduler_dosub &&
+        if (scheduler_dosub && /* Note division here to avoid overflow */
             ((ci->count > 0 && ci->count < space_subsize / ci->count) ||
              (ci->gcount > 0 && ci->gcount < space_subsize / ci->gcount))) {
 
@@ -240,8 +240,8 @@ static void scheduler_splittask(struct task *t, struct scheduler *s) {
 
       /* Should this task be split-up? */
       if (ci->split && cj->split &&
-          ci->h_max * kernel_gamma * space_stretch < hi / 2 &&
-          cj->h_max * kernel_gamma * space_stretch < hj / 2) {
+          2.f * kernel_gamma * space_stretch * ci->h_max < hi &&
+          2.f * kernel_gamma * space_stretch * cj->h_max < hj) {
 
         /* Replace by a single sub-task? */
         if (scheduler_dosub &&
@@ -773,18 +773,34 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
 void scheduler_set_unlocks(struct scheduler *s) {
 
   /* Store the counts for each task. */
-  int *counts;
-  if ((counts = (int *)malloc(sizeof(int) * s->nr_tasks)) == NULL)
+  short int *counts;
+  if ((counts = (short int *)malloc(sizeof(short int) * s->nr_tasks)) == NULL)
     error("Failed to allocate temporary counts array.");
-  bzero(counts, sizeof(int) * s->nr_tasks);
-  for (int k = 0; k < s->nr_unlocks; k++) counts[s->unlock_ind[k]] += 1;
+  bzero(counts, sizeof(short int) * s->nr_tasks);
+  for (int k = 0; k < s->nr_unlocks; k++) {
+    counts[s->unlock_ind[k]] += 1;
+
+#ifdef SWIFT_DEBUG_CHECKS
+    /* Check that we are not overflowing */
+    if (counts[s->unlock_ind[k]] < 0)
+      error("Task unlocking more than %d other tasks!",
+            (1 << (sizeof(short int) - 1)) - 1);
+#endif
+  }
 
   /* Compute the offset for each unlock block. */
   int *offsets;
   if ((offsets = (int *)malloc(sizeof(int) * (s->nr_tasks + 1))) == NULL)
     error("Failed to allocate temporary offsets array.");
   offsets[0] = 0;
-  for (int k = 0; k < s->nr_tasks; k++) offsets[k + 1] = offsets[k] + counts[k];
+  for (int k = 0; k < s->nr_tasks; k++) {
+    offsets[k + 1] = offsets[k] + counts[k];
+
+#ifdef SWIFT_DEBUG_CHECKS
+    /* Check that we are not overflowing */
+    if (offsets[k + 1] < 0) error("Task unlock offset array overflowing");
+#endif
+  }
 
   /* Create and fill a temporary array with the sorted unlocks. */
   struct task **unlocks;
@@ -1054,10 +1070,17 @@ void scheduler_rewait_mapper(void *map_data, int num_elements,
     /* Increment the task's own wait counter for the enqueueing. */
     atomic_inc(&t->wait);
 
+#ifdef SWIFT_DEBUG_CHECKS
+    /* Check that we don't have more waits that what can be stored. */
+    if (t->wait < 0)
+      error("Task unlocked by more than %d tasks!",
+            (1 << (8 * sizeof(t->wait) - 1)) - 1);
+
     /* Skip sort tasks that have already been performed */
     if (t->type == task_type_sort && t->flags == 0) {
       error("Empty sort task encountered.");
     }
+#endif
 
     /* Sets the waits of the dependances */
     for (int k = 0; k < t->nr_unlock_tasks; k++) {
diff --git a/src/space.c b/src/space.c
index a8bb460b956baa0a7d3864c1c95aeb3db613bc4c..e907949f305b507f681c33beaedbeb90fc966475 100644
--- a/src/space.c
+++ b/src/space.c
@@ -2084,16 +2084,39 @@ void space_split_recursive(struct space *s, struct cell *c,
 
       /* Now shift progeny multipoles and add them up */
       struct multipole temp;
+      double r_max = 0.;
       for (int k = 0; k < 8; ++k) {
         if (c->progeny[k] != NULL) {
           const struct cell *cp = c->progeny[k];
           const struct multipole *m = &cp->multipole->m_pole;
+
+          /* Contribution to multipole */
           gravity_M2M(&temp, m, c->multipole->CoM, cp->multipole->CoM,
                       s->periodic);
           gravity_multipole_add(&c->multipole->m_pole, &temp);
+
+          /* Upper limit of max CoM<->gpart distance */
+          const double dx = c->multipole->CoM[0] - cp->multipole->CoM[0];
+          const double dy = c->multipole->CoM[1] - cp->multipole->CoM[1];
+          const double dz = c->multipole->CoM[2] - cp->multipole->CoM[2];
+          const double r2 = dx * dx + dy * dy + dz * dz;
+          r_max = max(r_max, cp->multipole->r_max + sqrt(r2));
         }
       }
-    }
+      /* Alternative upper limit of max CoM<->gpart distance */
+      const double dx = c->multipole->CoM[0] > c->loc[0] + c->width[0] / 2.
+                            ? c->multipole->CoM[0] - c->loc[0]
+                            : c->loc[0] + c->width[0] - c->multipole->CoM[0];
+      const double dy = c->multipole->CoM[1] > c->loc[1] + c->width[1] / 2.
+                            ? c->multipole->CoM[1] - c->loc[1]
+                            : c->loc[1] + c->width[1] - c->multipole->CoM[1];
+      const double dz = c->multipole->CoM[2] > c->loc[2] + c->width[2] / 2.
+                            ? c->multipole->CoM[2] - c->loc[2]
+                            : c->loc[2] + c->width[2] - c->multipole->CoM[2];
+
+      /* Take minimum of both limits */
+      c->multipole->r_max = min(r_max, sqrt(dx * dx + dy * dy + dz * dz));
+    } /* Deal with gravity */
   }
 
   /* Otherwise, collect the data for this cell. */
@@ -2151,7 +2174,27 @@ void space_split_recursive(struct space *s, struct cell *c,
     ti_beg_max = get_integer_time_begin(e->ti_current + 1, time_bin_max);
 
     /* Construct the multipole and the centre of mass*/
-    if (s->gravity) gravity_P2M(c->multipole, c->gparts, c->gcount);
+    if (s->gravity) {
+      if (gcount > 0) {
+        gravity_P2M(c->multipole, c->gparts, c->gcount);
+        const double dx = c->multipole->CoM[0] > c->loc[0] + c->width[0] / 2.
+                              ? c->multipole->CoM[0] - c->loc[0]
+                              : c->loc[0] + c->width[0] - c->multipole->CoM[0];
+        const double dy = c->multipole->CoM[1] > c->loc[1] + c->width[1] / 2.
+                              ? c->multipole->CoM[1] - c->loc[1]
+                              : c->loc[1] + c->width[1] - c->multipole->CoM[1];
+        const double dz = c->multipole->CoM[2] > c->loc[2] + c->width[2] / 2.
+                              ? c->multipole->CoM[2] - c->loc[2]
+                              : c->loc[2] + c->width[2] - c->multipole->CoM[2];
+        c->multipole->r_max = sqrt(dx * dx + dy * dy + dz * dz);
+      } else {
+        gravity_multipole_init(&c->multipole->m_pole);
+        c->multipole->CoM[0] = c->loc[0] + c->width[0] / 2.;
+        c->multipole->CoM[1] = c->loc[1] + c->width[1] / 2.;
+        c->multipole->CoM[2] = c->loc[2] + c->width[2] / 2.;
+        c->multipole->r_max = 0.;
+      }
+    }
   }
 
   /* Set the values for this cell. */
@@ -2820,11 +2863,26 @@ void space_link_cleanup(struct space *s) {
  *
  * @param s The #space to check.
  * @param ti_drift The (integer) time.
+ * @param multipole Are we also checking the multipoles ?
  */
-void space_check_drift_point(struct space *s, integertime_t ti_drift) {
+void space_check_drift_point(struct space *s, integertime_t ti_drift,
+                             int multipole) {
 #ifdef SWIFT_DEBUG_CHECKS
   /* Recursively check all cells */
-  space_map_cells_pre(s, 1, cell_check_drift_point, &ti_drift);
+  space_map_cells_pre(s, 1, cell_check_particle_drift_point, &ti_drift);
+  if (multipole)
+    space_map_cells_pre(s, 1, cell_check_multipole_drift_point, &ti_drift);
+#else
+  error("Calling debugging code without debugging flag activated.");
+#endif
+}
+
+void space_check_top_multipoles_drift_point(struct space *s,
+                                            integertime_t ti_drift) {
+#ifdef SWIFT_DEBUG_CHECKS
+  for (int i = 0; i < s->nr_cells; ++i) {
+    cell_check_multipole_drift_point(&s->cells_top[i], &ti_drift);
+  }
 #else
   error("Calling debugging code without debugging flag activated.");
 #endif
diff --git a/src/space.h b/src/space.h
index 974e0676b49eb543d200e6d28c2c46e99e63c6a5..c5f588563e5a9fb4b6cb73ac1446514f8149794f 100644
--- a/src/space.h
+++ b/src/space.h
@@ -213,7 +213,10 @@ void space_init_parts(struct space *s);
 void space_init_gparts(struct space *s);
 void space_init_sparts(struct space *s);
 void space_link_cleanup(struct space *s);
-void space_check_drift_point(struct space *s, integertime_t ti_drift);
+void space_check_drift_point(struct space *s, integertime_t ti_drift,
+                             int multipole);
+void space_check_top_multipoles_drift_point(struct space *s,
+                                            integertime_t ti_drift);
 void space_check_timesteps(struct space *s);
 void space_replicate(struct space *s, int replicate, int verbose);
 void space_reset_task_counters(struct space *s);
diff --git a/src/timers.c b/src/timers.c
index 9b6b32251d8efd2890ed3855b5e9b87c15978784..9ab1ae09c740028bee6eb44ebfd0466bb3ba9406 100644
--- a/src/timers.c
+++ b/src/timers.c
@@ -83,12 +83,9 @@ char *timers_names[timer_count] = {
  *
  * To reset all timers, use the mask #timers_mask_all.
  */
-
 void timers_reset(unsigned long long mask) {
 
-  int k;
-
   /* Loop over the timers and set the masked ones to zero. */
-  for (k = 0; k < timer_count; k++)
+  for (int k = 0; k < timer_count; k++)
     if (mask & (1ull << k)) timers[k] = 0;
 }
diff --git a/src/timers.h b/src/timers.h
index 9678221d43987456b72df5dfbd2759547cd12db7..d4bfedaa0a11a02b8244ab78645a9502226ef6da 100644
--- a/src/timers.h
+++ b/src/timers.h
@@ -27,9 +27,12 @@
 #include "cycle.h"
 #include "inline.h"
 
-/* The timers themselves.
-   If you modify this list, be sure to change timers_names in timers.c as
-   well! */
+/**
+ * @brief The timers themselves.
+ *
+ * If you modify this list, be sure to change timers_names in timers.c as
+ * well!
+ **/
 enum {
   timer_none = 0,
   timer_prepare,
diff --git a/src/vector_power.h b/src/vector_power.h
index 4f93ece3c2c82281faeec37d45f5166034b4a946..f2fd4381751706f6fca1e24daa131d8fd8a0eeba 100644
--- a/src/vector_power.h
+++ b/src/vector_power.h
@@ -307,7 +307,7 @@ __attribute__((always_inline)) INLINE static double X_310(const double v[3]) {
  */
 __attribute__((always_inline)) INLINE static double X_301(const double v[3]) {
 
-  return 0.1666666666666667 * v[0] * v[0] * v[0] * v[1];
+  return 0.1666666666666667 * v[0] * v[0] * v[0] * v[2];
 }
 
 /**
diff --git a/theory/Multipoles/bibliography.bib b/theory/Multipoles/bibliography.bib
new file mode 100644
index 0000000000000000000000000000000000000000..abdb5855f52aefd9b4562851946d16bd402db68a
--- /dev/null
+++ b/theory/Multipoles/bibliography.bib
@@ -0,0 +1,27 @@
+@ARTICLE{Dehnen2014,
+   author = {{Dehnen}, W.},
+    title = "{A fast multipole method for stellar dynamics}",
+  journal = {Computational Astrophysics and Cosmology},
+archivePrefix = "arXiv",
+   eprint = {1405.2255},
+ primaryClass = "astro-ph.IM",
+     year = 2014,
+    month = sep,
+   volume = 1,
+      eid = {1},
+    pages = {1},
+      doi = {10.1186/s40668-014-0001-7},
+   adsurl = {http://adsabs.harvard.edu/abs/2014ComAC...1....1D},
+  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+}
+
+@article{Hardy2006,
+  title={Combinatorics of partial derivatives},
+  author={Hardy, Michael},
+  journal={Electron. J. Combin},
+  volume={13},
+   number={1},
+   pages={13},
+   year={2006}
+ }
+	      
diff --git a/theory/Multipoles/fmm_standalone.tex b/theory/Multipoles/fmm_standalone.tex
new file mode 100644
index 0000000000000000000000000000000000000000..6948708622dbf0cfc05daa9d63917f7a3017bfc2
--- /dev/null
+++ b/theory/Multipoles/fmm_standalone.tex
@@ -0,0 +1,22 @@
+\documentclass[fleqn, usenatbib, useAMS,a4paper]{mnras}
+\usepackage{graphicx}
+\usepackage{amsmath,paralist,xcolor,xspace,amssymb}
+\usepackage{times}
+
+\newcommand{\swift}{{\sc Swift}\xspace}
+
+%opening
+\title{FMM in SWIFT}
+\author{Matthieu Schaller}
+
+\begin{document}
+
+\maketitle
+
+\input{gravity_derivatives}
+
+\bibliographystyle{mnras}
+\bibliography{./bibliography.bib}
+
+
+\end{document}
diff --git a/theory/Multipoles/gravity_derivatives.tex b/theory/Multipoles/gravity_derivatives.tex
new file mode 100644
index 0000000000000000000000000000000000000000..913bdf752c666aac5913c6e6452bdb06f3fcdc86
--- /dev/null
+++ b/theory/Multipoles/gravity_derivatives.tex
@@ -0,0 +1,54 @@
+We use the multi-index notation of \cite{Dehnen2014} to simplify expressions.
+
+\subsection{Derivatives of the gravitational potential}
+
+The calculation of all the
+$D_\mathbf{n}(x,y,z) \equiv \nabla^{\mathbf{n}}\phi(x,y,z)$ terms up
+to the relevent order can be quite tedious and it is beneficial to
+automatize the whole setup. Ideally, one would like to have an
+expression for each of this term that is only made of multiplications
+and additions of each of the coordinates and the inverse distance. We
+achieve this by writing $\phi$ as a composition of functions
+$\phi(u(x,y,z))$ and apply the \textit{Fa\`a di Bruno}
+formula \citep[i.e. the ``chain rule'' for higher order derivatives,
+e.g.][]{Hardy2006} to construct our terms:
+
+\begin{equation}
+\label{eq:faa_di_bruno}
+\frac{\partial^n}{\partial x_1 \cdots \partial x_n} \phi(u)
+= \sum_{A} \phi^{(|A|)}(u) \prod_{B \in
+A} \frac{\partial^{|B|}}{\prod_{c\in B}\partial x_c} u(x,y,z),
+\end{equation}
+where $A$ is the set of all partitions of $\lbrace1,\cdots, n\rbrace$,
+$B$ is a block of a partition $A$ and $|\cdot|$ denotes the
+cardinality of a set. For generic functions $\phi$ and $u$ this
+formula yields an untracktable number of terms; an 8th-order
+derivative will have $4140$ (!)  terms in the sum\footnote{The number
+of terms in the sum is given by the Bell number of the same order}. \\
+We choose to write
+\begin{align}
+   \phi(x,y,z) &= 1 / \sqrt{u(x,y,z)}, \\
+   u(x,y,z) &= x^2 + y^2 + z^2.
+\end{align}
+This choice allows to have derivatives of any order of $\phi(u)$ that
+only depend on powers of $u$:
+
+\begin{equation}
+f^{(n)}(u) = \frac{\Gamma(\frac{1}{2})}{\Gamma(\frac{1}{2} -
+n)}\frac{1}{u^{n+\frac{1}{2}}}.
+\end{equation}
+More importantly, this choice of decomposition allows us to have
+derivatives of $u$ only up to second order in $x$, $y$ or $z$. The
+number of non-zero terms in eq. \ref{eq:faa_di_bruno} is hence
+drastically reduced. For instance, when computing
+$D_{(4,1,3)} \equiv \frac{\partial^8}{\partial x^4 \partial y \partial
+z^3} \phi$, $4100$ of the $4140$ terms will involve at least one
+zero-valued derivative (e.g. $\partial^3/\partial x^3$ or
+$\partial^2/\partial x\partial y$) of $u$. Furthermore, among the 40
+remaining terms, many will involve the same derivatives and can be
+grouped together, leaving us with a sum of six products of $x$,$y$ and
+$z$. This is generally the case for most of the $D_\mathbf{n}$'s and
+figuring out which terms are identical in a given set of partitions of
+$\lbrace1,\cdots, n\rbrace$ is an interesting exercise in
+combinatorics left for the reader \citep[see also][]{Hardy2006}.
+
diff --git a/theory/Multipoles/run.sh b/theory/Multipoles/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..34304a37dc1cb1fad3a7442544a8df793ea94d35
--- /dev/null
+++ b/theory/Multipoles/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+python kernels.py
+pdflatex -jobname=fmm fmm_standalone.tex
+bibtex fmm.aux
+pdflatex -jobname=fmm fmm_standalone.tex
+pdflatex -jobname=fmm fmm_standalone.tex