diff --git a/.gitignore b/.gitignore
index 28a830818af36faad3f4278c6adcba5562b59ee7..e3e17bb82a01d9af0ace6ed72d196cf2dba242f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,12 +34,17 @@ examples/*/*/*.txt
 examples/*/*/used_parameters.yml
 examples/*/gravity_checks_*.dat
 
-tests/testPair
+tests/testActivePair
+tests/brute_force_periodic_BC_standard.dat
+tests/swift_periodic_BC_standard.dat
+tests/brute_force_periodic_BC_pertrubed.dat
+tests/swift_periodic_BC_perturbed.dat
 tests/brute_force_standard.dat
 tests/swift_dopair_standard.dat
 tests/brute_force_perturbed.dat
 tests/swift_dopair_perturbed.dat
 tests/test27cells
+tests/testPeriodicBC
 tests/test125cells
 tests/brute_force_27_standard.dat
 tests/swift_dopair_27_standard.dat
@@ -49,6 +54,11 @@ tests/brute_force_125_standard.dat
 tests/swift_dopair_125_standard.dat
 tests/brute_force_125_perturbed.dat
 tests/swift_dopair_125_perturbed.dat
+tests/brute_force_active.dat
+tests/brute_force_periodic_BC_perturbed.dat
+tests/swift_dopair_active.dat
+tests/test_nonsym_density_serial.dat
+tests/test_nonsym_density_vec.dat
 tests/testGreetings
 tests/testReading
 tests/input.hdf5
@@ -64,12 +74,12 @@ tests/testMaths
 tests/testThreadpool
 tests/testParser
 tests/parser_output.yml
+tests/testPeriodicBC.sh
+tests/testPeriodicBCPerturbed.sh
 tests/test27cells.sh
 tests/test27cellsPerturbed.sh
 tests/test125cells.sh
 tests/test125cellsPerturbed.sh
-tests/testPair.sh
-tests/testPairPerturbed.sh
 tests/testParser.sh
 tests/testReading.sh
 tests/testAdiabaticIndex
@@ -95,6 +105,9 @@ theory/paper_pasc/pasc_paper.pdf
 theory/Multipoles/fmm.pdf
 theory/Multipoles/fmm_standalone.pdf
 theory/Multipoles/potential.pdf
+theory/Multipoles/potential_long.pdf
+theory/Multipoles/potential_short.pdf
+theory/Multipoles/force_short.pdf
 
 m4/libtool.m4
 m4/ltoptions.m4
diff --git a/README b/README
index 2dedb32a04a7cf143c3e65560c45a68c0e5d1c2a..c088a94488133ddf53cd8a6eba45d8dcdebfeb72 100644
--- a/README
+++ b/README
@@ -15,28 +15,31 @@ Usage: swift [OPTION]... PARAMFILE
        swift_mpi [OPTION]... PARAMFILE
 
 Valid options are:
-  -a          Pin runners using processor affinity.
-  -c          Run with cosmological time integration.
-  -C          Run with cooling.
-  -d          Dry run. Read the parameter file, allocate memory but does not read 
-              the particles from ICs and exit before the start of time integration.
-              Allows user to check validy of parameter and IC files as well as memory limits.
-  -D          Always drift all particles even the ones far from active particles. This emulates
-  	      Gadget-[23] and GIZMO's default behaviours.
-  -e          Enable floating-point exceptions (debugging mode).
-  -f    {int} Overwrite the CPU frequency (Hz) to be used for time measurements.
-  -g          Run with an external gravitational potential.
-  -G          Run with self-gravity.
-  -M          Reconstruct the multipoles every time-step.
-  -n    {int} Execute a fixed number of time steps. When unset use the time_end parameter to stop. 
-  -s          Run with hydrodynamics.
-  -S          Run with stars.
-  -t    {int} The number of threads to use on each MPI rank. Defaults to 1 if not specified.
-  -T          Print timers every time-step.
-  -v     [12] Increase the level of verbosity
-  	      1: MPI-rank 0 writes
-	      2: All MPI-ranks write
-  -y    {int} Time-step frequency at which task graphs are dumped.
-  -h          Print this help message and exit.
+  -a                Pin runners using processor affinity.
+  -c                Run with cosmological time integration.
+  -C                Run with cooling.
+  -d                Dry run. Read the parameter file, allocate memory but does not read
+                    the particles from ICs and exit before the start of time integration.
+                    Allows user to check validity of parameter and IC files as well as memory limits.
+  -D                Always drift all particles even the ones far from active particles. This emulates
+                    Gadget-[23] and GIZMO's default behaviours.
+  -e                Enable floating-point exceptions (debugging mode).
+  -f          {int} Overwrite the CPU frequency (Hz) to be used for time measurements.
+  -g                Run with an external gravitational potential.
+  -G                Run with self-gravity.
+  -M                Reconstruct the multipoles every time-step.
+  -n          {int} Execute a fixed number of time steps. When unset use the time_end parameter to stop.
+  -P  {sec:par:val} Set parameter value and overwrites values read from the parameters file. Can be used more than once.
+  -s                Run with hydrodynamics.
+  -S                Run with stars.
+  -t          {int} The number of threads to use on each MPI rank. Defaults to 1 if not specified.
+  -T                Print timers every time-step.
+  -v           [12] Increase the level of verbosity:
+                    1: MPI-rank 0 writes,
+                    2: All MPI-ranks write.
+  -y          {int} Time-step frequency at which task graphs are dumped.
+  -Y          {int} Time-step frequency at which threadpool tasks are dumped.
+  -h                Print this help message and exit.
+
+See the file examples/parameter_example.yml for an example of parameter file.
 
-See the file examples/parameter_example.yml for an example of parameter file.
diff --git a/configure.ac b/configure.ac
index 788bb57eed801c1a1dff2204b57b34c4fadf3b58..74fede99f4fbf578af4e703cedaa42f2c278b037 100644
--- a/configure.ac
+++ b/configure.ac
@@ -16,7 +16,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 # Init the project.
-AC_INIT([SWIFT],[0.5.0],[https://gitlab.cosma.dur.ac.uk/swift/swiftsim])
+AC_INIT([SWIFT],[0.6.0],[https://gitlab.cosma.dur.ac.uk/swift/swiftsim])
 swift_config_flags="$*"
 
 #  Need to define this, instead of using fifth argument of AC_INIT, until 2.64.
@@ -189,6 +189,19 @@ if test "$enable_task_debugging" = "yes"; then
    AC_DEFINE([SWIFT_DEBUG_TASKS],1,[Enable task debugging])
 fi
 
+# Check if threadpool debugging is on.
+AC_ARG_ENABLE([threadpool-debugging],
+   [AS_HELP_STRING([--enable-threadpool-debugging],
+     [Store threadpool mapper timing information and generate threadpool dump files @<:@yes/no@:>@]
+   )],
+   [enable_threadpool_debugging="$enableval"],
+   [enable_threadpool_debugging="no"]
+)
+if test "$enable_threadpool_debugging" = "yes"; then
+   AC_DEFINE([SWIFT_DEBUG_THREADPOOL],1,[Enable threadpool debugging])
+   LDFLAGS="$LDFLAGS -rdynamic"
+fi
+
 # Check if the general timers are switched on.
 AC_ARG_ENABLE([timers],
    [AS_HELP_STRING([--enable-timers],
@@ -829,10 +842,10 @@ esac
 #  Gravity multipole order
 AC_ARG_WITH([multipole-order],
    [AS_HELP_STRING([--with-multipole-order=<order>],
-      [order of the multipole and gravitational field expansion @<:@ default: 3@:>@]
+      [order of the multipole and gravitational field expansion @<:@ default: 4@:>@]
    )],
    [with_multipole_order="$withval"],
-   [with_multipole_order="3"]
+   [with_multipole_order="4"]
 )
 AC_DEFINE_UNQUOTED([SELF_GRAVITY_MULTIPOLE_ORDER], [$with_multipole_order], [Multipole order])
 
@@ -848,19 +861,31 @@ AM_CONDITIONAL([HAVE_DOXYGEN], [test "$ac_cv_path_ac_pt_DX_DOXYGEN" != ""])
 # Handle .in files.
 AC_CONFIG_FILES([Makefile src/Makefile examples/Makefile doc/Makefile doc/Doxyfile tests/Makefile])
 AC_CONFIG_FILES([tests/testReading.sh], [chmod +x tests/testReading.sh])
-AC_CONFIG_FILES([tests/testPair.sh], [chmod +x tests/testPair.sh])
-AC_CONFIG_FILES([tests/testPairPerturbed.sh], [chmod +x tests/testPairPerturbed.sh])
+AC_CONFIG_FILES([tests/testActivePair.sh], [chmod +x tests/testActivePair.sh])
 AC_CONFIG_FILES([tests/test27cells.sh], [chmod +x tests/test27cells.sh])
 AC_CONFIG_FILES([tests/test27cellsPerturbed.sh], [chmod +x tests/test27cellsPerturbed.sh])
 AC_CONFIG_FILES([tests/test125cells.sh], [chmod +x tests/test125cells.sh])
 AC_CONFIG_FILES([tests/test125cellsPerturbed.sh], [chmod +x tests/test125cellsPerturbed.sh])
+AC_CONFIG_FILES([tests/testPeriodicBC.sh], [chmod +x tests/testPeriodicBC.sh])
+AC_CONFIG_FILES([tests/testPeriodicBCPerturbed.sh], [chmod +x tests/testPeriodicBCPerturbed.sh])
+AC_CONFIG_FILES([tests/testInteractions.sh], [chmod +x tests/testInteractions.sh])
 AC_CONFIG_FILES([tests/testParser.sh], [chmod +x tests/testParser.sh])
 
 # Save the compilation options
 AC_DEFINE_UNQUOTED([SWIFT_CONFIG_FLAGS],["$swift_config_flags"],[Flags passed to configure])
 
+# Make sure the latest git revision string gets included
+touch src/version.c
+
+# Generate output.
+AC_OUTPUT
+
 # Report general configuration.
-AC_MSG_RESULT([
+AC_MSG_RESULT([ 
+ ------- Summary --------
+
+   $PACKAGE_NAME v.$PACKAGE_VERSION
+
    Compiler        : $CC
     - vendor       : $ax_cv_c_compiler_vendor
     - version      : $ax_cv_c_compiler_version
@@ -887,14 +912,10 @@ AC_MSG_RESULT([
    Multipole order     : $with_multipole_order
    No gravity below ID : $no_gravity_below_id
 
-   Individual timers : $enable_timers
-   Task debugging    : $enable_task_debugging
-   Debugging checks  : $enable_debugging_checks
-   Gravity checks    : $gravity_force_checks
-])
-
-# Make sure the latest git revision string gets included
-touch src/version.c
+   Individual timers    : $enable_timers
+   Task debugging       : $enable_task_debugging
+   Threadpool debugging : $enable_threadpool_debugging
+   Debugging checks     : $enable_debugging_checks
+   Gravity checks       : $gravity_force_checks
 
-# Generate output.
-AC_OUTPUT
+ ------------------------])
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 0df1f91194b6d1e7e98cb1b75be7d3eaaca7fc32..0193760d3114aecab91f0c2ad27a9c1dd77dec9a 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -1988,6 +1988,9 @@ INCLUDE_FILE_PATTERNS  =
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 PREDEFINED             = "__attribute__(x)= "
+PREDEFINED             += HAVE_HDF5
+PREDEFINED             += WITH_MPI
+PREDEFINED             += WITH_VECTORIZATION
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
diff --git a/examples/CoolingBox/energy_plot.py b/examples/CoolingBox/energy_plot.py
index c8948e7e209c2786ffdecbb2b8b606e73d703238..45f0b4f6b11c3855a919f6a98fd0ca006a887f82 100644
--- a/examples/CoolingBox/energy_plot.py
+++ b/examples/CoolingBox/energy_plot.py
@@ -34,7 +34,7 @@ import sys
 stats_filename = "./energy.txt"
 
 # First snapshot
-snap_filename = "coolingBox_000.hdf5"
+snap_filename = "coolingBox_0000.hdf5"
 
 # Some constants in cgs units
 k_b = 1.38E-16 #boltzmann
@@ -104,7 +104,7 @@ print "Cooling time:", cooling_time_cgs, "[s]"
 u_snapshots_cgs = zeros(25)
 t_snapshots_cgs = zeros(25)
 for i in range(25):
-    snap = h5.File("coolingBox_%0.3d.hdf5"%i,'r')
+    snap = h5.File("coolingBox_%0.4d.hdf5"%i,'r')
     u_snapshots_cgs[i] = sum(snap["/PartType0/InternalEnergy"][:] * snap["/PartType0/Masses"][:])  / total_mass[0] * unit_length**2 / (unit_time)**2
     t_snapshots_cgs[i] = snap["/Header"].attrs["Time"] * unit_time
 
diff --git a/examples/CoolingHalo/density_profile.py b/examples/CoolingHalo/density_profile.py
index 335f7089b6835b65cf37e1bcd312a17966c295a7..c53be03b369e04d2cb8e68e419e08347ee6721eb 100644
--- a/examples/CoolingHalo/density_profile.py
+++ b/examples/CoolingHalo/density_profile.py
@@ -20,7 +20,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "Hydrostatic_000.hdf5"
+filename = "Hydrostatic_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -39,7 +39,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS
 
 for i in range(n_snaps):
 
-    filename = "Hydrostatic_%03d.hdf5" %i
+    filename = "Hydrostatic_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/CoolingHalo/internal_energy_profile.py b/examples/CoolingHalo/internal_energy_profile.py
index 854bdf223cfae75203a1924b4af6136b4b7aa6cd..d5f77c32ad17b02026abc7f8806c323c130c735a 100644
--- a/examples/CoolingHalo/internal_energy_profile.py
+++ b/examples/CoolingHalo/internal_energy_profile.py
@@ -38,7 +38,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "Hydrostatic_000.hdf5"
+filename = "Hydrostatic_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -57,7 +57,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS
 
 for i in range(n_snaps):
 
-    filename = "Hydrostatic_%03d.hdf5" %i
+    filename = "Hydrostatic_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/CoolingHalo/test_energy_conservation.py b/examples/CoolingHalo/test_energy_conservation.py
index 00374e905e8eeb66bfe8c7360ab37522bc93af32..2e2ad3607f888f892f021a760dfa89753d52c133 100644
--- a/examples/CoolingHalo/test_energy_conservation.py
+++ b/examples/CoolingHalo/test_energy_conservation.py
@@ -17,7 +17,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "CoolingHalo_000.hdf5"
+filename = "CoolingHalo_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -41,7 +41,7 @@ time_array_cgs = []
 
 for i in range(n_snaps):
 
-    filename = "CoolingHalo_%03d.hdf5" %i
+    filename = "CoolingHalo_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/CoolingHalo/velocity_profile.py b/examples/CoolingHalo/velocity_profile.py
index d64d255b18482bc26578f21f46199aa3540ae7b5..7d31e66ff52c51d0852fa9165753032d130db9c2 100644
--- a/examples/CoolingHalo/velocity_profile.py
+++ b/examples/CoolingHalo/velocity_profile.py
@@ -39,7 +39,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "CoolingHalo_000.hdf5"
+filename = "CoolingHalo_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -58,7 +58,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS
 
 for i in range(n_snaps):
 
-    filename = "CoolingHalo_%03d.hdf5" %i
+    filename = "CoolingHalo_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/CoolingHaloWithSpin/density_profile.py b/examples/CoolingHaloWithSpin/density_profile.py
index fb88ddd6aea71603a6f6fcb36b13771106737e6a..cc4f8a195d9b88dbbaef3891b57ab9e2dfa9e3ed 100644
--- a/examples/CoolingHaloWithSpin/density_profile.py
+++ b/examples/CoolingHaloWithSpin/density_profile.py
@@ -21,7 +21,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "CoolingHalo_000.hdf5"
+filename = "CoolingHalo_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -42,7 +42,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS
 
 for i in range(n_snaps):
 
-    filename = "CoolingHalo_%03d.hdf5" %i
+    filename = "CoolingHalo_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/CoolingHaloWithSpin/internal_energy_profile.py b/examples/CoolingHaloWithSpin/internal_energy_profile.py
index 5f71d69ca7a978de242559f84ec390faa86a27f0..8e039bd3c2d1287946350b2af0efb595cc848ac0 100644
--- a/examples/CoolingHaloWithSpin/internal_energy_profile.py
+++ b/examples/CoolingHaloWithSpin/internal_energy_profile.py
@@ -39,7 +39,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "CoolingHalo_000.hdf5"
+filename = "CoolingHalo_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -60,7 +60,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS
 
 for i in range(n_snaps):
 
-    filename = "CoolingHalo_%03d.hdf5" %i
+    filename = "CoolingHalo_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/CoolingHaloWithSpin/test_energy_conservation.py b/examples/CoolingHaloWithSpin/test_energy_conservation.py
index cc7518d2e4d64441b2c4d6b0663caae873f34d95..c9d020b69b7bcccc4778ee12071dd448df0bdee0 100644
--- a/examples/CoolingHaloWithSpin/test_energy_conservation.py
+++ b/examples/CoolingHaloWithSpin/test_energy_conservation.py
@@ -20,7 +20,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "CoolingHalo_000.hdf5"
+filename = "CoolingHalo_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -44,7 +44,7 @@ time_array_cgs = []
 
 for i in range(n_snaps):
 
-    filename = "CoolingHalo_%03d.hdf5" %i
+    filename = "CoolingHalo_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/CoolingHaloWithSpin/velocity_profile.py b/examples/CoolingHaloWithSpin/velocity_profile.py
index 07df8e1b0751307513c30a5b128773b193c3a9cd..7247e23a34a3965207b0d4749b46fecfafc4eda9 100644
--- a/examples/CoolingHaloWithSpin/velocity_profile.py
+++ b/examples/CoolingHaloWithSpin/velocity_profile.py
@@ -39,7 +39,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "CoolingHalo_000.hdf5"
+filename = "CoolingHalo_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -58,7 +58,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS
 
 for i in range(n_snaps):
 
-    filename = "CoolingHalo_%03d.hdf5" %i
+    filename = "CoolingHalo_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/DiscPatch/HydroStatic/README b/examples/DiscPatch/HydroStatic/README
index 42853e6b51983f2868528202adec3fc829c2ddbc..49ed96dc3bac607a4d454547d880b10bb6b28857 100644
--- a/examples/DiscPatch/HydroStatic/README
+++ b/examples/DiscPatch/HydroStatic/README
@@ -18,3 +18,5 @@ output to 'Disc-Patch-dynamic.hdf5'. These are now the ICs for the actual test.
 
 When running SWIFT with the parameters from 'disc-patch.yml' and an
 ideal gas EoS on these ICs the disc should stay in equilibrium.
+
+The solution can be checked using the 'plotSolution.py' script.
diff --git a/examples/DiscPatch/HydroStatic/disc-patch-icc.yml b/examples/DiscPatch/HydroStatic/disc-patch-icc.yml
index 6a27016b8a3f484b7c1c9b74594073d5f28efe90..6f17cfbb1e0125faf8e47fe4e9e55bfdf4df7b71 100644
--- a/examples/DiscPatch/HydroStatic/disc-patch-icc.yml
+++ b/examples/DiscPatch/HydroStatic/disc-patch-icc.yml
@@ -1,8 +1,8 @@
 # Define the system of units to use internally. 
 InternalUnitSystem:
-  UnitMass_in_cgs:     1.9885e33     # Grams
-  UnitLength_in_cgs:   3.0856776e18  # Centimeters
-  UnitVelocity_in_cgs: 1e5           # Centimeters per second
+  UnitMass_in_cgs:     1.9885e33         # Grams
+  UnitLength_in_cgs:   3.08567758149e18  # Centimeters
+  UnitVelocity_in_cgs: 1e5               # Centimeters per second
   UnitCurrent_in_cgs:  1   # Amperes
   UnitTemp_in_cgs:     1   # Kelvin
 
@@ -11,17 +11,17 @@ TimeIntegration:
   time_begin: 0     # The starting time of the simulation (in internal units).
   time_end:   968.  # The end time of the simulation (in internal units).
   dt_min:     1e-4  # The minimal time-step size of the simulation (in internal units).
-  dt_max:     1.    # The maximal time-step size of the simulation (in internal units).
+  dt_max:     10.   # The maximal time-step size of the simulation (in internal units).
 
 # Parameters governing the conserved quantities statistics
 Statistics:
-  delta_time:          1 # Time between statistics output
+  delta_time:          12. # Time between statistics output
   
 # Parameters governing the snapshots
 Snapshots:
-  basename:            Disc-Patch   # Common part of the name of output files
-  time_first:          0.           # Time of the first output (in internal units)
-  delta_time:          12.          # Time difference between consecutive outputs (in internal units)
+  basename:    Disc-Patch   # Common part of the name of output files
+  time_first:  0.           # Time of the first output (in internal units)
+  delta_time:  48.          # Time difference between outputs (in internal units)
 
 # Parameters for the hydrodynamics scheme
 SPH:
@@ -29,7 +29,7 @@ SPH:
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
   max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
-  max_smoothing_length:  70.      # Maximal smoothing length allowed (in internal units).
+  h_max:                 60.      # Maximal smoothing length allowed (in internal units).
 
 # Parameters related to the initial conditions
 InitialConditions:
@@ -39,6 +39,8 @@ InitialConditions:
 DiscPatchPotential:
   surface_density: 10.
   scale_height:    100.
-  z_disc:          200.
+  x_disc:          400.
+  x_trunc:         300.
+  x_max:           350.
   timestep_mult:   0.03
   growth_time:     5.
diff --git a/examples/DiscPatch/HydroStatic/disc-patch.yml b/examples/DiscPatch/HydroStatic/disc-patch.yml
index 8bd67c5b08de82bb6a3d47ccf3419f85e3e5c6b1..8816bc17ca526d01b7abcf55bb43287bbb36224a 100644
--- a/examples/DiscPatch/HydroStatic/disc-patch.yml
+++ b/examples/DiscPatch/HydroStatic/disc-patch.yml
@@ -1,8 +1,8 @@
 # Define the system of units to use internally. 
 InternalUnitSystem:
-  UnitMass_in_cgs:     1.9885e33     # Grams
-  UnitLength_in_cgs:   3.0856776e18  # Centimeters
-  UnitVelocity_in_cgs: 1e5           # Centimeters per second
+  UnitMass_in_cgs:     1.9885e33         # Grams
+  UnitLength_in_cgs:   3.08567758149e18  # Centimeters
+  UnitVelocity_in_cgs: 1e5               # Centimeters per second
   UnitCurrent_in_cgs:  1   # Amperes
   UnitTemp_in_cgs:     1   # Kelvin
 
@@ -11,17 +11,17 @@ TimeIntegration:
   time_begin: 968   # The starting time of the simulation (in internal units).
   time_end:   12000.  # The end time of the simulation (in internal units).
   dt_min:     1e-4  # The minimal time-step size of the simulation (in internal units).
-  dt_max:     1.    # The maximal time-step size of the simulation (in internal units).
+  dt_max:     10.   # The maximal time-step size of the simulation (in internal units).
 
 # Parameters governing the conserved quantities statistics
 Statistics:
-  delta_time:          1 # Time between statistics output
+  delta_time:          24 # Time between statistics output
   
 # Parameters governing the snapshots
 Snapshots:
-  basename:           Disc-Patch-dynamic # Common part of the name of output files
-  time_first:         968.               # Time of the first output (in internal units)
-  delta_time:         24.                 # Time difference between consecutive outputs (in internal units)
+  basename:    Disc-Patch-dynamic # Common part of the name of output files
+  time_first:  968.               # Time of the first output (in internal units)
+  delta_time:  96.                # Time difference between outputs (in internal units)
 
 # Parameters for the hydrodynamics scheme
 SPH:
@@ -29,7 +29,7 @@ SPH:
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
   max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
-  max_smoothing_length:  70.      # Maximal smoothing length allowed (in internal units).
+  h_max:                 60.      # Maximal smoothing length allowed (in internal units).
 
 # Parameters related to the initial conditions
 InitialConditions:
@@ -39,5 +39,7 @@ InitialConditions:
 DiscPatchPotential:
   surface_density: 10.
   scale_height:    100.
-  z_disc:          200.
+  x_disc:          400.
+  x_trunc:         300.
+  x_max:           380.
   timestep_mult:   0.03
diff --git a/examples/DiscPatch/HydroStatic/dynamic.pro b/examples/DiscPatch/HydroStatic/dynamic.pro
deleted file mode 100644
index 00ee3f7a8d2dc435be2093af959efd2c49903637..0000000000000000000000000000000000000000
--- a/examples/DiscPatch/HydroStatic/dynamic.pro
+++ /dev/null
@@ -1,139 +0,0 @@
-;
-;  test energy / angular momentum conservation of test problem
-;
-
-iplot = 1 ; if iplot = 1, make plot of E/Lz conservation, else, simply compare final and initial energy
-
-; set physical constants
-@physunits
-
-indir    = './'
-;basefile = 'Disc-Patch-dynamic_'
-basefile = 'Disc-Patch_'
-
-; set properties of potential
-uL   = phys.pc                  ; unit of length
-uM   = phys.msun                ; unit of mass
-uV   = 1d5                      ; unit of velocity
-
-; properties of patch
-surface_density = 100.          ; surface density of all mass, which generates the gravitational potential
-scale_height    = 100.
-z_disk          = 200.          ;
-fgas            = 0.1           ; gas fraction
-gamma           = 5./3.
-
-; derived units
-constG   = 10.^(alog10(phys.g)+alog10(uM)-2d0*alog10(uV)-alog10(uL)) ;
-pcentre  = [0.,0.,z_disk] * pc / uL
-utherm     = !pi * constG * surface_density * scale_height / (gamma-1.)
-temp       = (utherm*uV^2)*phys.m_h/phys.kb
-soundspeed = sqrt(gamma * (gamma-1.) * utherm)
-t_dyn      = sqrt(scale_height / (constG * surface_density))
-rho0       = fgas*(surface_density)/(2.*scale_height)
-print,' dynamical time = ',t_dyn,' = ',t_dyn*UL/uV/(1d6*phys.yr),' Myr'
-print,' thermal energy per unit mass = ',utherm
-print,' central density = ',rho0,' = ',rho0*uM/uL^3/m_h,' particles/cm^3'
-print,' central temperature = ',temp
-lambda = 2 * !pi * phys.G^1.5 * (scale_height*uL)^1.5 * (surface_density * uM/uL^2)^0.5 * phys.m_h^2 / (gamma-1) / fgas
-print,' lambda = ',lambda
-stop
-;
-infile = indir + basefile + '*'
-spawn,'ls -1 '+infile,res
-nfiles = n_elements(res)
-
-
-; choose: calculate change of energy and Lz, comparing first and last
-; snapshots for all particles, or do so for a subset
-
-; compare all
-ifile   = 0
-inf     = indir + basefile + strtrim(string(ifile,'(i3.3)'),1) + '.hdf5'
-id      = h5rd(inf,'PartType0/ParticleIDs')
-nfollow = n_elements(id)
-
-
-; compute anlytic profile
-nbins = 100
-zbins = findgen(nbins)/float(nbins-1) * 2 * scale_height
-rbins = (surface_density/(2.*scale_height)) / cosh(abs(zbins)/scale_height)^2
-
-
-; plot analytic profile
-wset,0
-plot,[0],[0],xr=[0,2*scale_height],yr=[0,max(rbins)],/nodata,xtitle='|z|',ytitle=textoidl('\rho')
-oplot,zbins,rbins,color=blue
-
-ifile  = 0
-nskip   = nfiles - 1
-isave  = 0
-nplot  = 8192 ; randomly plot particles
-color = floor(findgen(nfiles)/float(nfiles-1)*255)
-;for ifile=0,nfiles-1,nskip do begin
-tsave  = [0.]
-toplot = [1,nfiles-1]
-for iplot=0,1 do begin
-   ifile  = toplot[iplot]
-   inf    = indir + basefile + strtrim(string(ifile,'(i3.3)'),1) + '.hdf5'
-   time   = h5ra(inf, 'Header','Time')
-   tsave  = [tsave, time]
-   print,' time= ',time
-   p      = h5rd(inf,'PartType0/Coordinates')
-   v      = h5rd(inf,'PartType0/Velocities')
-   id     = h5rd(inf,'PartType0/ParticleIDs')
-   rho    = h5rd(inf,'PartType0/Density')
-   h      = h5rd(inf,'PartType0/SmoothingLength')
-   utherm = h5rd(inf,'PartType0/InternalEnergy')
-   indx   = sort(id)
-
-; substract disk centre
-   for ic=0,2 do p[ic,*]=p[ic,*] - pcentre[ic]
-
-
-;; ;  if you want to sort particles by ID
-;;    id     = id[indx]
-;;    rho    = rho[indx]
-;;    utherm = utherm[indx]
-;;    h      = h[indx]
-;;    for ic=0,2 do begin
-;;       tmp = reform(p[ic,*]) & p[ic,*] = tmp[indx]
-;;       tmp = reform(v[ic,*]) & v[ic,*] = tmp[indx]
-;;    endfor
-   
-   ip = floor(randomu(ifile+1,nplot)*n_elements(rho))
-   color = red
-   if(ifile eq 1) then begin
-      color=black
-   endif else begin
-      color=red
-   endelse
-   oplot,abs(p[2,ip]), rho[ip], psym=3, color=color
-
-   isave = isave + 1
-   
-endfor
-
-; time in units of dynamical time
-tsave = tsave[1:*] / t_dyn
-
-label = ['']
-for i=0,n_elements(tsave)-1 do label=[label,'time/t_dynamic='+string(tsave[i],format='(f8.0)')]
-label = label[1:*]
-legend,['analytic',label[0],label[1]],linestyle=[0,0,0],color=[blue,black,red],box=0,/top,/right
-
-; make histograms of particle velocities
-xr    = 1d-3 * [-1,1]
-bsize = 1.d-5
-ohist,v[0,*]/soundspeed,x,vx,xr[0],xr[1],bsize
-ohist,v[1,*]/soundspeed,y,vy,xr[0],xr[1],bsize
-ohist,v[2,*]/soundspeed,z,vz,xr[0],xr[1],bsize
-wset,2
-plot,x,vx,psym=10,xtitle='velocity/soundspeed',ytitle='pdf',/nodata,xr=xr,/xs
-oplot,x,vx,psym=10,color=black
-oplot,y,vy,psym=10,color=blue
-oplot,z,vz,psym=10,color=red
-legend,['vx/c','vy/c','vz/c'],linestyle=[0,0,0],color=[black,blue,red],box=0,/top,/right
-end
-
-
diff --git a/examples/DiscPatch/HydroStatic/makeIC.py b/examples/DiscPatch/HydroStatic/makeIC.py
index 6ba1ccd06fed84ca728aadaa5922dbba536b6881..11b482059b494fc9a6b9447fdfe2e7ec543d52ff 100644
--- a/examples/DiscPatch/HydroStatic/makeIC.py
+++ b/examples/DiscPatch/HydroStatic/makeIC.py
@@ -1,158 +1,162 @@
 ###############################################################################
- # This file is part of SWIFT.
- # Copyright (c) 2016 John A. Regan (john.a.regan@durham.ac.uk)
- #                    Tom Theuns (tom.theuns@durham.ac.uk)
- # 
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published
- # by the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- # 
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- # GNU General Public License for more details.
- # 
- # You should have received a copy of the GNU Lesser General Public License
- # along with this program.  If not, see <http://www.gnu.org/licenses/>.
- # 
- ##############################################################################
+# This file is part of SWIFT.
+# Copyright (c) 2016 John A. Regan (john.a.regan@durham.ac.uk)
+#                    Tom Theuns (tom.theuns@durham.ac.uk)
+#               2017 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+#                    Bert Vandenbroucke (bert.vandenbroucke@gmail.com)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+##############################################################################
 
 import h5py
 import sys
-import numpy
+import numpy as np
 import math
 import random
-import matplotlib.pyplot as plt
 
 # Generates a disc-patch in hydrostatic equilibrium
-# see Creasey, Theuns & Bower, 2013, for the equations:
-# disc parameters are: surface density sigma
-#                      scale height b
-# density: rho(z) = (sigma/2b) sech^2(z/b)
-# isothermal velocity dispersion = <v_z^2? = b pi G sigma
-# grad potential  = 2 pi G sigma tanh(z/b)
-# potential       = ln(cosh(z/b)) + const
-# Dynamical time  = sqrt(b / (G sigma))
-# to obtain the 1/ch^2(z/b) profile from a uniform profile (a glass, say, or a uniform random variable), note that, when integrating in z
-# \int 0^z dz/ch^2(z) = tanh(z)-tanh(0) = \int_0^x dx = x (where the last integral refers to a uniform density distribution), so that z = atanh(x)
-# usage: python makeIC.py 1000 
-
-# physical constants in cgs
-NEWTON_GRAVITY_CGS  = 6.672e-8
-SOLAR_MASS_IN_CGS   = 1.9885e33
-PARSEC_IN_CGS       = 3.0856776e18
-PROTON_MASS_IN_CGS  = 1.6726231e24
-YEAR_IN_CGS         = 3.154e+7
-
-# choice of units
-const_unit_length_in_cgs   =   (PARSEC_IN_CGS)
-const_unit_mass_in_cgs     =   (SOLAR_MASS_IN_CGS)
-const_unit_velocity_in_cgs =   (1e5)
+#
+# See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948
+#
+#
+# Disc parameters are: surface density  -- sigma
+#                      scale height -- b
+#                      gas adiabatic index -- gamma
+#
+# Problem parameters are: Ratio height/width of the box -- z_factor
+#                         Size of the patch -- side_length
+
+# Parameters of the gas disc
+surface_density = 10.
+scale_height    = 100.
+gas_gamma       = 5./3.
 
-print "UnitMass_in_cgs:     ", const_unit_mass_in_cgs 
-print "UnitLength_in_cgs:   ", const_unit_length_in_cgs
-print "UnitVelocity_in_cgs: ", const_unit_velocity_in_cgs
+# Parameters of the problem
+x_factor        = 2
+side_length     = 400.
 
+# File
+fileName = "Disc-Patch.hdf5"
 
-# parameters of potential
-surface_density = 100. # surface density of all mass, which generates the gravitational potential
-scale_height    = 100.
-gamma           = 5./3.
-fgas            = 0.1  # gas fraction
-
-# derived units
-const_unit_time_in_cgs = (const_unit_length_in_cgs / const_unit_velocity_in_cgs)
-const_G                = ((NEWTON_GRAVITY_CGS*const_unit_mass_in_cgs*const_unit_time_in_cgs*const_unit_time_in_cgs/(const_unit_length_in_cgs*const_unit_length_in_cgs*const_unit_length_in_cgs)))
-print 'G=', const_G
-utherm                 = math.pi * const_G * surface_density * scale_height / (gamma-1)
-v_disp                 = numpy.sqrt(2 * utherm)
-soundspeed             = numpy.sqrt(utherm / (gamma * (gamma-1.)))
-t_dyn                  = numpy.sqrt(scale_height / (const_G * surface_density))
-t_cross                = scale_height / soundspeed
-print 'dynamical time = ',t_dyn,' sound crossing time = ',t_cross,' sound speed= ',soundspeed,' 3D velocity dispersion = ',v_disp,' thermal_energy= ',utherm
-
-
-# Parameters
-periodic= 1            # 1 For periodic box
-boxSize = 400.         #  [kpc]
-Radius  = 100.         # maximum radius of particles [kpc]
-G       = const_G 
+####################################################################
 
-# File
-fileName = "Disc-Patch.hdf5" 
-
-#---------------------------------------------------
-mass           = 1
-
-#--------------------------------------------------
-
-
-# using glass ICs
-# read glass file and generate gas positions and tile it ntile times in each dimension
-ntile   = 1
-inglass = 'glassCube_32.hdf5'
-infile  = h5py.File(inglass, "r")
-one_glass_p = infile["/PartType0/Coordinates"][:,:]
-one_glass_h = infile["/PartType0/SmoothingLength"][:]
-
-# scale in [-0.5,0.5]*BoxSize / ntile
-one_glass_p[:,:] -= 0.5
-one_glass_p      *= boxSize / ntile
-one_glass_h      *= boxSize / ntile
-ndens_glass       = (one_glass_h.shape[0]) / (boxSize/ntile)**3
-h_glass           = numpy.amin(one_glass_h) * (boxSize/ntile)
-
-glass_p = []
-glass_h = []
-for ix in range(0,ntile):
-    for iy in range(0,ntile):
-        for iz in range(0,ntile):
-            shift = one_glass_p.copy()
-            shift[:,0] += (ix-(ntile-1)/2.) * boxSize / ntile
-            shift[:,1] += (iy-(ntile-1)/2.) * boxSize / ntile
-            shift[:,2] += (iz-(ntile-1)/2.) * boxSize / ntile
-            glass_p.append(shift)
-            glass_h.append(one_glass_h.copy())
-
-glass_p = numpy.concatenate(glass_p, axis=0)
-glass_h = numpy.concatenate(glass_h, axis=0)
-
-# random shuffle of glas ICs
-numpy.random.seed(12345)
-indx   = numpy.random.rand(numpy.shape(glass_h)[0])
-indx   = numpy.argsort(indx)
-glass_p = glass_p[indx, :]
-glass_h = glass_h[indx]
-
-# select numGas of them
-numGas = 8192
-pos    = glass_p[0:numGas,:]
-h      = glass_h[0:numGas]
-numGas = numpy.shape(pos)[0]
-
-# compute furthe properties of ICs
-column_density = fgas * surface_density * numpy.tanh(boxSize/2./scale_height)
-enclosed_mass  = column_density * boxSize * boxSize
-pmass          = enclosed_mass / numGas
-meanrho        = enclosed_mass / boxSize**3
-print 'pmass= ',pmass,' mean(rho) = ', meanrho,' entropy= ', (gamma-1) * utherm / meanrho**(gamma-1)
-
-# desired density
-rho            = surface_density / (2. * scale_height) / numpy.cosh(abs(pos[:,2])/scale_height)**2
-u              = (1. + 0 * h) * utherm 
-entropy        = (gamma-1) * u / rho**(gamma-1)
-mass           = 0.*h + pmass
-entropy_flag   = 0
-vel            = 0 + 0 * pos
-
-# move centre of disc to middle of box
-pos[:,:]     += boxSize/2
-
-
-# create numPart dm particles
-numPart = 0
+# physical constants in cgs
+NEWTON_GRAVITY_CGS  = 6.67408e-8
+SOLAR_MASS_IN_CGS   = 1.9885e33
+PARSEC_IN_CGS       = 3.08567758149e18
+PROTON_MASS_IN_CGS  = 1.672621898e-24
+BOLTZMANN_IN_CGS    = 1.38064852e-16
+YEAR_IN_CGS         = 3.15569252e7
 
+# choice of units
+unit_length_in_cgs   =   (PARSEC_IN_CGS)
+unit_mass_in_cgs     =   (SOLAR_MASS_IN_CGS)
+unit_velocity_in_cgs =   (1e5)
+unit_time_in_cgs     =   unit_length_in_cgs / unit_velocity_in_cgs
+
+print "UnitMass_in_cgs:     %.5e"%unit_mass_in_cgs
+print "UnitLength_in_cgs:   %.5e"%unit_length_in_cgs
+print "UnitVelocity_in_cgs: %.5e"%unit_velocity_in_cgs
+print "UnitTime_in_cgs:     %.5e"%unit_time_in_cgs
+print ""
+
+# Derived units
+const_G  = NEWTON_GRAVITY_CGS * unit_mass_in_cgs * unit_time_in_cgs**2 * \
+           unit_length_in_cgs**-3
+const_mp = PROTON_MASS_IN_CGS * unit_mass_in_cgs**-1
+const_kb = BOLTZMANN_IN_CGS * unit_mass_in_cgs**-1 * unit_length_in_cgs**-2 * \
+           unit_time_in_cgs**2
+
+print "--- Some constants [internal units] ---"
+print "G_Newton:    %.5e"%const_G
+print "m_proton:    %.5e"%const_mp
+print "k_boltzmann: %.5e"%const_kb
+print ""
+
+# derived quantities
+temp       = math.pi * const_G * surface_density * scale_height * const_mp / \
+             const_kb
+u_therm    = const_kb * temp / ((gas_gamma-1) * const_mp)
+v_disp     = math.sqrt(2 * u_therm)
+soundspeed = math.sqrt(u_therm / (gas_gamma * (gas_gamma-1.)))
+t_dyn      = math.sqrt(scale_height / (const_G * surface_density))
+t_cross    = scale_height / soundspeed
+
+print "--- Properties of the gas [internal units] ---"
+print "Gas temperature:     %.5e"%temp
+print "Gas thermal_energy:  %.5e"%u_therm
+print "Dynamical time:      %.5e"%t_dyn
+print "Sound crossing time: %.5e"%t_cross
+print "Gas sound speed:     %.5e"%soundspeed
+print "Gas 3D vel_disp:     %.5e"%v_disp
+print ""
+
+# Problem properties
+boxSize_x = side_length
+boxSize_y = boxSize_x
+boxSize_z = boxSize_x
+boxSize_x *= x_factor
+volume = boxSize_x * boxSize_y * boxSize_z
+M_tot = boxSize_y * boxSize_z * surface_density * \
+        math.tanh(boxSize_x / (2. * scale_height))
+density = M_tot / volume
+entropy = (gas_gamma - 1.) * u_therm / density**(gas_gamma - 1.)
+
+print "--- Problem properties [internal units] ---"
+print "Box:        [%.1f, %.1f, %.1f]"%(boxSize_x, boxSize_y, boxSize_z)
+print "Volume:     %.5e"%volume
+print "Total mass: %.5e"%M_tot
+print "Density:    %.5e"%density
+print "Entropy:    %.5e"%entropy
+print ""
+
+####################################################################
+
+# Read glass pre-ICs
+infile  = h5py.File('glassCube_32.hdf5', "r")
+one_glass_pos = infile["/PartType0/Coordinates"][:,:]
+one_glass_h   = infile["/PartType0/SmoothingLength"][:]
+
+# Rescale to the problem size
+one_glass_pos *= side_length
+one_glass_h   *= side_length
+
+# Now create enough copies to fill the volume in x
+pos = np.copy(one_glass_pos)
+h = np.copy(one_glass_h)
+for i in range(1, x_factor):
+    one_glass_pos[:,0] += side_length
+    pos = np.append(pos, one_glass_pos, axis=0)
+    h   = np.append(h, one_glass_h, axis=0)
+
+# Compute further properties of ICs
+numPart = np.size(h)
+mass = M_tot / numPart
+
+print "--- Particle properties [internal units] ---"
+print "Number part.: ", numPart
+print "Part. mass:   %.5e"%mass
+print ""
+
+# Create additional arrays
+u    = np.ones(numPart) * u_therm
+mass = np.ones(numPart) * mass
+vel  = np.zeros((numPart, 3))
+ids  = 1 + np.linspace(0, numPart, numPart, endpoint=False)
+
+####################################################################
 # Create and write output file
 
 #File
@@ -160,97 +164,45 @@ file = h5py.File(fileName, 'w')
 
 #Units
 grp = file.create_group("/Units")
-grp.attrs["Unit length in cgs (U_L)"] = const_unit_length_in_cgs
-grp.attrs["Unit mass in cgs (U_M)"] = const_unit_mass_in_cgs 
-grp.attrs["Unit time in cgs (U_t)"] = const_unit_length_in_cgs / const_unit_velocity_in_cgs
+grp.attrs["Unit length in cgs (U_L)"] = unit_length_in_cgs
+grp.attrs["Unit mass in cgs (U_M)"] = unit_mass_in_cgs
+grp.attrs["Unit time in cgs (U_t)"] = unit_time_in_cgs
 grp.attrs["Unit current in cgs (U_I)"] = 1.
 grp.attrs["Unit temperature in cgs (U_T)"] = 1.
 
 # Header
 grp = file.create_group("/Header")
-grp.attrs["BoxSize"] = boxSize
-grp.attrs["NumPart_Total"] =  [numGas, numPart, 0, 0, 0, 0]
+grp.attrs["BoxSize"] = [boxSize_x, boxSize_y, boxSize_z]
+grp.attrs["NumPart_Total"] =  [numPart, 0, 0, 0, 0, 0]
 grp.attrs["NumPart_Total_HighWord"] = [0, 0, 0, 0, 0, 0]
-grp.attrs["NumPart_ThisFile"] = [numGas, numPart, 0, 0, 0, 0]
+grp.attrs["NumPart_ThisFile"] = [numPart, 0, 0, 0, 0, 0]
 grp.attrs["Time"] = 0.0
 grp.attrs["NumFilesPerSnapshot"] = 1
 grp.attrs["MassTable"] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-grp.attrs["Flag_Entropy_ICs"] = [entropy_flag]
+grp.attrs["Flag_Entropy_ICs"] = [0, 0, 0, 0, 0, 0]
 grp.attrs["Dimension"] = 3
 
 #Runtime parameters
 grp = file.create_group("/RuntimePars")
-grp.attrs["PeriodicBoundariesOn"] = periodic
-
+grp.attrs["PeriodicBoundariesOn"] = 1
 
 # write gas particles
 grp0   = file.create_group("/PartType0")
 
-ds     = grp0.create_dataset('Coordinates', (numGas, 3), 'f')
-ds[()] = pos
-
-ds     = grp0.create_dataset('Velocities', (numGas, 3), 'f')
-ds[()] = vel
-
-ds     = grp0.create_dataset('Masses', (numGas,), 'f')
-ds[()] = mass
-
-ds     = grp0.create_dataset('SmoothingLength', (numGas,), 'f')
-ds[()] = h
-
-ds = grp0.create_dataset('InternalEnergy', (numGas,), 'f')
-u = numpy.full((numGas, ), utherm)
-if (entropy_flag == 1):
-    ds[()] = entropy
-else:
-    ds[()] = u    
-
-ids = 1 + numpy.linspace(0, numGas, numGas, endpoint=False)
-ds = grp0.create_dataset('ParticleIDs', (numGas, ), 'L')
-ds[()] = ids
-
-print "Internal energy:", u[0]
-
-# generate dark matter particles if needed
-if(numPart > 0):
-    
-    # set seed for random number
-    numpy.random.seed(1234)
-    
-    grp1 = file.create_group("/PartType1")
-    
-    radius = Radius * (numpy.random.rand(N))**(1./3.) 
-    ctheta = -1. + 2 * numpy.random.rand(N)
-    stheta = numpy.sqrt(1.-ctheta**2)
-    phi    =  2 * math.pi * numpy.random.rand(N)
-    r      = numpy.zeros((numPart, 3))
-
-    speed  = vrot
-    v      = numpy.zeros((numPart, 3))
-    omega  = speed / radius
-    period = 2.*math.pi/omega
-    print 'period = minimum = ',min(period), ' maximum = ',max(period)
-    
-    v[:,0] = -omega * r[:,1]
-    v[:,1] =  omega * r[:,0]
-    
-    ds = grp1.create_dataset('Coordinates', (numPart, 3), 'd')
-    ds[()] = r
-    
-    ds = grp1.create_dataset('Velocities', (numPart, 3), 'f')
-    ds[()] = v
-    v = numpy.zeros(1)
-    
-    m = numpy.full((numPart, ),10)
-    ds = grp1.create_dataset('Masses', (numPart,), 'f')
-    ds[()] = m
-    m = numpy.zeros(1)
-        
-    ids = 1 + numpy.linspace(0, numPart, numPart, endpoint=False, dtype='L')
-    ds = grp1.create_dataset('ParticleIDs', (numPart, ), 'L')
-    ds[()] = ids
-
-
-file.close()
-
-sys.exit()
+ds = grp0.create_dataset('Coordinates', (numPart, 3), 'f', data=pos)
+ds = grp0.create_dataset('Velocities', (numPart, 3), 'f')
+ds = grp0.create_dataset('Masses', (numPart,), 'f', data=mass)
+ds = grp0.create_dataset('SmoothingLength', (numPart,), 'f', data=h)
+ds = grp0.create_dataset('InternalEnergy', (numPart,), 'f', data=u)
+ds = grp0.create_dataset('ParticleIDs', (numPart, ), 'L', data=ids)
+
+####################################################################
+
+print "--- Runtime parameters (YAML file): ---"
+print "DiscPatchPotential:surface_density:    ", surface_density
+print "DiscPatchPotential:scale_height:       ", scale_height
+print "DiscPatchPotential:x_disc:             ", 0.5 * boxSize_x
+print ""
+
+print "--- Constant parameters: ---"
+print "const_isothermal_internal_energy: %ef"%u_therm
diff --git a/examples/DiscPatch/HydroStatic/plot.py b/examples/DiscPatch/HydroStatic/plotSolution.py
similarity index 58%
rename from examples/DiscPatch/HydroStatic/plot.py
rename to examples/DiscPatch/HydroStatic/plotSolution.py
index 2de749f9e3b3c287390218e09ea347d660f9ce8a..681f7d8ab3f2320b5de75e688edcb92efef9d883 100644
--- a/examples/DiscPatch/HydroStatic/plot.py
+++ b/examples/DiscPatch/HydroStatic/plotSolution.py
@@ -1,6 +1,7 @@
 ################################################################################
 # This file is part of SWIFT.
 # Copyright (c) 2017 Bert Vandenbroucke (bert.vandenbroucke@gmail.com)
+#                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published
@@ -20,7 +21,7 @@
 ##
 # This script plots the Disc-Patch_*.hdf5 snapshots.
 # It takes two (optional) parameters: the counter value of the first and last
-# snapshot to plot (default: 0 81).
+# snapshot to plot (default: 0 21).
 ##
 
 import numpy as np
@@ -34,12 +35,14 @@ import sys
 # Parameters
 surface_density = 10.
 scale_height = 100.
-z_disc = 200.
-utherm = 20.2615290634
+x_disc = 400.
+x_trunc = 300.
+x_max = 350.
+utherm = 20.2678457288
 gamma = 5. / 3.
 
 start = 0
-stop = 81
+stop = 21
 if len(sys.argv) > 1:
   start = int(sys.argv[1])
 if len(sys.argv) > 2:
@@ -48,14 +51,14 @@ if len(sys.argv) > 2:
 # Get the analytic solution for the density
 def get_analytic_density(x):
   return 0.5 * surface_density / scale_height / \
-           np.cosh( (x - z_disc) / scale_height )**2
+           np.cosh( (x - x_disc) / scale_height )**2
 
 # Get the analytic solution for the (isothermal) pressure
 def get_analytic_pressure(x):
   return (gamma - 1.) * utherm * get_analytic_density(x)
 
 # Get the data fields to plot from the snapshot file with the given name:
-#  snapshot time, z-coord, density, pressure, velocity norm
+#  snapshot time, x-coord, density, pressure, velocity norm
 def get_data(name):
   file = h5py.File(name, "r")
   coords = np.array(file["/PartType0/Coordinates"])
@@ -67,7 +70,7 @@ def get_data(name):
 
   vtot = np.sqrt( v[:,0]**2 + v[:,1]**2 + v[:,2]**2 )
 
-  return float(file["/Header"].attrs["Time"]), coords[:,2], rho, P, vtot
+  return float(file["/Header"].attrs["Time"]), coords[:,0], rho, P, vtot
 
 # scan the folder for snapshot files and plot all of them (within the requested
 # range)
@@ -78,23 +81,38 @@ for f in sorted(glob.glob("Disc-Patch_*.hdf5")):
 
   print "processing", f, "..."
 
-  zrange = np.linspace(0., 400., 1000)
-  time, z, rho, P, v = get_data(f)
+  xrange = np.linspace(0., 2. * x_disc, 1000)
+  time, x, rho, P, v = get_data(f)
 
   fig, ax = pl.subplots(3, 1, sharex = True)
 
-  ax[0].plot(z, rho, "r.")
-  ax[0].plot(zrange, get_analytic_density(zrange), "k-")
+  ax[0].plot(x, rho, "r.")
+  ax[0].plot(xrange, get_analytic_density(xrange), "k-")
+  ax[0].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5)
+  ax[0].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5)
+  ax[0].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[0].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[0].set_ylim(0., 1.2 * get_analytic_density(x_disc))
   ax[0].set_ylabel("density")
 
-  ax[1].plot(z, v, "r.")
-  ax[1].plot(zrange, np.zeros(len(zrange)), "k-")
+  ax[1].plot(x, v, "r.")
+  ax[1].plot(xrange, np.zeros(len(xrange)), "k-")
+  ax[1].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5)
+  ax[1].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5)
+  ax[1].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[1].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[1].set_ylim(-0.5, 10.)
   ax[1].set_ylabel("velocity norm")
 
-  ax[2].plot(z, P, "r.")
-  ax[2].plot(zrange, get_analytic_pressure(zrange), "k-")
-  ax[2].set_xlim(0., 400.)
-  ax[2].set_xlabel("z")
+  ax[2].plot(x, P, "r.")
+  ax[2].plot(xrange, get_analytic_pressure(xrange), "k-")
+  ax[2].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5)
+  ax[2].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5)
+  ax[2].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[2].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[2].set_xlim(0., 2. * x_disc)
+  ax[2].set_ylim(0., 1.2 * get_analytic_pressure(x_disc))
+  ax[2].set_xlabel("x")
   ax[2].set_ylabel("pressure")
 
   pl.suptitle("t = {0:.2f}".format(time))
diff --git a/examples/DiscPatch/HydroStatic/run.sh b/examples/DiscPatch/HydroStatic/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e1f47ecad54e7e171d78b7da080d56579e985d1e
--- /dev/null
+++ b/examples/DiscPatch/HydroStatic/run.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Generate the initial conditions if they are not present.
+if [ ! -e glassCube_32.hdf5 ]
+then
+    echo "Fetching initial glass file for the disc patch example..."
+    ./getGlass.sh
+fi
+if [ ! -e Disc-Patch.hdf5 ]
+then
+    echo "Generating initial conditions for the disc patch example..."
+    python makeIC.py
+fi
+
+# Run SWIFT
+../../swift -g -s -t 4 disc-patch-icc.yml 2>&1 | tee output.log
+
+python plotSolution.py
diff --git a/examples/DiscPatch/HydroStatic/test.pro b/examples/DiscPatch/HydroStatic/test.pro
deleted file mode 100644
index 950aebc65d7d34cd7aaeb2368734e5492902a912..0000000000000000000000000000000000000000
--- a/examples/DiscPatch/HydroStatic/test.pro
+++ /dev/null
@@ -1,142 +0,0 @@
-;
-;  test energy / angular momentum conservation of test problem
-;
-
-iplot = 1 ; if iplot = 1, make plot of E/Lz conservation, else, simply compare final and initial energy
-
-; set physical constants
-@physunits
-
-indir    = './'
-basefile = 'Disc-Patch_'
-
-; set properties of potential
-uL   = phys.pc                  ; unit of length
-uM   = phys.msun                ; unit of mass
-uV   = 1d5                      ; unit of velocity
-
-; properties of patch
-surface_density = 10.
-scale_height    = 100.
-
-; derived units
-constG   = 10.^(alog10(phys.g)+alog10(uM)-2d0*alog10(uV)-alog10(uL)) ;
-pcentre  = [0.,0.,200.] * pc / uL
-
-;
-infile = indir + basefile + '*'
-spawn,'ls -1 '+infile,res
-nfiles = n_elements(res)
-
-
-; choose: calculate change of energy and Lz, comparing first and last
-; snapshots for all particles, or do so for a subset
-
-; compare all
-ifile   = 0
-inf     = indir + basefile + strtrim(string(ifile,'(i3.3)'),1) + '.hdf5'
-id      = h5rd(inf,'PartType0/ParticleIDs')
-nfollow = n_elements(id)
-
-; follow a subset
-; nfollow  = min(4000, nfollow)   ; number of particles to follow
-
-;
-if (iplot eq 1) then begin
-   nskip = 1
-   nsave = nfiles
-endif else begin
-   nskip = nfiles - 2
-   nsave = 2
-endelse
-
-;
-lout     = fltarr(nfollow, nsave) ; Lz
-xout     = fltarr(nfollow, nsave) ; x
-yout     = fltarr(nfollow, nsave) ; y
-zout     = fltarr(nfollow, nsave) ; z
-vzout    = fltarr(nfollow, nsave) ; z
-rout     = fltarr(nfollow, nsave) ; rho
-hout     = fltarr(nfollow, nsave) ; h
-uout     = fltarr(nfollow, nsave) ; thermal energy
-eout     = fltarr(nfollow, nsave) ; energies
-ekin     = fltarr(nfollow, nsave)
-epot     = fltarr(nfollow, nsave) ; 2 pi G Sigma b ln(cosh(z/b)) + const
-tout     = fltarr(nsave)
-
-ifile  = 0
-isave = 0
-for ifile=0,nfiles-1,nskip do begin
-   inf    = indir + basefile + strtrim(string(ifile,'(i3.3)'),1) + '.hdf5'
-   time   = h5ra(inf, 'Header','Time')
-   p      = h5rd(inf,'PartType0/Coordinates')
-   v      = h5rd(inf,'PartType0/Velocities')
-   id     = h5rd(inf,'PartType0/ParticleIDs')
-   rho    = h5rd(inf,'PartType0/Density')
-   h      = h5rd(inf,'PartType0/SmoothingLength')
-   utherm = h5rd(inf,'PartType0/InternalEnergy')
-   indx   = sort(id)
-
-;  if you want to sort particles by ID
-   id     = id[indx]
-   rho    = rho[indx]
-   utherm = utherm[indx]
-   h      = h[indx]
-   for ic=0,2 do begin
-      tmp = reform(p[ic,*]) & p[ic,*] = tmp[indx]
-      tmp = reform(v[ic,*]) & v[ic,*] = tmp[indx]
-   endfor
-
-; calculate energy
-   dd  = size(p,/dimen) & npart = dd[1]
-   ener = fltarr(npart)
-   dr   = fltarr(npart) & dv = dr
-   for ic=0,2 do dr[*] = dr[*] + (p[ic,*]-pcentre[ic])^2
-   for ic=0,2 do dv[*] = dv[*] + v[ic,*]^2
-   xout[*,isave] = p[0,0:nfollow-1]-pcentre[0]
-   yout[*,isave] = p[1,0:nfollow-1]-pcentre[1]
-   zout[*,isave] = p[2,0:nfollow-1]-pcentre[2]
-   vzout[*,isave]= v[2,0:nfollow-1]
-   rout[*,isave] = rho[0:nfollow-1]
-   hout[*,isave] = h[0:nfollow-1]
-   uout[*,isave] = utherm[0:nfollow-1]
-   Lz  = (p[0,*]-pcentre[0]) * v[1,*] - (p[1,*]-pcentre[1]) * v[0,*]
-   dz  = reform(p[2,0:nfollow-1]-pcentre[2])
-;   print,'time = ',time,p[0,0],v[0,0],id[0]
-   ek   = 0.5 * dv
-   ep   = fltarr(nfollow)
-   ep   = 2 * !pi * constG * surface_density * scale_height * alog(cosh(abs(dz)/scale_height))
-   ener = ek + ep
-   tout(isave) = time
-   lout[*,isave] = lz[0:nfollow-1]
-   eout(*,isave) = ener[0:nfollow-1]
-   ekin(*,isave) = ek[0:nfollow-1]
-   epot(*,isave) = ep[0:nfollow-1]
-   print,format='('' time= '',f7.1,'' E= '',f9.2,'' Lz= '',e9.2)', time,eout[0],lz[0]
-   isave = isave + 1
-   
-endfor
-
-x0 = reform(xout[0,*])
-y0 = reform(xout[1,*])
-z0 = reform(xout[2,*])
-
-
-; plot density profile and compare to analytic profile
-nplot = nfollow
-
-                                ; plot density profile
-wset,0
-xr   = [0, 3*scale_height]
-nbins = 100
-zpos  = findgen(nbins)/float(nbins-1) * max(xr)
-dens  = (surface_density/(2.d0*scale_height)) * 1./cosh(zpos/scale_height)^2
-plot,[0],[0],xr=xr,/xs,yr=[0,max(dens)*1.4],/ys,/nodata,xtitle='|z|',ytitle='density'
-oplot,zpos,dens,color=black,thick=3
-;oplot,abs(zout[*,1]),rout[*,1],psym=3 ; initial profile
-oplot,abs(zout[*,nsave-1]),rout[*,nsave-1],psym=3,color=red
-
-
-end
-
-
diff --git a/examples/DiscPatch/HydroStatic_1D/disc-patch-icc.yml b/examples/DiscPatch/HydroStatic_1D/disc-patch-icc.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6f17cfbb1e0125faf8e47fe4e9e55bfdf4df7b71
--- /dev/null
+++ b/examples/DiscPatch/HydroStatic_1D/disc-patch-icc.yml
@@ -0,0 +1,46 @@
+# Define the system of units to use internally. 
+InternalUnitSystem:
+  UnitMass_in_cgs:     1.9885e33         # Grams
+  UnitLength_in_cgs:   3.08567758149e18  # Centimeters
+  UnitVelocity_in_cgs: 1e5               # Centimeters per second
+  UnitCurrent_in_cgs:  1   # Amperes
+  UnitTemp_in_cgs:     1   # Kelvin
+
+# Parameters governing the time integration
+TimeIntegration:
+  time_begin: 0     # The starting time of the simulation (in internal units).
+  time_end:   968.  # The end time of the simulation (in internal units).
+  dt_min:     1e-4  # The minimal time-step size of the simulation (in internal units).
+  dt_max:     10.   # The maximal time-step size of the simulation (in internal units).
+
+# Parameters governing the conserved quantities statistics
+Statistics:
+  delta_time:          12. # Time between statistics output
+  
+# Parameters governing the snapshots
+Snapshots:
+  basename:    Disc-Patch   # Common part of the name of output files
+  time_first:  0.           # Time of the first output (in internal units)
+  delta_time:  48.          # Time difference between outputs (in internal units)
+
+# Parameters for the hydrodynamics scheme
+SPH:
+  resolution_eta:        1.2349   # Target smoothing length in units of the mean inter-particle separation (1.2349 == 48Ngbs with the cubic spline kernel).
+  delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
+  CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
+  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
+  h_max:                 60.      # Maximal smoothing length allowed (in internal units).
+
+# Parameters related to the initial conditions
+InitialConditions:
+  file_name:  Disc-Patch.hdf5       # The file to read
+
+# External potential parameters
+DiscPatchPotential:
+  surface_density: 10.
+  scale_height:    100.
+  x_disc:          400.
+  x_trunc:         300.
+  x_max:           350.
+  timestep_mult:   0.03
+  growth_time:     5.
diff --git a/examples/DiscPatch/HydroStatic_1D/makeIC.py b/examples/DiscPatch/HydroStatic_1D/makeIC.py
new file mode 100644
index 0000000000000000000000000000000000000000..1589dfc8c73e5b9bf3c2cad4bcf3029654d9e67e
--- /dev/null
+++ b/examples/DiscPatch/HydroStatic_1D/makeIC.py
@@ -0,0 +1,194 @@
+###############################################################################
+# This file is part of SWIFT.
+# Copyright (c) 2016 John A. Regan (john.a.regan@durham.ac.uk)
+#                    Tom Theuns (tom.theuns@durham.ac.uk)
+#               2017 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+#                    Bert Vandenbroucke (bert.vandenbroucke@gmail.com)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+##############################################################################
+
+import h5py
+import sys
+import numpy as np
+import math
+import random
+
+# Generates a disc-patch in hydrostatic equilibrium
+#
+# See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948
+#
+#
+# Disc parameters are: surface density  -- sigma
+#                      scale height -- b
+#                      gas adiabatic index -- gamma
+#
+# Problem parameters are: Ratio height/width of the box -- z_factor
+#                         Size of the patch -- side_length
+
+# Parameters of the gas disc
+surface_density = 10.
+scale_height    = 100.
+gas_gamma       = 5./3.
+
+# Parameters of the problem
+x_factor        = 2
+side_length     = 400.
+numPart         = 1000
+
+# File
+fileName = "Disc-Patch.hdf5"
+
+####################################################################
+
+# physical constants in cgs
+NEWTON_GRAVITY_CGS  = 6.67408e-8
+SOLAR_MASS_IN_CGS   = 1.9885e33
+PARSEC_IN_CGS       = 3.08567758149e18
+PROTON_MASS_IN_CGS  = 1.672621898e-24
+BOLTZMANN_IN_CGS    = 1.38064852e-16
+YEAR_IN_CGS         = 3.15569252e7
+
+# choice of units
+unit_length_in_cgs   =   (PARSEC_IN_CGS)
+unit_mass_in_cgs     =   (SOLAR_MASS_IN_CGS)
+unit_velocity_in_cgs =   (1e5)
+unit_time_in_cgs     =   unit_length_in_cgs / unit_velocity_in_cgs
+
+print "UnitMass_in_cgs:     %.5e"%unit_mass_in_cgs
+print "UnitLength_in_cgs:   %.5e"%unit_length_in_cgs
+print "UnitVelocity_in_cgs: %.5e"%unit_velocity_in_cgs
+print "UnitTime_in_cgs:     %.5e"%unit_time_in_cgs
+print ""
+
+# Derived units
+const_G  = NEWTON_GRAVITY_CGS * unit_mass_in_cgs * unit_time_in_cgs**2 * \
+           unit_length_in_cgs**-3
+const_mp = PROTON_MASS_IN_CGS * unit_mass_in_cgs**-1
+const_kb = BOLTZMANN_IN_CGS * unit_mass_in_cgs**-1 * unit_length_in_cgs**-2 * \
+           unit_time_in_cgs**2
+
+print "--- Some constants [internal units] ---"
+print "G_Newton:    %.5e"%const_G
+print "m_proton:    %.5e"%const_mp
+print "k_boltzmann: %.5e"%const_kb
+print ""
+
+# derived quantities
+temp       = math.pi * const_G * surface_density * scale_height * const_mp / \
+             const_kb
+u_therm    = const_kb * temp / ((gas_gamma-1) * const_mp)
+v_disp     = math.sqrt(2 * u_therm)
+soundspeed = math.sqrt(u_therm / (gas_gamma * (gas_gamma-1.)))
+t_dyn      = math.sqrt(scale_height / (const_G * surface_density))
+t_cross    = scale_height / soundspeed
+
+print "--- Properties of the gas [internal units] ---"
+print "Gas temperature:     %.5e"%temp
+print "Gas thermal_energy:  %.5e"%u_therm
+print "Dynamical time:      %.5e"%t_dyn
+print "Sound crossing time: %.5e"%t_cross
+print "Gas sound speed:     %.5e"%soundspeed
+print "Gas 3D vel_disp:     %.5e"%v_disp
+print ""
+
+# Problem properties
+boxSize_x = side_length
+boxSize_x *= x_factor
+volume = boxSize_x
+M_tot = surface_density * math.tanh(boxSize_x / (2. * scale_height))
+density = M_tot / volume
+entropy = (gas_gamma - 1.) * u_therm / density**(gas_gamma - 1.)
+
+print "--- Problem properties [internal units] ---"
+print "Box:        %.1f"%boxSize_x
+print "Volume:     %.5e"%volume
+print "Total mass: %.5e"%M_tot
+print "Density:    %.5e"%density
+print "Entropy:    %.5e"%entropy
+print ""
+
+####################################################################
+
+# Now create enough copies to fill the volume in x
+pos = np.zeros((numPart, 3))
+h = np.zeros(numPart) + 2. * boxSize_x / numPart
+for i in range(numPart):
+    pos[i, 0] = (i + 0.5) * boxSize_x / numPart
+
+# Compute further properties of ICs
+mass = M_tot / numPart
+
+print "--- Particle properties [internal units] ---"
+print "Number part.: ", numPart
+print "Part. mass:   %.5e"%mass
+print ""
+
+# Create additional arrays
+u    = np.ones(numPart) * u_therm
+mass = np.ones(numPart) * mass
+vel  = np.zeros((numPart, 3))
+ids  = 1 + np.linspace(0, numPart, numPart, endpoint=False)
+
+####################################################################
+# Create and write output file
+
+#File
+file = h5py.File(fileName, 'w')
+
+#Units
+grp = file.create_group("/Units")
+grp.attrs["Unit length in cgs (U_L)"] = unit_length_in_cgs
+grp.attrs["Unit mass in cgs (U_M)"] = unit_mass_in_cgs
+grp.attrs["Unit time in cgs (U_t)"] = unit_time_in_cgs
+grp.attrs["Unit current in cgs (U_I)"] = 1.
+grp.attrs["Unit temperature in cgs (U_T)"] = 1.
+
+# Header
+grp = file.create_group("/Header")
+grp.attrs["BoxSize"] = [boxSize_x, 1., 1.]
+grp.attrs["NumPart_Total"] =  [numPart, 0, 0, 0, 0, 0]
+grp.attrs["NumPart_Total_HighWord"] = [0, 0, 0, 0, 0, 0]
+grp.attrs["NumPart_ThisFile"] = [numPart, 0, 0, 0, 0, 0]
+grp.attrs["Time"] = 0.0
+grp.attrs["NumFilesPerSnapshot"] = 1
+grp.attrs["MassTable"] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+grp.attrs["Flag_Entropy_ICs"] = [0, 0, 0, 0, 0, 0]
+grp.attrs["Dimension"] = 1
+
+#Runtime parameters
+grp = file.create_group("/RuntimePars")
+grp.attrs["PeriodicBoundariesOn"] = 1
+
+# write gas particles
+grp0   = file.create_group("/PartType0")
+
+ds = grp0.create_dataset('Coordinates', (numPart, 3), 'f', data=pos)
+ds = grp0.create_dataset('Velocities', (numPart, 3), 'f')
+ds = grp0.create_dataset('Masses', (numPart,), 'f', data=mass)
+ds = grp0.create_dataset('SmoothingLength', (numPart,), 'f', data=h)
+ds = grp0.create_dataset('InternalEnergy', (numPart,), 'f', data=u)
+ds = grp0.create_dataset('ParticleIDs', (numPart, ), 'L', data=ids)
+
+####################################################################
+
+print "--- Runtime parameters (YAML file): ---"
+print "DiscPatchPotential:surface_density:    ", surface_density
+print "DiscPatchPotential:scale_height:       ", scale_height
+print "DiscPatchPotential:x_disc:             ", 0.5 * boxSize_x
+print ""
+
+print "--- Constant parameters: ---"
+print "const_isothermal_internal_energy: %ef"%u_therm
diff --git a/examples/DiscPatch/HydroStatic_1D/plotSolution.py b/examples/DiscPatch/HydroStatic_1D/plotSolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..681f7d8ab3f2320b5de75e688edcb92efef9d883
--- /dev/null
+++ b/examples/DiscPatch/HydroStatic_1D/plotSolution.py
@@ -0,0 +1,121 @@
+################################################################################
+# This file is part of SWIFT.
+# Copyright (c) 2017 Bert Vandenbroucke (bert.vandenbroucke@gmail.com)
+#                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+################################################################################
+
+##
+# This script plots the Disc-Patch_*.hdf5 snapshots.
+# It takes two (optional) parameters: the counter value of the first and last
+# snapshot to plot (default: 0 21).
+##
+
+import numpy as np
+import h5py
+import matplotlib
+matplotlib.use("Agg")
+import pylab as pl
+import glob
+import sys
+
+# Parameters
+surface_density = 10.
+scale_height = 100.
+x_disc = 400.
+x_trunc = 300.
+x_max = 350.
+utherm = 20.2678457288
+gamma = 5. / 3.
+
+start = 0
+stop = 21
+if len(sys.argv) > 1:
+  start = int(sys.argv[1])
+if len(sys.argv) > 2:
+  stop = int(sys.argv[2])
+
+# Get the analytic solution for the density
+def get_analytic_density(x):
+  return 0.5 * surface_density / scale_height / \
+           np.cosh( (x - x_disc) / scale_height )**2
+
+# Get the analytic solution for the (isothermal) pressure
+def get_analytic_pressure(x):
+  return (gamma - 1.) * utherm * get_analytic_density(x)
+
+# Get the data fields to plot from the snapshot file with the given name:
+#  snapshot time, x-coord, density, pressure, velocity norm
+def get_data(name):
+  file = h5py.File(name, "r")
+  coords = np.array(file["/PartType0/Coordinates"])
+  rho = np.array(file["/PartType0/Density"])
+  u = np.array(file["/PartType0/InternalEnergy"])
+  v = np.array(file["/PartType0/Velocities"])
+
+  P = (gamma - 1.) * rho * u
+
+  vtot = np.sqrt( v[:,0]**2 + v[:,1]**2 + v[:,2]**2 )
+
+  return float(file["/Header"].attrs["Time"]), coords[:,0], rho, P, vtot
+
+# scan the folder for snapshot files and plot all of them (within the requested
+# range)
+for f in sorted(glob.glob("Disc-Patch_*.hdf5")):
+  num = int(f[-8:-5])
+  if num < start or num > stop:
+    continue
+
+  print "processing", f, "..."
+
+  xrange = np.linspace(0., 2. * x_disc, 1000)
+  time, x, rho, P, v = get_data(f)
+
+  fig, ax = pl.subplots(3, 1, sharex = True)
+
+  ax[0].plot(x, rho, "r.")
+  ax[0].plot(xrange, get_analytic_density(xrange), "k-")
+  ax[0].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5)
+  ax[0].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5)
+  ax[0].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[0].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[0].set_ylim(0., 1.2 * get_analytic_density(x_disc))
+  ax[0].set_ylabel("density")
+
+  ax[1].plot(x, v, "r.")
+  ax[1].plot(xrange, np.zeros(len(xrange)), "k-")
+  ax[1].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5)
+  ax[1].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5)
+  ax[1].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[1].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[1].set_ylim(-0.5, 10.)
+  ax[1].set_ylabel("velocity norm")
+
+  ax[2].plot(x, P, "r.")
+  ax[2].plot(xrange, get_analytic_pressure(xrange), "k-")
+  ax[2].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5)
+  ax[2].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5)
+  ax[2].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[2].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5)
+  ax[2].set_xlim(0., 2. * x_disc)
+  ax[2].set_ylim(0., 1.2 * get_analytic_pressure(x_disc))
+  ax[2].set_xlabel("x")
+  ax[2].set_ylabel("pressure")
+
+  pl.suptitle("t = {0:.2f}".format(time))
+
+  pl.savefig("{name}.png".format(name = f[:-5]))
+  pl.close()
diff --git a/examples/DiscPatch/HydroStatic_1D/run.sh b/examples/DiscPatch/HydroStatic_1D/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e9d073a6cc7a06ec9ebd9fdb556c44778d32c7f4
--- /dev/null
+++ b/examples/DiscPatch/HydroStatic_1D/run.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Generate the initial conditions if they are not present.
+if [ ! -e Disc-Patch.hdf5 ]
+then
+    echo "Generating initial conditions for the disc patch example..."
+    python makeIC.py
+fi
+
+# Run SWIFT
+../../swift -g -s -t 4 disc-patch-icc.yml 2>&1 | tee output.log
+
+python plotSolution.py
diff --git a/examples/EAGLE_100/eagle_100.yml b/examples/EAGLE_100/eagle_100.yml
index a9b83b81f085e66b36d115c5265b66d6093ffdfb..1ea1518825debe56cb8462c4a1b398c03c257bfe 100644
--- a/examples/EAGLE_100/eagle_100.yml
+++ b/examples/EAGLE_100/eagle_100.yml
@@ -23,6 +23,12 @@ Snapshots:
 Statistics:
   delta_time:          1e-2 # Time between statistics output
 
+# Parameters for the self-gravity scheme
+Gravity:
+  eta:                   0.025    # Constant dimensionless multiplier for time integration. 
+  epsilon:               0.0001   # Softening length (in internal units).
+  theta:                 0.7      # Opening angle (Multipole acceptance criterion)
+  
 # Parameters for the hydrodynamics scheme
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
diff --git a/examples/EAGLE_12/eagle_12.yml b/examples/EAGLE_12/eagle_12.yml
index 6afffed0f9d39b34588b89569a85ab56223fc548..f56c330590ac25cc5b3fe8f68ed68aa1e94d6490 100644
--- a/examples/EAGLE_12/eagle_12.yml
+++ b/examples/EAGLE_12/eagle_12.yml
@@ -12,9 +12,6 @@ TimeIntegration:
   time_end:   1e-2  # The end time of the simulation (in internal units).
   dt_min:     1e-10 # The minimal time-step size of the simulation (in internal units).
   dt_max:     1e-4  # The maximal time-step size of the simulation (in internal units).
-
-Scheduler:
-  cell_split_size:     50
   
 # Parameters governing the snapshots
 Snapshots:
@@ -29,8 +26,8 @@ Statistics:
 # Parameters for the self-gravity scheme
 Gravity:
   eta:                   0.025    # Constant dimensionless multiplier for time integration.
+  epsilon:               0.001    # Softening length (in internal units).
   theta:                 0.7      # Opening angle (Multipole acceptance criterion)
-  epsilon:               0.0001   # Softening length (in internal units).
   
 # Parameters for the hydrodynamics scheme
 SPH:
diff --git a/examples/EAGLE_25/eagle_25.yml b/examples/EAGLE_25/eagle_25.yml
index c755768bcfafebf3efe6307080e9e85d3a0a4bf5..5dee9dad0b5d7f694c61fa4c983ead0f1cd6e5e2 100644
--- a/examples/EAGLE_25/eagle_25.yml
+++ b/examples/EAGLE_25/eagle_25.yml
@@ -27,8 +27,7 @@ Statistics:
 Gravity:
   eta:                   0.025    # Constant dimensionless multiplier for time integration. 
   epsilon:               0.0001   # Softening length (in internal units).
-  a_smooth:              1000.
-  r_cut:                 4.
+  theta:                 0.7      # Opening angle (Multipole acceptance criterion)
   
 # Parameters for the hydrodynamics scheme
 SPH:
diff --git a/examples/EAGLE_50/eagle_50.yml b/examples/EAGLE_50/eagle_50.yml
index b84b1eb7c362f85d8cd6a08ff2a15f72d1337396..898c28935abd02ec115ce107bdcfa4006c41dc48 100644
--- a/examples/EAGLE_50/eagle_50.yml
+++ b/examples/EAGLE_50/eagle_50.yml
@@ -23,6 +23,12 @@ Snapshots:
 Statistics:
   delta_time:          1e-2 # Time between statistics output
 
+# Parameters for the self-gravity scheme
+Gravity:
+  eta:                   0.025    # Constant dimensionless multiplier for time integration.
+  epsilon:               0.0001   # Softening length (in internal units).
+  theta:                 0.7      # Opening angle (Multipole acceptance criterion)
+
 # Parameters for the hydrodynamics scheme
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
diff --git a/examples/EAGLE_6/README b/examples/EAGLE_6/README
new file mode 100644
index 0000000000000000000000000000000000000000..9fe951252f1abf4e27264c6497ec14451080b01e
--- /dev/null
+++ b/examples/EAGLE_6/README
@@ -0,0 +1,13 @@
+ICs extracted from the EAGLE suite of simulations. 
+
+WARNING: These ICs correspond to a very small cosmological volume
+and are not representative of actual load-balancing of large runs.
+
+The particle distribution here is the snapshot 27 (z=0.1) of the 6.25Mpc
+Ref-model. h- and a- factors from the original Gadget code have been
+corrected for. Variables not used in a pure hydro & gravity code have
+been removed. 
+Everything is ready to be run without cosmological integration. 
+
+MD5 checksum of the ICs:
+a4efccd3646a60ad8600ac3a2895ea82  EAGLE_ICs_6.hdf5
diff --git a/examples/EAGLE_6/eagle_6.yml b/examples/EAGLE_6/eagle_6.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f55ecc856953d4cb60a86e3461625318a1757693
--- /dev/null
+++ b/examples/EAGLE_6/eagle_6.yml
@@ -0,0 +1,44 @@
+# Define the system of units to use internally. 
+InternalUnitSystem:
+  UnitMass_in_cgs:     1.989e43      # 10^10 M_sun in grams
+  UnitLength_in_cgs:   3.085678e24   # Mpc in centimeters
+  UnitVelocity_in_cgs: 1e5           # km/s in centimeters per second
+  UnitCurrent_in_cgs:  1             # Amperes
+  UnitTemp_in_cgs:     1             # Kelvin
+
+# Parameters governing the time integration
+TimeIntegration:
+  time_begin: 0.    # The starting time of the simulation (in internal units).
+  time_end:   1e-2  # The end time of the simulation (in internal units).
+  dt_min:     1e-10 # The minimal time-step size of the simulation (in internal units).
+  dt_max:     1e-4  # The maximal time-step size of the simulation (in internal units).
+
+Scheduler:
+  cell_split_size: 64
+  
+# Parameters governing the snapshots
+Snapshots:
+  basename:            eagle # Common part of the name of output files
+  time_first:          0.    # Time of the first output (in internal units)
+  delta_time:          1e-3  # Time difference between consecutive outputs (in internal units)
+
+# Parameters governing the conserved quantities statistics
+Statistics:
+  delta_time:          1e-2 # Time between statistics output
+
+# Parameters for the self-gravity scheme
+Gravity:
+  eta:                   0.025    # Constant dimensionless multiplier for time integration.
+  theta:                 0.7      # Opening angle (Multipole acceptance criterion)
+  epsilon:               0.0001   # Softening length (in internal units).
+  
+# Parameters for the hydrodynamics scheme
+SPH:
+  resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
+  delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
+  CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
+
+# Parameters related to the initial conditions
+InitialConditions:
+  file_name:  ./EAGLE_ICs_6.hdf5     # The file to read
+
diff --git a/examples/EAGLE_6/getIC.sh b/examples/EAGLE_6/getIC.sh
new file mode 100755
index 0000000000000000000000000000000000000000..08daa32a9b708532ab3e78924fb44f7c5dd06795
--- /dev/null
+++ b/examples/EAGLE_6/getIC.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+wget http://virgodb.cosma.dur.ac.uk/swift-webstorage/ICs/EAGLE_ICs_6.hdf5
diff --git a/examples/EAGLE_6/run.sh b/examples/EAGLE_6/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d8e5592467a115460bb455ab31bb5e1f4017a948
--- /dev/null
+++ b/examples/EAGLE_6/run.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+ # Generate the initial conditions if they are not present.
+if [ ! -e EAGLE_ICs_6.hdf5 ]
+then
+    echo "Fetching initial conditions for the EAGLE 6Mpc example..."
+    ./getIC.sh
+fi
+
+../swift -s -t 16 eagle_6.yml 2>&1 | tee output.log
+
diff --git a/examples/ExternalPointMass/energy_plot.py b/examples/ExternalPointMass/energy_plot.py
index 25640bcb5af2966dcd57efbe1a814bb18ac4f263..1863305614c226f64faac3d86fa2f809d49b9d74 100644
--- a/examples/ExternalPointMass/energy_plot.py
+++ b/examples/ExternalPointMass/energy_plot.py
@@ -34,7 +34,7 @@ import sys
 stats_filename = "./energy.txt"
 
 # First snapshot
-snap_filename = "pointMass_000.hdf5"
+snap_filename = "pointMass_0000.hdf5"
 f = h5.File(snap_filename,'r')
 
 # Read the units parameters from the snapshot
@@ -71,7 +71,7 @@ Lz_snap = np.zeros(402)
 
 # Read all the particles from the snapshots
 for i in range(402):
-    snap_filename = "pointMass_%0.3d.hdf5"%i
+    snap_filename = "pointMass_%0.4d.hdf5"%i
     f = h5.File(snap_filename,'r')
 
     pos_x = f["PartType1/Coordinates"][:,0]
diff --git a/examples/Gradients/run.sh b/examples/Gradients/run.sh
index cc1adc676427b257445f64a011ed8ebee87285ab..44c25ac5695175c40483d9f8b3bbd160b2fcbc0a 100755
--- a/examples/Gradients/run.sh
+++ b/examples/Gradients/run.sh
@@ -2,12 +2,12 @@
 
 python makeICs.py stretched
 ../swift -s -t 2 gradientsStretched.yml
-python plot.py gradients_stretched_001.hdf5 stretched
+python plot.py gradients_stretched_0001.hdf5 stretched
 
 python makeICs.py cartesian
 ../swift -s -t 2 gradientsCartesian.yml
-python plot.py gradients_cartesian_001.hdf5 cartesian
+python plot.py gradients_cartesian_0001.hdf5 cartesian
 
 python makeICs.py random
 ../swift -s -t 2 gradientsRandom.yml
-python plot.py gradients_random_001.hdf5 random
+python plot.py gradients_random_0001.hdf5 random
diff --git a/examples/GreshoVortex_2D/plotSolution.py b/examples/GreshoVortex_2D/plotSolution.py
index 7a86daa6a4e5e1dd80888ceac9a6eb6b08dff443..d497a6b297bf38b39cf85a9107a769c20f815b77 100644
--- a/examples/GreshoVortex_2D/plotSolution.py
+++ b/examples/GreshoVortex_2D/plotSolution.py
@@ -83,7 +83,7 @@ solution_s = solution_P / solution_rho**gas_gamma
 solution_u = solution_P /((gas_gamma - 1.)*solution_rho)
 
 # Read the simulation data
-sim = h5py.File("gresho_%03d.hdf5"%snap, "r")
+sim = h5py.File("gresho_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/HydrostaticHalo/density_profile.py b/examples/HydrostaticHalo/density_profile.py
index 5248587ec343d3c0ffe2cef0cbd8716b9a1e055c..a28b4d56a911c10afba07fcb25b377428eb4f857 100644
--- a/examples/HydrostaticHalo/density_profile.py
+++ b/examples/HydrostaticHalo/density_profile.py
@@ -42,7 +42,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "Hydrostatic_000.hdf5"
+filename = "Hydrostatic_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -63,7 +63,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS
 
 for i in range(n_snaps):
 
-    filename = "Hydrostatic_%03d.hdf5" %i
+    filename = "Hydrostatic_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/HydrostaticHalo/internal_energy_profile.py b/examples/HydrostaticHalo/internal_energy_profile.py
index f1be049adb8e972f89fd9ffe86106b1b9f3b19dc..f73fe4b70718054b29a7147b4ee3fa5b13539acf 100644
--- a/examples/HydrostaticHalo/internal_energy_profile.py
+++ b/examples/HydrostaticHalo/internal_energy_profile.py
@@ -60,7 +60,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "Hydrostatic_000.hdf5"
+filename = "Hydrostatic_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -79,7 +79,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS
 
 for i in range(n_snaps):
 
-    filename = "Hydrostatic_%03d.hdf5" %i
+    filename = "Hydrostatic_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/HydrostaticHalo/test_energy_conservation.py b/examples/HydrostaticHalo/test_energy_conservation.py
index 8368d475813d248ca93c12e46737b062752ab779..cc3e3da38d714f103b5f89c7eb713b64ddc6a8ec 100644
--- a/examples/HydrostaticHalo/test_energy_conservation.py
+++ b/examples/HydrostaticHalo/test_energy_conservation.py
@@ -38,7 +38,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "Hydrostatic_000.hdf5"
+filename = "Hydrostatic_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -62,7 +62,7 @@ time_array_cgs = []
 
 for i in range(n_snaps):
 
-    filename = "Hydrostatic_%03d.hdf5" %i
+    filename = "Hydrostatic_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/HydrostaticHalo/velocity_profile.py b/examples/HydrostaticHalo/velocity_profile.py
index f8f607362846a323937a9203dab8bc228f52a149..19ae4b9c3339a0fb2f2bf73fb6e60acb6d82ba7e 100644
--- a/examples/HydrostaticHalo/velocity_profile.py
+++ b/examples/HydrostaticHalo/velocity_profile.py
@@ -60,7 +60,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS)
 
 #read some header/parameter information from the first snapshot
 
-filename = "Hydrostatic_000.hdf5"
+filename = "Hydrostatic_0000.hdf5"
 f = h5.File(filename,'r')
 params = f["Parameters"]
 unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"])
@@ -79,7 +79,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS
 
 for i in range(n_snaps):
 
-    filename = "Hydrostatic_%03d.hdf5" %i
+    filename = "Hydrostatic_%04d.hdf5" %i
     f = h5.File(filename,'r')
     coords_dset = f["PartType0/Coordinates"]
     coords = np.array(coords_dset)
diff --git a/examples/IsothermalPotential/energy_plot.py b/examples/IsothermalPotential/energy_plot.py
index 0afa6fa93fa2a992e6ddeab3c9d33538c0b41de3..dab30715fbdaa0393f62c764ba552bbe4106325d 100644
--- a/examples/IsothermalPotential/energy_plot.py
+++ b/examples/IsothermalPotential/energy_plot.py
@@ -34,7 +34,7 @@ import sys
 stats_filename = "./energy.txt"
 
 # First snapshot
-snap_filename = "Isothermal_000.hdf5"
+snap_filename = "Isothermal_0000.hdf5"
 f = h5.File(snap_filename,'r')
 
 # Read the units parameters from the snapshot
@@ -70,7 +70,7 @@ Lz_snap = np.zeros(402)
 
 # Read all the particles from the snapshots
 for i in range(402):
-    snap_filename = "Isothermal_%0.3d.hdf5"%i
+    snap_filename = "Isothermal_%0.4d.hdf5"%i
     f = h5.File(snap_filename,'r')
 
     pos_x = f["PartType1/Coordinates"][:,0]
diff --git a/examples/KelvinHelmholtz_2D/plotSolution.py b/examples/KelvinHelmholtz_2D/plotSolution.py
index 9191f3ac7ec75c61d5fdab5d347c86222f787fab..77ab6fb244da25d13760f90653fac7eac11a0ee7 100644
--- a/examples/KelvinHelmholtz_2D/plotSolution.py
+++ b/examples/KelvinHelmholtz_2D/plotSolution.py
@@ -63,7 +63,7 @@ rc('font',**{'family':'sans-serif','sans-serif':['Times']})
 snap = int(sys.argv[1])
 
 # Read the simulation data
-sim = h5py.File("kelvinHelmholtz_%03d.hdf5"%snap, "r")
+sim = h5py.File("kelvinHelmholtz_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/Makefile.am b/examples/Makefile.am
index 1dd240fb6015fe5fdd2465cccb1bb221706efeed..5501601f95bde15484142e994dbf3d6fa475da98 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -60,9 +60,11 @@ EXTRA_DIST = BigCosmoVolume/makeIC.py \
 	     BigPerturbedBox/makeIC_fcc.py \
 	     CosmoVolume/cosmoVolume.yml CosmoVolume/getIC.sh CosmoVolume/run.sh \
 	     CoolingBox/coolingBox.yml CoolingBox/energy_plot.py CoolingBox/makeIC.py CoolingBox/run.sh \
+	     EAGLE_6/eagle_6.yml EAGLE_6/getIC.sh EAGLE_6/README EAGLE_6/run.sh \
 	     EAGLE_12/eagle_12.yml EAGLE_12/getIC.sh EAGLE_12/README EAGLE_12/run.sh \
 	     EAGLE_25/eagle_25.yml EAGLE_25/getIC.sh EAGLE_25/README EAGLE_25/run.sh \
 	     EAGLE_50/eagle_50.yml EAGLE_50/getIC.sh EAGLE_50/README EAGLE_50/run.sh \
+	     EAGLE_100/eagle_100.yml EAGLE_100/getIC.sh EAGLE_100/README EAGLE_100/run.sh \
 	     ExternalPointMass/externalPointMass.yml ExternalPointMass/makeIC.py ExternalPointMass/run.sh ExternalPointMass/energy_plot.py \
 	     GreshoVortex_2D/getGlass.sh GreshoVortex_2D/gresho.yml GreshoVortex_2D/makeIC.py GreshoVortex_2D/plotSolution.py GreshoVortex_2D/run.sh \
 	     HydrostaticHalo/README HydrostaticHalo/hydrostatic.yml HydrostaticHalo/makeIC.py HydrostaticHalo/run.sh \
@@ -70,11 +72,17 @@ EXTRA_DIST = BigCosmoVolume/makeIC.py \
 	     IsothermalPotential/README IsothermalPotential/run.sh IsothermalPotential/energy_plot.py IsothermalPotential/isothermal.yml IsothermalPotential/makeIC.py \
 	     KelvinHelmholtz_2D/kelvinHelmholtz.yml KelvinHelmholtz_2D/makeIC.py KelvinHelmholtz_2D/plotSolution.py KelvinHelmholtz_2D/run.sh \
 	     MultiTypes/makeIC.py  MultiTypes/multiTypes.yml MultiTypes/run.sh \
+             Noh_1D/makeIC.py Noh_1D/noh.yml Noh_1D/plotSolution.py Noh_1D/run.sh \
+             Noh_2D/makeIC.py Noh_2D/noh.yml Noh_2D/plotSolution.py Noh_2D/run.sh Noh_2D/getGlass.sh \
+             Noh_3D/makeIC.py Noh_3D/noh.yml Noh_3D/plotSolution.py Noh_3D/run.sh Noh_3D/getGlass.sh \
 	     PerturbedBox_2D/makeIC.py PerturbedBox_2D/perturbedPlane.yml \
 	     PerturbedBox_3D/makeIC.py PerturbedBox_3D/perturbedBox.yml PerturbedBox_3D/run.sh \
 	     SedovBlast_1D/makeIC.py SedovBlast_1D/plotSolution.py SedovBlast_1D/run.sh SedovBlast_1D/sedov.yml \
 	     SedovBlast_2D/getGlass.sh SedovBlast_2D/makeIC.py SedovBlast_2D/plotSolution.py SedovBlast_2D/run.sh SedovBlast_2D/sedov.yml \
 	     SedovBlast_3D/getGlass.sh SedovBlast_3D/makeIC.py SedovBlast_3D/plotSolution.py SedovBlast_3D/run.sh SedovBlast_3D/sedov.yml \
+             SineWavePotential_1D/makeIC.py SineWavePotential_1D/plotSolution.py SineWavePotential_1D/run.sh SineWavePotential_1D/sineWavePotential.yml \
+             SineWavePotential_2D/makeIC.py SineWavePotential_2D/plotSolution.py SineWavePotential_2D/run.sh SineWavePotential_2D/sineWavePotential.yml \
+             SineWavePotential_3D/makeIC.py SineWavePotential_3D/plotSolution.py SineWavePotential_3D/run.sh SineWavePotential_3D/sineWavePotential.yml \
 	     SodShock_1D/makeIC.py SodShock_1D/plotSolution.py SodShock_1D/run.sh SodShock_1D/sodShock.yml \
 	     SodShock_2D/getGlass.sh SodShock_2D/makeIC.py SodShock_2D/plotSolution.py SodShock_2D/run.sh SodShock_2D/sodShock.yml \
 	     SodShock_3D/getGlass.sh SodShock_3D/makeIC.py SodShock_3D/plotSolution.py SodShock_3D/run.sh SodShock_3D/sodShock.yml \
@@ -88,8 +96,15 @@ EXTRA_DIST += parameter_example.yml
 
 # Scripts to plot task graphs
 EXTRA_DIST += plot_tasks_MPI.py plot_tasks.py \
+              analyse_tasks_MPI.py analyse_tasks.py \
 	      process_plot_tasks_MPI process_plot_tasks
 
+# Scripts to plot threadpool 'task' graphs
+EXTRA_DIST += analyse_threadpool_tasks.py \
+              plot_threadpool.py \
+              process_plot_threadpool
+
 # Script for scaling plot
-EXTRA_DIST += plot_scaling_results.py
+EXTRA_DIST += plot_scaling_results.py \
+              plot_scaling_results_breakdown.py
 
diff --git a/examples/Noh_1D/plotSolution.py b/examples/Noh_1D/plotSolution.py
index f4916af6e6066d21f76c28b5acef41e1907a83fd..25b9b2f16b24cba5def592a5cf00dbae82195ef7 100644
--- a/examples/Noh_1D/plotSolution.py
+++ b/examples/Noh_1D/plotSolution.py
@@ -58,7 +58,7 @@ snap = int(sys.argv[1])
 
 
 # Read the simulation data
-sim = h5py.File("noh_%03d.hdf5"%snap, "r")
+sim = h5py.File("noh_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/Noh_2D/plotSolution.py b/examples/Noh_2D/plotSolution.py
index a01a712efd412488aea09c3f3c4e8d68323fc916..775ddf4e8a7954c14034ad51a6b66622c41a6996 100644
--- a/examples/Noh_2D/plotSolution.py
+++ b/examples/Noh_2D/plotSolution.py
@@ -58,7 +58,7 @@ rc('font',**{'family':'sans-serif','sans-serif':['Times']})
 snap = int(sys.argv[1])
 
 # Read the simulation data
-sim = h5py.File("noh_%03d.hdf5"%snap, "r")
+sim = h5py.File("noh_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/Noh_3D/plotSolution.py b/examples/Noh_3D/plotSolution.py
index 1742e13a5daeff392690a9804fb2831ef4304963..386b9f728b5e8d8e38fb7ec9aeaa336d194e35dd 100644
--- a/examples/Noh_3D/plotSolution.py
+++ b/examples/Noh_3D/plotSolution.py
@@ -59,7 +59,7 @@ snap = int(sys.argv[1])
 
 
 # Read the simulation data
-sim = h5py.File("noh_%03d.hdf5"%snap, "r")
+sim = h5py.File("noh_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/PerturbedBox_2D/perturbedPlane.yml b/examples/PerturbedBox_2D/perturbedPlane.yml
index b92e29f620edc6f72399111fbe73ba6bd1485e92..a0c6b6d9dbc7a677002dbce5abc6e5d268b56e97 100644
--- a/examples/PerturbedBox_2D/perturbedPlane.yml
+++ b/examples/PerturbedBox_2D/perturbedPlane.yml
@@ -9,7 +9,7 @@ InternalUnitSystem:
 # Parameters governing the time integration
 TimeIntegration:
   time_begin: 0.    # The starting time of the simulation (in internal units).
-  time_end:   10.   # The end time of the simulation (in internal units).
+  time_end:   1000. # The end time of the simulation (in internal units).
   dt_min:     1e-6  # The minimal time-step size of the simulation (in internal units).
   dt_max:     1e-2  # The maximal time-step size of the simulation (in internal units).
 
@@ -21,12 +21,11 @@ Snapshots:
 
 # Parameters governing the conserved quantities statistics
 Statistics:
-  delta_time:          1e-3 # Time between statistics output
+  delta_time:          1.   # Time between statistics output
 
 # Parameters for the hydrodynamics scheme
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
-  delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
   
 # Parameters related to the initial conditions
diff --git a/examples/PerturbedBox_3D/perturbedBox.yml b/examples/PerturbedBox_3D/perturbedBox.yml
index 71c8dece4df5505eb44511ee92291feedd7ffab1..3148510979d0e349c0d6242bf11e1a0db94f9e1f 100644
--- a/examples/PerturbedBox_3D/perturbedBox.yml
+++ b/examples/PerturbedBox_3D/perturbedBox.yml
@@ -9,9 +9,9 @@ InternalUnitSystem:
 # Parameters governing the time integration
 TimeIntegration:
   time_begin: 0.    # The starting time of the simulation (in internal units).
-  time_end:   1.    # The end time of the simulation (in internal units).
+  time_end:   1000  # The end time of the simulation (in internal units).
   dt_min:     1e-6  # The minimal time-step size of the simulation (in internal units).
-  dt_max:     1e-3  # The maximal time-step size of the simulation (in internal units).
+  dt_max:     1e-2  # The maximal time-step size of the simulation (in internal units).
 
 # Parameters governing the snapshots
 Snapshots:
@@ -21,12 +21,11 @@ Snapshots:
 
 # Parameters governing the conserved quantities statistics
 Statistics:
-  delta_time:          1e-3 # Time between statistics output
+  delta_time:          1. # Time between statistics output
 
 # Parameters for the hydrodynamics scheme
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
-  delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
   
 # Parameters related to the initial conditions
diff --git a/examples/SedovBlast_1D/plotSolution.py b/examples/SedovBlast_1D/plotSolution.py
index a62775b012edda3217558031c266ed6e9b48f423..2738b7c8f301a7351d962ac0f29faccd0a770fc9 100644
--- a/examples/SedovBlast_1D/plotSolution.py
+++ b/examples/SedovBlast_1D/plotSolution.py
@@ -64,7 +64,7 @@ snap = int(sys.argv[1])
 
 
 # Read the simulation data
-sim = h5py.File("sedov_%03d.hdf5"%snap, "r")
+sim = h5py.File("sedov_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/SedovBlast_2D/plotSolution.py b/examples/SedovBlast_2D/plotSolution.py
index d8c0c9791d1834cc2a5cf0103b46a49e20d2e8a3..2b5de6f32b8673bbc825fbb5236f4e2ab3b4f408 100644
--- a/examples/SedovBlast_2D/plotSolution.py
+++ b/examples/SedovBlast_2D/plotSolution.py
@@ -65,7 +65,7 @@ snap = int(sys.argv[1])
 
 
 # Read the simulation data
-sim = h5py.File("sedov_%03d.hdf5"%snap, "r")
+sim = h5py.File("sedov_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/SedovBlast_3D/plotSolution.py b/examples/SedovBlast_3D/plotSolution.py
index 6e90a9a43524b3cdb279054764b71fd1b546b366..ad34695d36f1bf8e8985b883200f17d6e38a70c9 100644
--- a/examples/SedovBlast_3D/plotSolution.py
+++ b/examples/SedovBlast_3D/plotSolution.py
@@ -65,7 +65,7 @@ snap = int(sys.argv[1])
 
 
 # Read the simulation data
-sim = h5py.File("sedov_%03d.hdf5"%snap, "r")
+sim = h5py.File("sedov_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/SodShock_1D/plotSolution.py b/examples/SodShock_1D/plotSolution.py
index 0a7720f4a6cf26e5a8acda1101bd438850d8d553..e001a8d87a03cb246be63ab10d245f95eb1a7ce7 100644
--- a/examples/SodShock_1D/plotSolution.py
+++ b/examples/SodShock_1D/plotSolution.py
@@ -67,7 +67,7 @@ snap = int(sys.argv[1])
 
 
 # Read the simulation data
-sim = h5py.File("sodShock_%03d.hdf5"%snap, "r")
+sim = h5py.File("sodShock_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/SodShock_2D/plotSolution.py b/examples/SodShock_2D/plotSolution.py
index b4a203d93518d98ee87282f4ea46d045c4c3b38a..19cbe0ffb766845c051ffb6cea81bd918d890e36 100644
--- a/examples/SodShock_2D/plotSolution.py
+++ b/examples/SodShock_2D/plotSolution.py
@@ -68,7 +68,7 @@ snap = int(sys.argv[1])
 
 
 # Read the simulation data
-sim = h5py.File("sodShock_%03d.hdf5"%snap, "r")
+sim = h5py.File("sodShock_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/SodShock_3D/plotSolution.py b/examples/SodShock_3D/plotSolution.py
index 3d9616af55a204db4be9df2e42b355e266944153..6da7193bcd3cdfb7c22a3fc6a14f91aea5cff5f7 100644
--- a/examples/SodShock_3D/plotSolution.py
+++ b/examples/SodShock_3D/plotSolution.py
@@ -68,7 +68,7 @@ snap = int(sys.argv[1])
 
 
 # Read the simulation data
-sim = h5py.File("sodShock_%03d.hdf5"%snap, "r")
+sim = h5py.File("sodShock_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/SquareTest_2D/plotSolution.py b/examples/SquareTest_2D/plotSolution.py
index b9efe76de1e6c5993fa5333be76a13ba95bdab0f..f182b4d7437348d29065b51df79e5334aa26f9a4 100644
--- a/examples/SquareTest_2D/plotSolution.py
+++ b/examples/SquareTest_2D/plotSolution.py
@@ -63,7 +63,7 @@ rc('font',**{'family':'sans-serif','sans-serif':['Times']})
 snap = int(sys.argv[1])
 
 # Read the simulation data
-sim = h5py.File("square_%03d.hdf5"%snap, "r")
+sim = h5py.File("square_%04d.hdf5"%snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
diff --git a/examples/UniformDMBox/uniformBox.yml b/examples/UniformDMBox/uniformBox.yml
index cffd442a9a5b16d8e042e41caf9991fcf0e1202e..e59d677b308ca70f212f74c7e4d8b79f015c77a9 100644
--- a/examples/UniformDMBox/uniformBox.yml
+++ b/examples/UniformDMBox/uniformBox.yml
@@ -28,7 +28,7 @@ Gravity:
   eta:                   0.025    # Constant dimensionless multiplier for time integration. 
   theta:                 0.7      # Opening angle (Multipole acceptance criterion)
   epsilon:               0.00001  # Softening length (in internal units).
-  
+ 
 # Parameters governing the conserved quantities statistics
 Statistics:
   delta_time:          1e-2 # Time between statistics output
diff --git a/examples/analyse_tasks.py b/examples/analyse_tasks.py
index 04cd59feedba7ee41621ac0891d544c4aa294543..970c4a91042b8c61185727f27ef898f93af81fdc 100755
--- a/examples/analyse_tasks.py
+++ b/examples/analyse_tasks.py
@@ -50,12 +50,17 @@ infile = args.input
 TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair",
              "init_grav", "ghost", "extra_ghost", "drift_part",
              "drift_gpart", "kick1", "kick2", "timestep", "send", "recv",
-             "grav_top_level", "grav_long_range", "grav_mm", "grav_down",
-             "cooling", "sourceterms", "count"]
+             "grav_top_level", "grav_long_range", "grav_ghost", "grav_mm",
+             "grav_down", "cooling", "sourceterms", "count"]
 
 SUBTYPES = ["none", "density", "gradient", "force", "grav", "external_grav",
             "tend", "xv", "rho", "gpart", "multipole", "spart", "count"]
 
+SIDS = ["(-1,-1,-1)", "(-1,-1, 0)", "(-1,-1, 1)", "(-1, 0,-1)",
+        "(-1, 0, 0)", "(-1, 0, 1)", "(-1, 1,-1)", "(-1, 1, 0)",
+        "(-1, 1, 1)", "( 0,-1,-1)", "( 0,-1, 0)", "( 0,-1, 1)",
+        "( 0, 0,-1)"]
+
 #  Read input.
 data = pl.loadtxt( infile )
 
@@ -66,11 +71,17 @@ print "# Maximum thread id:", maxthread
 full_step = data[0,:]
 tic_step = int(full_step[4])
 toc_step = int(full_step[5])
+updates = int(full_step[6])
+g_updates = int(full_step[7])
+s_updates = int(full_step[8])
 CPU_CLOCK = float(full_step[-1]) / 1000.0
 data = data[1:,:]
 if args.verbose:
-    print "CPU frequency:", CPU_CLOCK * 1000.0
-
+    print "# CPU frequency:", CPU_CLOCK * 1000.0
+print "#   updates:", updates
+print "# g_updates:", g_updates
+print "# s_updates:", s_updates
+    
 #  Avoid start and end times of zero.
 data = data[data[:,4] != 0]
 data = data[data[:,5] != 0]
@@ -78,6 +89,7 @@ data = data[data[:,5] != 0]
 #  Calculate the time range.
 total_t = (toc_step - tic_step)/ CPU_CLOCK
 print "# Data range: ", total_t, "ms"
+print
 
 #  Correct times to relative values.
 start_t = float(tic_step)
@@ -90,15 +102,16 @@ for i in range(maxthread):
     tasks[i] = []
 
 #  Gather into by thread data.
-num_lines = pl.size(data) / 10
+num_lines = pl.size(data) / pl.size(full_step)
 for line in range(num_lines):
     thread = int(data[line,0])
     tic = int(data[line,4]) / CPU_CLOCK
     toc = int(data[line,5]) / CPU_CLOCK
     tasktype = int(data[line,1])
     subtype = int(data[line,2])
+    sid = int(data[line, -1])
 
-    tasks[thread].append([tic,toc,tasktype,subtype])
+    tasks[thread].append([tic,toc,tasktype,subtype, sid])
 
 #  Sort by tic and gather used thread ids.
 threadids = []
@@ -109,10 +122,12 @@ for i in range(maxthread):
 
 #  Times per task.
 print "# Task times:"
-print "# {0:<16s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\
+print "# -----------"
+print "# {0:<17s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\
       .format("type/subtype", "count","minimum", "maximum",
               "sum", "mean", "percent")
 alltasktimes = {}
+sidtimes = {}
 for i in threadids:
     tasktimes = {}
     for task in tasks[i]:
@@ -126,12 +141,19 @@ for i in threadids:
             alltasktimes[key] = []
         alltasktimes[key].append(dt)
 
+        my_sid = task[4]
+        if my_sid > -1:
+            if not my_sid in sidtimes:
+                sidtimes[my_sid] = []
+            sidtimes[my_sid].append(dt)
+                
+        
     print "# Thread : ", i
     for key in sorted(tasktimes.keys()):
         taskmin = min(tasktimes[key])
         taskmax = max(tasktimes[key])
         tasksum = sum(tasktimes[key])
-        print "{0:18s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
+        print "{0:19s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
               .format(key, len(tasktimes[key]), taskmin, taskmax, tasksum,
                       tasksum / len(tasktimes[key]), tasksum / total_t * 100.0)
     print
@@ -141,14 +163,118 @@ for key in sorted(alltasktimes.keys()):
     taskmin = min(alltasktimes[key])
     taskmax = max(alltasktimes[key])
     tasksum = sum(alltasktimes[key])
-    print "{0:18s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
+    print "{0:19s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
           .format(key, len(alltasktimes[key]), taskmin, taskmax, tasksum,
                   tasksum / len(alltasktimes[key]),
                   tasksum / (len(threadids) * total_t) * 100.0)
 print
 
+# For pairs, show stuff sorted by SID
+print "# By SID (all threads): "
+print "# {0:<17s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\
+    .format("Pair/Sub-pair SID", "count","minimum", "maximum",
+            "sum", "mean", "percent")
+
+for sid in range(0,13):
+    if sid in sidtimes:
+        sidmin = min(sidtimes[sid])
+        sidmax = max(sidtimes[sid])
+        sidsum = sum(sidtimes[sid])
+        sidcount = len(sidtimes[sid])
+        sidmean = sidsum / sidcount
+    else:
+        sidmin = 0.
+        sidmax = 0.
+        sidsum = 0.
+        sidcount = 0
+        sidmean = 0.
+    print "{0:3d} {1:15s}: {2:7d} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.4f} {7:9.2f}"\
+        .format(sid, SIDS[sid], sidcount, sidmin, sidmax, sidsum,
+                sidmean, sidsum / (len(threadids) * total_t) * 100.0)   
+print
+
 #  Dead times.
-print "# Deadtimes:"
+print "# Times not in tasks (deadtimes)"
+print "# ------------------------------"
+print "# Time before first task:"
+print "# no.    : {0:>9s} {1:>9s}".format("value", "percent")
+predeadtimes = []
+for i in threadids:
+    predeadtime = tasks[i][0][0]
+    print "thread {0:2d}: {1:9.4f} {2:9.4f}"\
+          .format(i, predeadtime, predeadtime / total_t * 100.0)
+    predeadtimes.append(predeadtime)
+
+predeadmin = min(predeadtimes)
+predeadmax = max(predeadtimes)
+predeadsum = sum(predeadtimes)
+print "#        : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
+      .format("count", "minimum", "maximum", "sum", "mean", "percent")
+print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
+      .format(len(predeadtimes), predeadmin, predeadmax, predeadsum,
+              predeadsum / len(predeadtimes),
+              predeadsum / (len(threadids) * total_t ) * 100.0)
+print
+
+print "# Time after last task:"
+print "# no.    : {0:>9s} {1:>9s}".format("value", "percent")
+postdeadtimes = []
+for i in threadids:
+    postdeadtime = total_t - tasks[i][-1][1]
+    print "thread {0:2d}: {1:9.4f} {2:9.4f}"\
+          .format(i, postdeadtime, postdeadtime / total_t * 100.0)
+    postdeadtimes.append(postdeadtime)
+
+postdeadmin = min(postdeadtimes)
+postdeadmax = max(postdeadtimes)
+postdeadsum = sum(postdeadtimes)
+print "#        : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
+      .format("count", "minimum", "maximum", "sum", "mean", "percent")
+print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
+      .format(len(postdeadtimes), postdeadmin, postdeadmax, postdeadsum,
+              postdeadsum / len(postdeadtimes),
+              postdeadsum / (len(threadids) * total_t ) * 100.0)
+print
+
+#  Time in engine, i.e. from first to last tasks.
+print "# Time between tasks (engine deadtime):"
+print "# no.    : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
+      .format("count", "minimum", "maximum", "sum", "mean", "percent")
+enginedeadtimes = []
+for i in threadids:
+    deadtimes = []
+    last = tasks[i][0][0]
+    for task in tasks[i]:
+        dt = task[0] - last
+        deadtimes.append(dt)
+        last = task[1]
+
+    #  Drop first value, last value already gone.
+    if len(deadtimes) > 1:
+        deadtimes = deadtimes[1:]
+    else:
+        #  Only one task, so no deadtime by definition.
+        deadtimes = [0.0]
+
+    deadmin = min(deadtimes)
+    deadmax = max(deadtimes)
+    deadsum = sum(deadtimes)
+    print "thread {0:2d}: {1:9d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
+          .format(i, len(deadtimes), deadmin, deadmax, deadsum,
+                  deadsum / len(deadtimes), deadsum / total_t * 100.0)
+    enginedeadtimes.extend(deadtimes)
+
+deadmin = min(enginedeadtimes)
+deadmax = max(enginedeadtimes)
+deadsum = sum(enginedeadtimes)
+print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
+      .format(len(enginedeadtimes), deadmin, deadmax, deadsum,
+              deadsum / len(enginedeadtimes),
+              deadsum / (len(threadids) * total_t ) * 100.0)
+print
+
+#  All times in step.
+print "# All deadtimes:"
 print "# no.    : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
       .format("count", "minimum", "maximum", "sum", "mean", "percent")
 alldeadtimes = []
@@ -179,5 +305,4 @@ print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
               deadsum / (len(threadids) * total_t ) * 100.0)
 print
 
-
 sys.exit(0)
diff --git a/examples/analyse_tasks_MPI.py b/examples/analyse_tasks_MPI.py
index 9feffaf67ec393257d75428e310a2e8b807df39a..b78d73e879046b05b8a089f97c4c9c00a5f7bb79 100755
--- a/examples/analyse_tasks_MPI.py
+++ b/examples/analyse_tasks_MPI.py
@@ -42,6 +42,9 @@ parser.add_argument("input", help="Thread data file (-y output)")
 parser.add_argument("-v", "--verbose", dest="verbose",
                     help="Verbose output (default: False)",
                     default=False, action="store_true")
+parser.add_argument("-r", "--rank", dest="rank",
+                    help="Rank to process (default: all)",
+                    default="all", action="store")
 
 args = parser.parse_args()
 infile = args.input
@@ -56,17 +59,36 @@ TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair",
 SUBTYPES = ["none", "density", "gradient", "force", "grav", "external_grav",
             "tend", "xv", "rho", "gpart", "multipole", "spart", "count"]
 
+SIDS = ["(-1,-1,-1)", "(-1,-1, 0)", "(-1,-1, 1)", "(-1, 0,-1)",
+        "(-1, 0, 0)", "(-1, 0, 1)", "(-1, 1,-1)", "(-1, 1, 0)",
+        "(-1, 1, 1)", "( 0,-1,-1)", "( 0,-1, 0)", "( 0,-1, 1)",
+        "( 0, 0,-1)"]
+
 #  Read input.
 data = pl.loadtxt( infile )
 
 #  Get the CPU clock to convert ticks into milliseconds.
 full_step = data[0,:]
+updates = int(full_step[7])
+g_updates = int(full_step[8])
+s_updates = int(full_step[9])
 CPU_CLOCK = float(full_step[-1]) / 1000.0
 if args.verbose:
     print "# CPU frequency:", CPU_CLOCK * 1000.0
+print "#   updates:", updates
+print "# g_updates:", g_updates
+print "# s_updates:", s_updates
 
 nranks = int(max(data[:,0])) + 1
 print "# Number of ranks:", nranks
+if args.rank == "all":
+    ranks = range(nranks)
+else:
+    ranks = [int(args.rank)]
+    if ranks[0] >= nranks:
+        print "Error: maximum rank is " + str(nranks - 1)
+        sys.exit(1)
+
 maxthread = int(max(data[:,1])) + 1
 print "# Maximum thread id:", maxthread
 
@@ -74,8 +96,8 @@ print "# Maximum thread id:", maxthread
 sdata = data[data[:,5] != 0]
 sdata = data[data[:,6] != 0]
 
-#  Now we process all the ranks.
-for rank in range(nranks):
+#  Now we process the required ranks.
+for rank in ranks:
     print "# Rank", rank
     data = sdata[sdata[:,0] == rank]
 
@@ -92,6 +114,7 @@ for rank in range(nranks):
     #  Calculate the time range.
     total_t = (toc_step - tic_step)/ CPU_CLOCK
     print "# Data range: ", total_t, "ms"
+    print
 
     #  Correct times to relative values.
     start_t = float(tic_step)
@@ -105,15 +128,16 @@ for rank in range(nranks):
         tasks[i] = []
 
     #  Gather into by thread data.
-    num_lines = pl.size(data) / 12
+    num_lines = pl.shape(data)[0]
     for line in range(num_lines):
         thread = int(data[line,1])
         tic = int(data[line,5]) / CPU_CLOCK
         toc = int(data[line,6]) / CPU_CLOCK
         tasktype = int(data[line,2])
         subtype = int(data[line,3])
+        sid = int(data[line, -1])
 
-        tasks[thread].append([tic,toc,tasktype,subtype])
+        tasks[thread].append([tic,toc,tasktype,subtype, sid])
 
     #  Sort by tic and gather used threads.
     threadids = []
@@ -123,10 +147,13 @@ for rank in range(nranks):
 
     #  Times per task.
     print "# Task times:"
-    print "# {0:<16s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\
+    print "# -----------"
+    print "# {0:<17s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\
           .format("type/subtype", "count","minimum", "maximum",
                   "sum", "mean", "percent")
+
     alltasktimes = {}
+    sidtimes = {}
     for i in threadids:
         tasktimes = {}
         for task in tasks[i]:
@@ -139,13 +166,19 @@ for rank in range(nranks):
             if not key in alltasktimes:
                 alltasktimes[key] = []
             alltasktimes[key].append(dt)
+            
+            my_sid = task[4]
+            if my_sid > -1:
+                if not my_sid in sidtimes:
+                    sidtimes[my_sid] = []
+                sidtimes[my_sid].append(dt)
 
         print "# Thread : ", i
         for key in sorted(tasktimes.keys()):
             taskmin = min(tasktimes[key])
             taskmax = max(tasktimes[key])
             tasksum = sum(tasktimes[key])
-            print "{0:18s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
+            print "{0:19s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
                   .format(key, len(tasktimes[key]), taskmin, taskmax, tasksum,
                           tasksum / len(tasktimes[key]), tasksum / total_t * 100.0)
         print
@@ -161,8 +194,121 @@ for rank in range(nranks):
                       tasksum / (len(threadids) * total_t) * 100.0)
     print
 
+    # For pairs, show stuff sorted by SID
+    print "# By SID (all threads): "
+    print "# {0:<17s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\
+        .format("Pair/Sub-pair SID", "count","minimum", "maximum",
+                "sum", "mean", "percent")
+
+    for sid in range(0,13):
+        if sid in sidtimes:
+            sidmin = min(sidtimes[sid])
+            sidmax = max(sidtimes[sid])
+            sidsum = sum(sidtimes[sid])
+            sidcount = len(sidtimes[sid])
+            sidmean = sidsum / sidcount
+        else:
+            sidmin = 0.
+            sidmax = 0.
+            sidsum = 0.
+            sidcount = 0
+            sidmean = 0.
+        print "{0:3d} {1:15s}: {2:7d} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.4f} {7:9.2f}"\
+            .format(sid, SIDS[sid], sidcount, sidmin, sidmax, sidsum,
+                    sidmean, sidsum / (len(threadids) * total_t) * 100.0)   
+    print
+
     #  Dead times.
-    print "# Deadtimes:"
+    print "# Times not in tasks (deadtimes)"
+    print "# ------------------------------"
+    print "# Time before first task:"
+    print "# no.    : {0:>9s} {1:>9s}".format("value", "percent")
+    predeadtimes = []
+    for i in threadids:
+        if len(tasks[i]) > 0:
+            predeadtime = tasks[i][0][0]
+            print "thread {0:2d}: {1:9.4f} {2:9.4f}"\
+                  .format(i, predeadtime, predeadtime / total_t * 100.0)
+            predeadtimes.append(predeadtime)
+        else:
+            predeadtimes.append(0.0)
+
+    predeadmin = min(predeadtimes)
+    predeadmax = max(predeadtimes)
+    predeadsum = sum(predeadtimes)
+    print "#        : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
+          .format("count", "minimum", "maximum", "sum", "mean", "percent")
+    print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
+          .format(len(predeadtimes), predeadmin, predeadmax, predeadsum,
+                  predeadsum / len(predeadtimes),
+                  predeadsum / (len(threadids) * total_t ) * 100.0)
+    print
+
+    print "# Time after last task:"
+    print "# no.    : {0:>9s} {1:>9s}".format("value", "percent")
+    postdeadtimes = []
+    for i in threadids:
+        if len(tasks[i]) > 0:
+            postdeadtime = total_t - tasks[i][-1][1]
+            print "thread {0:2d}: {1:9.4f} {2:9.4f}"\
+                  .format(i, postdeadtime, postdeadtime / total_t * 100.0)
+            postdeadtimes.append(postdeadtime)
+        else:
+            postdeadtimes.append(0.0)
+
+    postdeadmin = min(postdeadtimes)
+    postdeadmax = max(postdeadtimes)
+    postdeadsum = sum(postdeadtimes)
+    print "#        : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
+          .format("count", "minimum", "maximum", "sum", "mean", "percent")
+    print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
+          .format(len(postdeadtimes), postdeadmin, postdeadmax, postdeadsum,
+                  postdeadsum / len(postdeadtimes),
+                  postdeadsum / (len(threadids) * total_t ) * 100.0)
+    print
+
+    #  Time in engine, i.e. from first to last tasks.
+    print "# Time between tasks (engine deadtime):"
+    print "# no.    : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
+          .format("count", "minimum", "maximum", "sum", "mean", "percent")
+    enginedeadtimes = []
+    for i in threadids:
+        deadtimes = []
+        if len(tasks[i]) > 0:
+            last = tasks[i][0][0]
+        else:
+            last = 0.0
+        for task in tasks[i]:
+            dt = task[0] - last
+            deadtimes.append(dt)
+            last = task[1]
+
+        #  Drop first value, last value already gone.
+        if len(deadtimes) > 1:
+            deadtimes = deadtimes[1:]
+        else:
+            #  Only one or fewer tasks, so no deadtime by definition.
+            deadtimes = [0.0]
+
+        deadmin = min(deadtimes)
+        deadmax = max(deadtimes)
+        deadsum = sum(deadtimes)
+        print "thread {0:2d}: {1:9d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
+              .format(i, len(deadtimes), deadmin, deadmax, deadsum,
+                      deadsum / len(deadtimes), deadsum / total_t * 100.0)
+        enginedeadtimes.extend(deadtimes)
+
+    deadmin = min(enginedeadtimes)
+    deadmax = max(enginedeadtimes)
+    deadsum = sum(enginedeadtimes)
+    print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
+          .format(len(enginedeadtimes), deadmin, deadmax, deadsum,
+                  deadsum / len(enginedeadtimes),
+                  deadsum / (len(threadids) * total_t ) * 100.0)
+    print
+
+    #  All times in step.
+    print "# All deadtimes:"
     print "# no.    : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
           .format("count", "minimum", "maximum", "sum", "mean", "percent")
     alldeadtimes = []
@@ -181,7 +327,7 @@ for rank in range(nranks):
         deadsum = sum(deadtimes)
         print "thread {0:2d}: {1:9d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
               .format(i, len(deadtimes), deadmin, deadmax, deadsum,
-                      deadsum / len(deadtimes), deadsum / total_t * 100.0)
+                  deadsum / len(deadtimes), deadsum / total_t * 100.0)
         alldeadtimes.extend(deadtimes)
 
     deadmin = min(alldeadtimes)
@@ -190,8 +336,7 @@ for rank in range(nranks):
     print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
           .format(len(alldeadtimes), deadmin, deadmax, deadsum,
                   deadsum / len(alldeadtimes),
-              deadsum / (len(threadids) * total_t ) * 100.0)
+                  deadsum / (len(threadids) * total_t ) * 100.0)
     print
 
-
 sys.exit(0)
diff --git a/examples/analyse_threadpool_tasks.py b/examples/analyse_threadpool_tasks.py
new file mode 100755
index 0000000000000000000000000000000000000000..609af363b4110e010d6714bef6862d40e5acb278
--- /dev/null
+++ b/examples/analyse_threadpool_tasks.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python
+"""
+Usage:
+    analyse_threadpool_tasks.py [options] input.dat
+
+where input.dat is a threadpool dump for a step.  Use the '-Y interval' flag
+of the swift command to create these.
+
+The output is an analysis of the threadpool task timings, including deadtime
+per thread and step, total amount of time spent for each task type, for the
+whole step and per thread and the minimum and maximum times spent per task
+type.
+
+This file is part of SWIFT.
+Copyright (c) 2017 Peter W. Draper (p.w.draper@durham.ac.uk)
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published
+by the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.collections as collections
+import matplotlib.ticker as plticker
+import pylab as pl
+import sys
+import argparse
+
+#  Handle the command line.
+parser = argparse.ArgumentParser(description="Analyse task dumps")
+
+parser.add_argument("input", help="Threadpool data file (-Y output)")
+parser.add_argument("-v", "--verbose", dest="verbose",
+                    help="Verbose output (default: False)",
+                    default=False, action="store_true")
+
+args = parser.parse_args()
+infile = args.input
+
+#  Read header. First two lines.
+with open(infile) as infid:
+    head = [next(infid) for x in xrange(2)]
+header = head[1][2:].strip()
+header = eval(header)
+nthread = int(header['num_threads']) + 1
+CPU_CLOCK = float(header['cpufreq']) / 1000.0
+print "Number of threads: ", nthread - 1
+if args.verbose:
+    print "CPU frequency:", CPU_CLOCK * 1000.0
+
+#  Read input.
+data = pl.genfromtxt(infile, dtype=None, delimiter=" ")
+
+#  Mixed types, so need to separate.
+tics = []
+tocs = []
+funcs = []
+threads = []
+chunks = []
+for i in data:
+    if i[0] != "#":
+        funcs.append(i[0].replace("_mapper", ""))
+        if i[1] < 0:
+            threads.append(nthread-1)
+        else:
+            threads.append(i[1])
+        chunks.append(i[2])
+        tics.append(i[3])
+        tocs.append(i[4])
+tics = pl.array(tics)
+tocs = pl.array(tocs)
+funcs = pl.array(funcs)
+threads = pl.array(threads)
+chunks = pl.array(chunks)
+
+#  Recover the start and end time
+tic_step = min(tics)
+toc_step = max(tocs)
+
+#  Calculate the time range.
+total_t = (toc_step - tic_step)/ CPU_CLOCK
+print "# Data range: ", total_t, "ms"
+print
+
+#  Correct times to relative millisecs.
+start_t = float(tic_step)
+tics = (tics - start_t) / CPU_CLOCK
+tocs = (tocs - start_t) / CPU_CLOCK
+
+tasks = {}
+tasks[-1] = []
+for i in range(nthread):
+    tasks[i] = []
+
+#  Gather into by thread data.
+for i in range(len(tics)):
+    tasks[threads[i]].append([tics[i],tocs[i],funcs[i]])
+
+#  Don't actually process the fake thread.
+nthread = nthread - 1
+
+#  Sort by tic and gather used thread ids.
+threadids = []
+for i in range(nthread):
+    if len(tasks[i]) > 0:
+        tasks[i] = sorted(tasks[i], key=lambda task: task[0])
+        threadids.append(i)
+
+#  Times per task.
+print "# Task times:"
+print "# -----------"
+print "# {0:<31s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\
+      .format("type/subtype", "count","minimum", "maximum",
+              "sum", "mean", "percent")
+alltasktimes = {}
+sidtimes = {}
+for i in threadids:
+    tasktimes = {}
+    for task in tasks[i]:
+        key = task[2]
+        dt = task[1] - task[0]
+        if not key in tasktimes:
+            tasktimes[key] = []
+        tasktimes[key].append(dt)
+
+        if not key in alltasktimes:
+            alltasktimes[key] = []
+        alltasktimes[key].append(dt)
+
+    print "# Thread : ", i
+    for key in sorted(tasktimes.keys()):
+        taskmin = min(tasktimes[key])
+        taskmax = max(tasktimes[key])
+        tasksum = sum(tasktimes[key])
+        print "{0:33s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
+              .format(key, len(tasktimes[key]), taskmin, taskmax, tasksum,
+                      tasksum / len(tasktimes[key]), tasksum / total_t * 100.0)
+    print
+
+print "# All threads : "
+for key in sorted(alltasktimes.keys()):
+    taskmin = min(alltasktimes[key])
+    taskmax = max(alltasktimes[key])
+    tasksum = sum(alltasktimes[key])
+    print "{0:33s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
+          .format(key, len(alltasktimes[key]), taskmin, taskmax, tasksum,
+                  tasksum / len(alltasktimes[key]),
+                  tasksum / (len(threadids) * total_t) * 100.0)
+print
+
+#  Dead times.
+print "# Times not in tasks (deadtimes)"
+print "# ------------------------------"
+print "# Time before first task:"
+print "# no.    : {0:>9s} {1:>9s}".format("value", "percent")
+predeadtimes = []
+for i in threadids:
+    predeadtime = tasks[i][0][0]
+    print "thread {0:2d}: {1:9.4f} {2:9.4f}"\
+          .format(i, predeadtime, predeadtime / total_t * 100.0)
+    predeadtimes.append(predeadtime)
+
+predeadmin = min(predeadtimes)
+predeadmax = max(predeadtimes)
+predeadsum = sum(predeadtimes)
+print "#        : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
+      .format("count", "minimum", "maximum", "sum", "mean", "percent")
+print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
+      .format(len(predeadtimes), predeadmin, predeadmax, predeadsum,
+              predeadsum / len(predeadtimes),
+              predeadsum / (len(threadids) * total_t ) * 100.0)
+print
+
+print "# Time after last task:"
+print "# no.    : {0:>9s} {1:>9s}".format("value", "percent")
+postdeadtimes = []
+for i in threadids:
+    postdeadtime = total_t - tasks[i][-1][1]
+    print "thread {0:2d}: {1:9.4f} {2:9.4f}"\
+          .format(i, postdeadtime, postdeadtime / total_t * 100.0)
+    postdeadtimes.append(postdeadtime)
+
+postdeadmin = min(postdeadtimes)
+postdeadmax = max(postdeadtimes)
+postdeadsum = sum(postdeadtimes)
+print "#        : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
+      .format("count", "minimum", "maximum", "sum", "mean", "percent")
+print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
+      .format(len(postdeadtimes), postdeadmin, postdeadmax, postdeadsum,
+              postdeadsum / len(postdeadtimes),
+              postdeadsum / (len(threadids) * total_t ) * 100.0)
+print
+
+#  Time in threadpool, i.e. from first to last tasks.
+print "# Time between tasks (threadpool deadtime):"
+print "# no.    : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
+      .format("count", "minimum", "maximum", "sum", "mean", "percent")
+threadpooldeadtimes = []
+for i in threadids:
+    deadtimes = []
+    last = tasks[i][0][0]
+    for task in tasks[i]:
+        dt = task[0] - last
+        deadtimes.append(dt)
+        last = task[1]
+
+    #  Drop first value, last value already gone.
+    if len(deadtimes) > 1:
+        deadtimes = deadtimes[1:]
+    else:
+        #  Only one task, so no deadtime by definition.
+        deadtimes = [0.0]
+
+    deadmin = min(deadtimes)
+    deadmax = max(deadtimes)
+    deadsum = sum(deadtimes)
+    print "thread {0:2d}: {1:9d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
+          .format(i, len(deadtimes), deadmin, deadmax, deadsum,
+                  deadsum / len(deadtimes), deadsum / total_t * 100.0)
+    threadpooldeadtimes.extend(deadtimes)
+
+deadmin = min(threadpooldeadtimes)
+deadmax = max(threadpooldeadtimes)
+deadsum = sum(threadpooldeadtimes)
+print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
+      .format(len(threadpooldeadtimes), deadmin, deadmax, deadsum,
+              deadsum / len(threadpooldeadtimes),
+              deadsum / (len(threadids) * total_t ) * 100.0)
+print
+
+#  All times in step.
+print "# All deadtimes:"
+print "# no.    : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\
+      .format("count", "minimum", "maximum", "sum", "mean", "percent")
+alldeadtimes = []
+for i in threadids:
+    deadtimes = []
+    last = 0
+    for task in tasks[i]:
+        dt = task[0] - last
+        deadtimes.append(dt)
+        last = task[1]
+    dt = total_t - last
+    deadtimes.append(dt)
+
+    deadmin = min(deadtimes)
+    deadmax = max(deadtimes)
+    deadsum = sum(deadtimes)
+    print "thread {0:2d}: {1:9d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\
+          .format(i, len(deadtimes), deadmin, deadmax, deadsum,
+                  deadsum / len(deadtimes), deadsum / total_t * 100.0)
+    alldeadtimes.extend(deadtimes)
+
+deadmin = min(alldeadtimes)
+deadmax = max(alldeadtimes)
+deadsum = sum(alldeadtimes)
+print "all      : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\
+      .format(len(alldeadtimes), deadmin, deadmax, deadsum,
+              deadsum / len(alldeadtimes),
+              deadsum / (len(threadids) * total_t ) * 100.0)
+print
+
+sys.exit(0)
diff --git a/examples/main.c b/examples/main.c
index 631117148addd3ab7ad49ed2760855b793757870..ee1253062409ec2e787e064a5fb50da2c830d35d 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -26,7 +26,9 @@
 #include "../config.h"
 
 /* Some standard headers. */
+#include <errno.h>
 #include <fenv.h>
+#include <libgen.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -57,48 +59,53 @@ void print_help_message() {
   printf("       swift_mpi [OPTION]... PARAMFILE\n\n");
 
   printf("Valid options are:\n");
-  printf("  %2s %8s %s\n", "-a", "", "Pin runners using processor affinity.");
-  printf("  %2s %8s %s\n", "-c", "", "Run with cosmological time integration.");
-  printf("  %2s %8s %s\n", "-C", "", "Run with cooling.");
+  printf("  %2s %14s %s\n", "-a", "", "Pin runners using processor affinity.");
+  printf("  %2s %14s %s\n", "-c", "",
+         "Run with cosmological time integration.");
+  printf("  %2s %14s %s\n", "-C", "", "Run with cooling.");
   printf(
-      "  %2s %8s %s\n", "-d", "",
+      "  %2s %14s %s\n", "-d", "",
       "Dry run. Read the parameter file, allocate memory but does not read ");
   printf(
-      "  %2s %8s %s\n", "", "",
+      "  %2s %14s %s\n", "", "",
       "the particles from ICs and exit before the start of time integration.");
-  printf("  %2s %8s %s\n", "", "",
+  printf("  %2s %14s %s\n", "", "",
          "Allows user to check validy of parameter and IC files as well as "
          "memory limits.");
-  printf("  %2s %8s %s\n", "-D", "",
+  printf("  %2s %14s %s\n", "-D", "",
          "Always drift all particles even the ones far from active particles. "
          "This emulates");
-  printf("  %2s %8s %s\n", "", "",
+  printf("  %2s %14s %s\n", "", "",
          "Gadget-[23] and GIZMO's default behaviours.");
-  printf("  %2s %8s %s\n", "-e", "",
+  printf("  %2s %14s %s\n", "-e", "",
          "Enable floating-point exceptions (debugging mode).");
-  printf("  %2s %8s %s\n", "-f", "{int}",
+  printf("  %2s %14s %s\n", "-f", "{int}",
          "Overwrite the CPU frequency (Hz) to be used for time measurements.");
-  printf("  %2s %8s %s\n", "-g", "",
+  printf("  %2s %14s %s\n", "-g", "",
          "Run with an external gravitational potential.");
-  printf("  %2s %8s %s\n", "-F", "", "Run with feedback.");
-  printf("  %2s %8s %s\n", "-G", "", "Run with self-gravity.");
-  printf("  %2s %8s %s\n", "-M", "",
+  printf("  %2s %14s %s\n", "-G", "", "Run with self-gravity.");
+  printf("  %2s %14s %s\n", "-M", "",
          "Reconstruct the multipoles every time-step.");
-  printf("  %2s %8s %s\n", "-n", "{int}",
+  printf("  %2s %14s %s\n", "-n", "{int}",
          "Execute a fixed number of time steps. When unset use the time_end "
          "parameter to stop.");
-  printf("  %2s %8s %s\n", "-s", "", "Run with hydrodynamics.");
-  printf("  %2s %8s %s\n", "-S", "", "Run with stars.");
-  printf("  %2s %8s %s\n", "-t", "{int}",
+  printf("  %2s %14s %s\n", "-P", "{sec:par:val}",
+         "Set parameter value and overwrites values read from the parameters "
+         "file. Can be used more than once.");
+  printf("  %2s %14s %s\n", "-s", "", "Run with hydrodynamics.");
+  printf("  %2s %14s %s\n", "-S", "", "Run with stars.");
+  printf("  %2s %14s %s\n", "-t", "{int}",
          "The number of threads to use on each MPI rank. Defaults to 1 if not "
          "specified.");
-  printf("  %2s %8s %s\n", "-T", "", "Print timers every time-step.");
-  printf("  %2s %8s %s\n", "-v", "[12]", "Increase the level of verbosity.");
-  printf("  %2s %8s %s\n", "", "", "1: MPI-rank 0 writes ");
-  printf("  %2s %8s %s\n", "", "", "2: All MPI-ranks write");
-  printf("  %2s %8s %s\n", "-y", "{int}",
+  printf("  %2s %14s %s\n", "-T", "", "Print timers every time-step.");
+  printf("  %2s %14s %s\n", "-v", "[12]", "Increase the level of verbosity:");
+  printf("  %2s %14s %s\n", "", "", "1: MPI-rank 0 writes,");
+  printf("  %2s %14s %s\n", "", "", "2: All MPI-ranks write.");
+  printf("  %2s %14s %s\n", "-y", "{int}",
          "Time-step frequency at which task graphs are dumped.");
-  printf("  %2s %8s %s\n", "-h", "", "Print this help message and exit.");
+  printf("  %2s %14s %s\n", "-Y", "{int}",
+         "Time-step frequency at which threadpool tasks are dumped.");
+  printf("  %2s %14s %s\n", "-h", "", "Print this help message and exit.");
   printf(
       "\nSee the file parameter_example.yml for an example of "
       "parameter file.\n");
@@ -135,7 +142,9 @@ int main(int argc, char *argv[]) {
   if ((res = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN)) !=
       MPI_SUCCESS)
     error("Call to MPI_Comm_set_errhandler failed with error %i.", res);
-  if (myrank == 0) message("MPI is up and running with %i node(s).", nr_nodes);
+  if (myrank == 0)
+    printf("[0000] [00000.0] main: MPI is up and running with %i node(s).\n\n",
+           nr_nodes);
   if (nr_nodes == 1) {
     message("WARNING: you are running with one MPI rank.");
     message("WARNING: you should use the non-MPI version of this program.");
@@ -156,6 +165,7 @@ int main(int argc, char *argv[]) {
   int with_aff = 0;
   int dry_run = 0;
   int dump_tasks = 0;
+  int dump_threadpool = 0;
   int nsteps = -2;
   int with_cosmology = 0;
   int with_external_gravity = 0;
@@ -170,15 +180,21 @@ int main(int argc, char *argv[]) {
   int verbose = 0;
   int nr_threads = 1;
   int with_verbose_timers = 0;
+  int nparams = 0;
+  char *cmdparams[PARSER_MAX_NO_OF_PARAMS];
   char paramFileName[200] = "";
   unsigned long long cpufreq = 0;
 
   /* Parse the parameters */
   int c;
-  while ((c = getopt(argc, argv, "acCdDef:FgGhMn:sSt:Tv:y:")) != -1)
+  while ((c = getopt(argc, argv, "acCdDef:FgGhMn:P:sSt:Tv:y:Y:")) != -1)
     switch (c) {
       case 'a':
+#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA)
         with_aff = 1;
+#else
+        error("Need NUMA support for thread affinity");
+#endif
         break;
       case 'c':
         with_cosmology = 1;
@@ -224,6 +240,10 @@ int main(int argc, char *argv[]) {
           return 1;
         }
         break;
+      case 'P':
+        cmdparams[nparams] = optarg;
+        nparams++;
+        break;
       case 's':
         with_hydro = 1;
         break;
@@ -260,6 +280,21 @@ int main(int argc, char *argv[]) {
               "Task dumping is only possible if SWIFT was configured with the "
               "--enable-task-debugging option.");
         }
+#endif
+        break;
+      case 'Y':
+        if (sscanf(optarg, "%d", &dump_threadpool) != 1) {
+          if (myrank == 0) printf("Error parsing dump_threadpool (-Y). \n");
+          if (myrank == 0) print_help_message();
+          return 1;
+        }
+#ifndef SWIFT_DEBUG_THREADPOOL
+        if (dump_threadpool) {
+          error(
+              "Threadpool dumping is only possible if SWIFT was configured "
+              "with the "
+              "--enable-threadpool-debugging option.");
+        }
 #endif
         break;
       case '?':
@@ -285,6 +320,14 @@ int main(int argc, char *argv[]) {
     if (myrank == 0) print_help_message();
     return 1;
   }
+  if (with_stars && !with_external_gravity && !with_self_gravity) {
+    if (myrank == 0)
+      printf(
+          "Error: Cannot process stars without gravity, -g or -G must be "
+          "chosen.\n");
+    if (myrank == 0) print_help_message();
+    return 1;
+  }
 
   /* Genesis 1.1: And then, there was time ! */
   clocks_set_cpufreq(cpufreq);
@@ -351,6 +394,16 @@ int main(int argc, char *argv[]) {
   if (myrank == 0) {
     message("Reading runtime parameters from file '%s'", paramFileName);
     parser_read_file(paramFileName, params);
+
+    /* Handle any command-line overrides. */
+    if (nparams > 0) {
+      message(
+          "Overwriting values read from the YAML file with command-line "
+          "values.");
+      for (int k = 0; k < nparams; k++) parser_set_param(params, cmdparams[k]);
+    }
+
+    /* And dump the parameters as used. */
     // parser_print_params(&params);
     parser_write_params_to_file(params, "used_parameters.yml");
   }
@@ -359,6 +412,15 @@ int main(int argc, char *argv[]) {
   MPI_Bcast(params, sizeof(struct swift_params), MPI_BYTE, 0, MPI_COMM_WORLD);
 #endif
 
+  /* Check that we can write the snapshots by testing if the output
+   * directory exists and is searchable and writable. */
+  char basename[PARSER_MAX_LINE_SIZE];
+  parser_get_param_string(params, "Snapshots:basename", basename);
+  const char *dirp = dirname(basename);
+  if (access(dirp, W_OK | X_OK) != 0) {
+    error("Cannot write snapshots in directory %s (%s)", dirp, strerror(errno));
+  }
+
   /* Prepare the domain decomposition scheme */
   struct repartition reparttype;
 #ifdef WITH_MPI
@@ -403,6 +465,8 @@ int main(int argc, char *argv[]) {
   parser_get_param_string(params, "InitialConditions:file_name", ICfileName);
   const int replicate =
       parser_get_opt_param_int(params, "InitialConditions:replicate", 1);
+  const int clean_h_values =
+      parser_get_opt_param_int(params, "InitialConditions:cleanup_h", 0);
   if (myrank == 0) message("Reading ICs from file '%s'", ICfileName);
   fflush(stdout);
 
@@ -509,6 +573,11 @@ int main(int argc, char *argv[]) {
     message("nr of cells at depth %i is %i.", data[0], data[1]);
   }
 
+/* Initialise the table of Ewald corrections for the gravity checks */
+#ifdef SWIFT_GRAVITY_FORCE_CHECKS
+  if (periodic) gravity_exact_force_ewald_init(dim[0]);
+#endif
+
   /* Initialise the external potential properties */
   struct external_potential potential;
   if (with_external_gravity)
@@ -604,7 +673,7 @@ int main(int argc, char *argv[]) {
 #endif
 
   /* Initialise the particles */
-  engine_init_particles(&e, flag_entropy_ICs);
+  engine_init_particles(&e, flag_entropy_ICs, clean_h_values);
 
   /* Write the state of the system before starting time integration. */
   engine_dump_snapshot(&e);
@@ -656,14 +725,16 @@ int main(int argc, char *argv[]) {
           /* Open file and position at end. */
           file_thread = fopen(dumpfile, "a");
 
-          fprintf(file_thread, " %03i 0 0 0 0 %lli %lli 0 0 0 0 %lli\n", myrank,
-                  e.tic_step, e.toc_step, cpufreq);
+          fprintf(file_thread, " %03i 0 0 0 0 %lli %lli %zi %zi %zi 0 0 %lli\n",
+                  myrank, e.tic_step, e.toc_step, e.updates, e.g_updates,
+                  e.s_updates, cpufreq);
           int count = 0;
           for (int l = 0; l < e.sched.nr_tasks; l++) {
             if (!e.sched.tasks[l].implicit && e.sched.tasks[l].toc != 0) {
               fprintf(
-                  file_thread, " %03i %i %i %i %i %lli %lli %i %i %i %i %i\n",
-                  myrank, e.sched.tasks[l].rid, e.sched.tasks[l].type,
+                  file_thread,
+                  " %03i %i %i %i %i %lli %lli %i %i %i %i %i %i\n", myrank,
+                  e.sched.tasks[l].rid, e.sched.tasks[l].type,
                   e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL),
                   e.sched.tasks[l].tic, e.sched.tasks[l].toc,
                   (e.sched.tasks[l].ci != NULL) ? e.sched.tasks[l].ci->count
@@ -674,7 +745,7 @@ int main(int argc, char *argv[]) {
                                                 : 0,
                   (e.sched.tasks[l].cj != NULL) ? e.sched.tasks[l].cj->gcount
                                                 : 0,
-                  e.sched.tasks[l].flags);
+                  e.sched.tasks[l].flags, e.sched.tasks[l].sid);
             }
             fflush(stdout);
             count++;
@@ -692,25 +763,43 @@ int main(int argc, char *argv[]) {
       FILE *file_thread;
       file_thread = fopen(dumpfile, "w");
       /* Add some information to help with the plots */
-      fprintf(file_thread, " %i %i %i %i %lli %lli %i %i %i %lli\n", -2, -1, -1,
-              1, e.tic_step, e.toc_step, 0, 0, 0, cpufreq);
+      fprintf(file_thread, " %i %i %i %i %lli %lli %zi %zi %zi %i %lli\n", -2,
+              -1, -1, 1, e.tic_step, e.toc_step, e.updates, e.g_updates,
+              e.s_updates, 0, cpufreq);
       for (int l = 0; l < e.sched.nr_tasks; l++) {
         if (!e.sched.tasks[l].implicit && e.sched.tasks[l].toc != 0) {
           fprintf(
-              file_thread, " %i %i %i %i %lli %lli %i %i %i %i\n",
+              file_thread, " %i %i %i %i %lli %lli %i %i %i %i %i\n",
               e.sched.tasks[l].rid, e.sched.tasks[l].type,
               e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL),
               e.sched.tasks[l].tic, e.sched.tasks[l].toc,
               (e.sched.tasks[l].ci == NULL) ? 0 : e.sched.tasks[l].ci->count,
               (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->count,
               (e.sched.tasks[l].ci == NULL) ? 0 : e.sched.tasks[l].ci->gcount,
-              (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->gcount);
+              (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->gcount,
+              e.sched.tasks[l].sid);
         }
       }
       fclose(file_thread);
 #endif  // WITH_MPI
     }
 #endif  // SWIFT_DEBUG_TASKS
+
+#ifdef SWIFT_DEBUG_THREADPOOL
+    /* Dump the task data using the given frequency. */
+    if (dump_threadpool && (dump_threadpool == 1 || j % dump_threadpool == 1)) {
+      char dumpfile[40];
+#ifdef WITH_MPI
+      snprintf(dumpfile, sizeof(dumpfile), "threadpool_info-rank%d-step%d.dat",
+               engine_rank, j + 1);
+#else
+      snprintf(dumpfile, sizeof(dumpfile), "threadpool_info-step%d.dat", j + 1);
+#endif  // WITH_MPI
+      threadpool_dump_log(&e.threadpool, dumpfile, 1);
+    } else {
+      threadpool_reset_log(&e.threadpool);
+    }
+#endif  // SWIFT_DEBUG_THREADPOOL
   }
 
 /* Print the values of the runner histogram. */
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 8006c1a325845d6e9fec655b809310a63daa9ddb..9c3cee7630edf1be1e161a3e70547f06e6108ebd 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -8,12 +8,12 @@ InternalUnitSystem:
 
 # Parameters for the task scheduling
 Scheduler:
-  nr_queues:             0        # (Optional) The number of task queues to use. Use 0  to let the system decide.
-  cell_max_size:         8000000  # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
-  cell_sub_size:         64000000 # (Optional) Maximal number of interactions per sub-task  (this is the default value).
-  cell_split_size:       400      # (Optional) Maximal number of particles per cell (this is the default value).
-  cell_max_count:        10000    # (Optional) Maximal number of particles per cell allowed before triggering a sanitizing (this is the default value).
-  max_top_level_cells:   12       # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
+  nr_queues:             0         # (Optional) The number of task queues to use. Use 0  to let the system decide.
+  cell_max_size:         8000000   # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
+  cell_sub_size_pair:    256000000 # (Optional) Maximal number of interactions per sub-pair task  (this is the default value).
+  cell_sub_size_self:    32000     # (Optional) Maximal number of interactions per sub-self task  (this is the default value).
+  cell_split_size:       400       # (Optional) Maximal number of particles per cell (this is the default value).
+  max_top_level_cells:   12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
 
 # Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.)
 TimeIntegration:
@@ -43,23 +43,25 @@ Statistics:
 # Parameters for the hydrodynamics scheme
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
-  delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_ghost_iterations:  30       # (Optional) Maximal number of iterations allowed to converge towards the smoothing length.
-  max_volume_change:     2.       # (Optional) Maximal allowed change of kernel volume over one time-step
+  h_tolerance:           1e-4     # (Optional) Relative accuracy of the Newton-Raphson scheme for the smoothing lengths.
   h_max:                 10.      # (Optional) Maximal allowed smoothing length in internal units. Defaults to FLT_MAX if unspecified.
+  max_volume_change:     1.4      # (Optional) Maximal allowed change of kernel volume over one time-step.
+  max_ghost_iterations:  30       # (Optional) Maximal number of iterations allowed to converge towards the smoothing length.
 
 # Parameters for the self-gravity scheme
 Gravity:
-  eta:                   0.025    # Constant dimensionless multiplier for time integration.
-  theta:                 0.7      # Opening angle (Multipole acceptance criterion)
-  epsilon:               0.1      # Softening length (in internal units).
-  a_smooth:              1.25     # (Optional) Smoothing scale in top-level cell sizes to smooth the long-range forces over (this is the default value).
-  r_cut:                 4.5      # (Optional) Cut-off in number of top-level cells beyond which no FMM forces are computed (this is the default value).
+  eta:          0.025    # Constant dimensionless multiplier for time integration.
+  theta:        0.7      # Opening angle (Multipole acceptance criterion)
+  epsilon:      0.1      # Softening length (in internal units).
+  a_smooth:     1.25     # (Optional) Smoothing scale in top-level cell sizes to smooth the long-range forces over (this is the default value).
+  r_cut_max:    4.5      # (Optional) Cut-off in number of top-level cells beyond which no FMM forces are computed (this is the default value).
+  r_cut_min:    0.1      # (Optional) Cut-off in number of top-level cells below which no truncation of FMM forces are performed (this is the default value).
 
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  SedovBlast/sedov.hdf5 # The file to read
+  cleanup_h:   0                    # (Optional) Clean the values of h that are read in. Set to 1 to activate.
   h_scaling:  1.                    # (Optional) A scaling factor to apply to all smoothing lengths in the ICs.
   shift_x:    0.                    # (Optional) A shift to apply to all particles read from the ICs (in internal units).
   shift_y:    0.
@@ -103,7 +105,9 @@ IsothermalPotential:
 DiscPatchPotential:
   surface_density: 10.      # Surface density of the disc (internal units)
   scale_height:    100.     # Scale height of the disc (internal units)
-  z_disc:          200.     # Position of the disc along the z-axis (internal units)
+  z_disc:          400.     # Position of the disc along the z-axis (internal units)
+  z_trunc:         300.     # (Optional) Distance from the disc along z-axis above which the potential gets truncated.
+  z_max:           380.     # (Optional) Distance from the disc along z-axis above which the potential is set to 0.
   timestep_mult:   0.03     # Dimensionless pre-factor for the time-step condition
   growth_time:     5.       # (Optional) Time for the disc to grow to its final size (multiple of the dynamical time)
 
diff --git a/examples/plot_tasks.py b/examples/plot_tasks.py
index 88f176687db8116cfd4370970769164985e4d366..c49020939cca8f744db352631b2ec47267d7bd20 100755
--- a/examples/plot_tasks.py
+++ b/examples/plot_tasks.py
@@ -78,7 +78,7 @@ PLOT_PARAMS = {"axes.labelsize": 10,
                "figure.figsize" : (args.width, args.height),
                "figure.subplot.left" : 0.03,
                "figure.subplot.right" : 0.995,
-               "figure.subplot.bottom" : 0.1,
+               "figure.subplot.bottom" : 0.09,
                "figure.subplot.top" : 0.99,
                "figure.subplot.wspace" : 0.,
                "figure.subplot.hspace" : 0.,
@@ -91,17 +91,18 @@ pl.rcParams.update(PLOT_PARAMS)
 TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair",
              "init_grav", "ghost", "extra_ghost", "drift_part",
              "drift_gpart", "kick1", "kick2", "timestep", "send", "recv",
-             "grav_top_level", "grav_long_range", "grav_mm", "grav_down",
-             "cooling", "sourceterms", "count"]
+             "grav_top_level", "grav_long_range", "grav_ghost", "grav_mm",
+             "grav_down", "cooling", "sourceterms", "count"]
 
 SUBTYPES = ["none", "density", "gradient", "force", "grav", "external_grav",
             "tend", "xv", "rho", "gpart", "multipole", "spart", "count"]
 
 #  Task/subtypes of interest.
 FULLTYPES = ["self/force", "self/density", "self/grav", "sub_self/force",
-             "sub_self/density", "pair/force", "pair/density", "pair/grav",
-             "sub_pair/force",
-             "sub_pair/density", "recv/xv", "send/xv", "recv/rho", "send/rho",
+             "sub_self/density", "sub_self/grav", "pair/force", "pair/density",
+             "pair/grav", "sub_pair/force",
+             "sub_pair/density", "sub_pair/grav", "recv/xv", "send/xv",
+             "recv/rho", "send/rho",
              "recv/tend", "send/tend"]
 
 #  A number of colours for the various types. Recycled when there are
@@ -109,7 +110,7 @@ FULLTYPES = ["self/force", "self/density", "self/grav", "sub_self/force",
 colours = ["cyan", "lightgray", "darkblue", "yellow", "tan", "dodgerblue",
            "sienna", "aquamarine", "bisque", "blue", "green", "lightgreen",
            "brown", "purple", "moccasin", "olivedrab", "chartreuse",
-           "darksage", "darkgreen", "green", "mediumseagreen",
+           "steelblue", "darkgreen", "green", "mediumseagreen",
            "mediumaquamarine", "darkslategrey", "mediumturquoise",
            "black", "cadetblue", "skyblue", "red", "slategray", "gold",
            "slateblue", "blueviolet", "mediumorchid", "firebrick",
@@ -183,7 +184,7 @@ ecounter = []
 for i in range(nthread):
     ecounter.append(0)
 
-num_lines = pl.size(data) / 10
+num_lines = pl.size(data) / pl.size(full_step)
 for line in range(num_lines):
     thread = int(data[line,0])
 
@@ -243,21 +244,21 @@ for i in range(nthread):
 #  Legend and room for it.
 nrow = len(typesseen) / 5
 if not args.nolegend:
-    if len(typesseen) * 5 < nrow:
-        nrow = nrow + 1
     ax.fill_between([0, 0], nthread+0.5, nthread + nrow + 0.5, facecolor="white")
-    ax.set_ylim(0, nthread + nrow + 1)
-    ax.legend(loc=1, shadow=True, mode="expand", ncol=5)
-
+    ax.set_ylim(0, nthread + 0.5)
+    ax.legend(loc=1, shadow=True, bbox_to_anchor=(0., 1.05 ,1., 0.2), mode="expand", ncol=5)
+    box = ax.get_position()
+    ax.set_position([box.x0, box.y0, box.width, box.height*0.8])
+    
 # Start and end of time-step
 ax.plot([0, 0], [0, nthread + nrow + 1], 'k--', linewidth=1)
 ax.plot([end_t, end_t], [0, nthread + nrow + 1], 'k--', linewidth=1)
 
-ax.set_xlabel("Wall clock time [ms]")
+ax.set_xlabel("Wall clock time [ms]", labelpad=0.)
 if expand == 1:
-    ax.set_ylabel("Thread ID" )
+    ax.set_ylabel("Thread ID", labelpad=0 )
 else:
-    ax.set_ylabel("Thread ID * " + str(expand) )
+    ax.set_ylabel("Thread ID * " + str(expand), labelpad=0 )
 ax.set_yticks(pl.array(range(nthread)), True)
 
 loc = plticker.MultipleLocator(base=expand)
diff --git a/examples/plot_tasks_MPI.py b/examples/plot_tasks_MPI.py
index 83465aee87e8b641775d760fa4db2f06b125dd8b..85d7c54567a66c9c2151732e0e7a11c6580f958b 100755
--- a/examples/plot_tasks_MPI.py
+++ b/examples/plot_tasks_MPI.py
@@ -278,12 +278,12 @@ for rank in range(nranks):
 
     #  Legend and room for it.
     nrow = len(typesseen) / 5
-    if len(typesseen) * 5 < nrow:
-        nrow = nrow + 1
     ax.fill_between([0, 0], nethread+0.5, nethread + nrow + 0.5, facecolor="white")
-    ax.set_ylim(0, nethread + nrow + 1)
+    ax.set_ylim(0, nethread + 0.5)
     if data.size > 0:
-        ax.legend(loc=1, shadow=True, mode="expand", ncol=5)
+        ax.legend(loc=1, shadow=True, bbox_to_anchor=(0., 1.05 ,1., 0.2), mode="expand", ncol=5)
+        box = ax.get_position()
+        ax.set_position([box.x0, box.y0, box.width, box.height*0.8])
 
     # Start and end of time-step
     ax.plot([0, 0], [0, nethread + nrow + 1], 'k--', linewidth=1)
diff --git a/examples/plot_threadpool.py b/examples/plot_threadpool.py
new file mode 100755
index 0000000000000000000000000000000000000000..495fc3e0c532d9cafbf96e622decbc8179869160
--- /dev/null
+++ b/examples/plot_threadpool.py
@@ -0,0 +1,271 @@
+#!/usr/bin/env python
+"""
+Usage:
+    plot_threadpool.py [options] input.dat output.png
+
+where input.dat is a threadpool info file for a step.  Use the '-Y interval'
+flag of the swift command to create these. The output plot will be called
+'output.png'. The --limit option can be used to produce plots with the same
+time span and the --expand option to expand each thread line into '*expand'
+lines, so that adjacent tasks of the same type can be distinguished. Other
+options can be seen using the --help flag.
+
+This file is part of SWIFT.
+Copyright (c) 2015 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+                   Bert Vandenbroucke (bert.vandenbroucke@ugent.be)
+                   Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+          (c) 2017 Peter W. Draper (p.w.draper@durham.ac.uk)
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published
+by the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.collections as collections
+import matplotlib.ticker as plticker
+import pylab as pl
+import sys
+import argparse
+
+#  Handle the command line.
+parser = argparse.ArgumentParser(description="Plot threadpool function graphs")
+
+parser.add_argument("input", help="Threadpool data file (-Y output)")
+parser.add_argument("outpng", help="Name for output graphic file (PNG)")
+parser.add_argument("-l", "--limit", dest="limit",
+                    help="Upper time limit in millisecs (def: depends on data)",
+                    default=0, type=int)
+parser.add_argument("-e", "--expand", dest="expand",
+                    help="Thread expansion factor (def: 1)",
+                    default=1, type=int)
+parser.add_argument("--height", dest="height",
+                    help="Height of plot in inches (def: 4)",
+                    default=4., type=float)
+parser.add_argument("--width", dest="width",
+                    help="Width of plot in inches (def: 16)",
+                    default=16., type=float)
+parser.add_argument("--nolegend", dest="nolegend",
+                    help="Whether to show the legend (def: False)",
+                    default=False, action="store_true")
+parser.add_argument("-v", "--verbose", dest="verbose",
+                    help="Show colour assignments and other details (def: False)",
+                    default=False, action="store_true")
+
+args = parser.parse_args()
+infile = args.input
+outpng = args.outpng
+delta_t = args.limit
+expand = args.expand
+
+#  Basic plot configuration.
+PLOT_PARAMS = {"axes.labelsize": 10,
+               "axes.titlesize": 10,
+               "font.size": 12,
+               "legend.fontsize": 12,
+               "xtick.labelsize": 10,
+               "ytick.labelsize": 10,
+               "figure.figsize" : (args.width, args.height),
+               "figure.subplot.left" : 0.03,
+               "figure.subplot.right" : 0.995,
+               "figure.subplot.bottom" : 0.09,
+               "figure.subplot.top" : 0.99,
+               "figure.subplot.wspace" : 0.,
+               "figure.subplot.hspace" : 0.,
+               "lines.markersize" : 6,
+               "lines.linewidth" : 3.
+               }
+pl.rcParams.update(PLOT_PARAMS)
+
+#  A number of colours for the various types. Recycled when there are
+#  more task types than colours...
+colours = ["cyan", "lightgray", "darkblue", "yellow", "tan", "dodgerblue",
+           "sienna", "aquamarine", "bisque", "blue", "green", "lightgreen",
+           "brown", "purple", "moccasin", "olivedrab", "chartreuse",
+           "steelblue", "darkgreen", "green", "mediumseagreen",
+           "mediumaquamarine", "darkslategrey", "mediumturquoise",
+           "black", "cadetblue", "skyblue", "red", "slategray", "gold",
+           "slateblue", "blueviolet", "mediumorchid", "firebrick",
+           "magenta", "hotpink", "pink", "orange", "lightgreen"]
+maxcolours = len(colours)
+
+#  Read header. First two lines.
+with open(infile) as infid:
+    head = [next(infid) for x in xrange(2)]
+header = head[1][2:].strip()
+header = eval(header)
+nthread = int(header['num_threads']) + 1
+CPU_CLOCK = float(header['cpufreq']) / 1000.0
+print "Number of threads: ", nthread
+if args.verbose:
+    print "CPU frequency:", CPU_CLOCK * 1000.0
+
+#  Read input.
+data = pl.genfromtxt(infile, dtype=None, delimiter=" ")
+
+#  Mixed types, so need to separate.
+tics = []
+tocs = []
+funcs = []
+threads = []
+chunks = []
+for i in data:
+    if i[0] != "#":
+        funcs.append(i[0].replace("_mapper", ""))
+        if i[1] < 0:
+            threads.append(nthread-1)
+        else:
+            threads.append(i[1])
+        chunks.append(i[2])
+        tics.append(i[3])
+        tocs.append(i[4])
+tics = pl.array(tics)
+tocs = pl.array(tocs)
+funcs = pl.array(funcs)
+threads = pl.array(threads)
+chunks = pl.array(chunks)
+
+
+#  Recover the start and end time
+tic_step = min(tics)
+toc_step = max(tocs)
+
+#   Not known.
+
+#  Calculate the time range, if not given.
+delta_t = delta_t * CPU_CLOCK
+if delta_t == 0:
+    dt = toc_step - tic_step
+    if dt > delta_t:
+        delta_t = dt
+    print "Data range: ", delta_t / CPU_CLOCK, "ms"
+
+#  Once more doing the real gather and plots this time.
+start_t = float(tic_step)
+tics -= tic_step
+tocs -= tic_step
+end_t = (toc_step - start_t) / CPU_CLOCK
+
+#  Get all "task" names and assign colours.
+TASKTYPES = pl.unique(funcs)
+print TASKTYPES
+
+#  Set colours of task/subtype.
+TASKCOLOURS = {}
+ncolours = 0
+for task in TASKTYPES:
+    TASKCOLOURS[task] = colours[ncolours]
+    ncolours = (ncolours + 1) % maxcolours
+
+#  For fiddling with colours...
+if args.verbose:
+    print "#Selected colours:"
+    for task in sorted(TASKCOLOURS.keys()):
+        print "# " + task + ": " + TASKCOLOURS[task]
+    # NOTE: threadpool plots have no task subtypes, so there are no
+    # SUBCOLOURS to report here (unlike plot_tasks.py).
+
+tasks = {}
+tasks[-1] = []
+for i in range(nthread*expand):
+    tasks[i] = []
+
+#  Counters for each thread when expanding.
+ecounter = []
+for i in range(nthread):
+    ecounter.append(0)
+
+for i in range(len(threads)):
+    thread = threads[i]
+
+    # Expand to cover extra lines if expanding.
+    ethread = thread * expand + (ecounter[thread] % expand)
+    ecounter[thread] = ecounter[thread] + 1
+    thread = ethread
+
+    tasks[thread].append({})
+    tasks[thread][-1]["type"] = funcs[i]
+    tic = tics[i] / CPU_CLOCK
+    toc = tocs[i] / CPU_CLOCK
+    tasks[thread][-1]["tic"] = tic
+    tasks[thread][-1]["toc"] = toc
+    tasks[thread][-1]["colour"] = TASKCOLOURS[funcs[i]]
+
+# Use expanded threads from now on.
+nthread = nthread * expand
+
+typesseen = []
+fig = pl.figure()
+ax = fig.add_subplot(1,1,1)
+ax.set_xlim(-delta_t * 0.01 / CPU_CLOCK, delta_t * 1.01 / CPU_CLOCK)
+ax.set_ylim(0, nthread)
+
+# Fake thread is used to colour the whole range, do that first.
+tictocs = []
+colours = []
+j = 0
+for task in tasks[nthread - expand]:
+    tictocs.append((task["tic"], task["toc"] - task["tic"]))
+    colours.append(task["colour"])
+ax.broken_barh(tictocs, [0,(nthread-1)], facecolors = colours, linewidth=0, alpha=0.15)
+
+# And we don't plot the fake thread.
+nthread = nthread - expand
+for i in range(nthread):
+
+    #  Collect ranges and colours into arrays.
+    tictocs = []
+    colours = []
+    j = 0
+    for task in tasks[i]:
+        tictocs.append((task["tic"], task["toc"] - task["tic"]))
+        colours.append(task["colour"])
+
+        #  Legend support, collections don't add to this.
+        qtask = task["type"]
+        if qtask not in typesseen:
+            pl.plot([], [], color=task["colour"], label=qtask)
+            typesseen.append(qtask)
+
+    #  Now plot.
+    ax.broken_barh(tictocs, [i+0.05,0.90], facecolors = colours, linewidth=0)
+
+#  Legend and room for it.
+nrow = len(typesseen) / 5
+if not args.nolegend:
+    ax.fill_between([0, 0], nthread+0.5, nthread + nrow + 0.5, facecolor="white")
+    ax.set_ylim(0, nthread + 0.5)
+    ax.legend(loc=1, shadow=True, bbox_to_anchor=(0., 1.05 ,1., 0.2), mode="expand", ncol=5)
+    box = ax.get_position()
+    ax.set_position([box.x0, box.y0, box.width, box.height*0.8])
+    
+# Start and end of time-step
+ax.plot([0, 0], [0, nthread + nrow + 1], 'k--', linewidth=1)
+ax.plot([end_t, end_t], [0, nthread + nrow + 1], 'k--', linewidth=1)
+
+ax.set_xlabel("Wall clock time [ms]", labelpad=0.)
+if expand == 1:
+    ax.set_ylabel("Thread ID", labelpad=0 )
+else:
+    ax.set_ylabel("Thread ID * " + str(expand), labelpad=0 )
+ax.set_yticks(pl.array(range(nthread)), True)
+
+loc = plticker.MultipleLocator(base=expand)
+ax.yaxis.set_major_locator(loc)
+ax.grid(True, which='major', axis="y", linestyle="-")
+
+pl.show()
+pl.savefig(outpng)
+print "Graphics done, output written to", outpng
+
+sys.exit(0)
diff --git a/examples/process_plot_tasks_MPI b/examples/process_plot_tasks_MPI
index b2672b3711823eb87d0bede5b1ffd8945a735f98..691822ebc33b43450d69b06e49c2c95bb0683045 100755
--- a/examples/process_plot_tasks_MPI
+++ b/examples/process_plot_tasks_MPI
@@ -62,7 +62,9 @@ nrank=$(($nrank-1))
 #  And process them,
 echo "Processing thread info files..."
 echo $list | xargs -P $NPROCS -n 3 /bin/bash -c "./plot_tasks_MPI.py --expand 1 --limit $TIMERANGE \$0 \$2 "
-echo $list | xargs -P $NPROCS -n 3 /bin/bash -c "./analyse_tasks_MPI.py \$0 > \$2.stats"
+for i in $(seq 0 $nrank); do
+    echo $list | xargs -P $NPROCS -n 3 /bin/bash -c "./analyse_tasks_MPI.py -r $i \$0 > \$2${i}.stats"
+done
 
 echo "Writing output index.html file"
 #  Construct document - serial.
@@ -93,7 +95,7 @@ EOF2
 <img src="step${s}r${i}.png">
 <pre>
 EOF2
-cat step${s}r.stats >> step${s}r${i}.html
+cat step${s}r${i}.stats >> step${s}r${i}.html
 cat <<EOF2 >> step${s}r${i}.html
 </pre>
 </body>
diff --git a/examples/process_plot_threadpool b/examples/process_plot_threadpool
new file mode 100755
index 0000000000000000000000000000000000000000..343c1559ee37d6714ac32e5305457eddbb7e6414
--- /dev/null
+++ b/examples/process_plot_threadpool
@@ -0,0 +1,108 @@
+#!/bin/bash
+#
+# Usage:
+#  process_plot_threadpool nprocess [time-range-ms]
+#
+# Description:
+#  Process all the threadpool info files in the current directory
+#  creating function graphs for steps and threads.
+#
+#  The input files are created by a run using the "-Y interval" flag and
+#  should be named "threadpool_info-step<n>.dat" in the current directory.
+#  All located files will be processed using "nprocess" concurrent
+#  processes and all plots will have the same time range if one is given.
+#  An output HTML file "index.html" will be created to view all the plots.
+#
+#
+# This file is part of SWIFT:
+#
+#  Copyright (C) 2017 Peter W. Draper (p.w.draper@durham.ac.uk)
+#  All Rights Reserved.
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU Lesser General Public License as published
+#  by the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public License
+#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#  Handle command-line
+if test "$1" = ""; then
+    echo "Usage: $0 nprocess [time-range-ms]"
+    exit 1
+fi
+NPROCS=$1
+TIMERANGE=0
+LIMIT="(autoranged)"
+if test "$2" != ""; then
+    TIMERANGE=$2
+    LIMIT=""
+fi
+
+#  Find all thread info files. Use version sort to get into correct order.
+files=$(ls -v threadpool_info-step*.dat)
+if test $? != 0; then
+    echo "Failed to find any threadpool info files"
+    exit 1
+fi
+
+#  Construct list of names, the step no and names for the graphics.
+list=""
+for f in $files; do
+    s=$(echo $f| sed 's,threadpool_info-step\(.*\).dat,\1,')
+    list="$list $f $s poolstep${s}r"
+done
+
+#  And process them,
+echo "Processing threadpool info files..."
+echo $list | xargs -P $NPROCS -n 3 /bin/bash -c "./plot_threadpool.py --expand 1 --limit $TIMERANGE --width 16 --height 4 \$0 \$2 "
+echo $list | xargs -P $NPROCS -n 3 /bin/bash -c "./analyse_threadpool_tasks.py \$0 > \$2.stats"
+
+echo "Writing output threadpool-index.html file"
+#  Construct document - serial.
+cat <<EOF > threadpool-index.html
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <title>SWIFT threadpool tasks $LIMIT</title>
+  </head>
+  <body>
+  <h1>SWIFT threadpool tasks $LIMIT</h1>
+EOF
+
+echo $list | xargs -n 3 | while read f s g; do
+    cat <<EOF >> threadpool-index.html
+<h2>Step $s</h2>
+EOF
+    cat <<EOF >> threadpool-index.html
+<a href="poolstep${s}r${i}.html"><img src="poolstep${s}r${i}.png" width=400px/></a>
+EOF
+    cat <<EOF > poolstep${s}r${i}.html
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+<body>
+<img src="poolstep${s}r${i}.png">
+<pre>
+EOF
+cat poolstep${s}r${i}.stats >> poolstep${s}r${i}.html
+cat <<EOF >> poolstep${s}r${i}.html
+</pre>
+</body></html>
+EOF
+
+done
+
+cat <<EOF >> threadpool-index.html
+  </body>
+</html>
+EOF
+
+echo "Finished"
+
+exit
diff --git a/src/Makefile.am b/src/Makefile.am
index 2ddcdb0908201c65053d7cc5380a4217277b5c13..ec01184928faf3d58b2d0890965a745d05718354 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -64,7 +64,7 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h
 		 kernel_long_gravity.h vector.h cache.h runner_doiact.h runner_doiact_vec.h runner_doiact_grav.h runner_doiact_fft.h \
                  runner_doiact_nosort.h units.h intrinsics.h minmax.h kick.h timestep.h drift.h adiabatic_index.h io_properties.h \
 		 dimension.h equation_of_state.h part_type.h periodic.h \
-		 gravity.h gravity_io.h \
+		 gravity.h gravity_io.h gravity_cache.h \
 		 gravity/Default/gravity.h gravity/Default/gravity_iact.h gravity/Default/gravity_io.h \
 		 gravity/Default/gravity_debug.h gravity/Default/gravity_part.h  \
 		 sourceterms.h \
@@ -86,6 +86,7 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h
                  hydro/Gizmo/hydro_slope_limiters_cell.h \
                  hydro/Gizmo/hydro_slope_limiters_face.h \
                  hydro/Gizmo/hydro_slope_limiters.h \
+                 hydro/Gizmo/hydro_flux_limiters.h \
                  hydro/Gizmo/hydro_unphysical.h \
                  hydro/Gizmo/hydro_velocities.h \
                  hydro/Shadowswift/hydro_debug.h \
diff --git a/src/align.h b/src/align.h
index 915af33e6e2ba59be1a0849c4de0e2f1bd5b0d96..54435c4c9baa1ce9dc511e2903b7e2be2d6655de 100644
--- a/src/align.h
+++ b/src/align.h
@@ -23,9 +23,71 @@
  * @brief The default struct alignment in SWIFT.
  */
 #define SWIFT_STRUCT_ALIGNMENT 32
+
 /**
  * @brief Defines alignment of structures
  */
 #define SWIFT_STRUCT_ALIGN __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT)))
 
+/**
+ * @brief The default cache alignment in SWIFT.
+ */
+#define SWIFT_CACHE_ALIGNMENT 64
+
+/**
+ * @brief Defines alignment of caches
+ */
+#define SWIFT_CACHE_ALIGN __attribute__((aligned(SWIFT_CACHE_ALIGNMENT)))
+
+/**
+ * @brief Macro to tell the compiler that a given array has the specified
+ * alignment.
+ *
+ * Note that this turns into a no-op but gives information to the compiler.
+ *
+ * @param array The array.
+ * @param alignment The alignment in bytes of the array.
+ */
+#if defined(__ICC)
+#define swift_align_information(array, alignment) \
+  __assume_aligned(array, alignment);
+#elif defined(__GNUC__)
+#define swift_align_information(array, alignment) \
+  array = __builtin_assume_aligned(array, alignment);
+#else
+#define swift_align_information(array, alignment) ;
+#endif
+
+/**
+ * @brief Macro to create a restrict pointer to an array and tell the compiler
+ * that the given array has the specified
+ * alignment.
+ *
+ * Note that the pointer declaration is real code; only the alignment hint
+ * part is a no-op that gives information to the compiler.
+ *
+ * @param array The array.
+ * @param ptr Pointer to array
+ * @param type Type of array
+ * @param alignment The alignment in bytes of the array.
+ */
+#define swift_declare_aligned_ptr(type, array, ptr, alignment) \
+  type *restrict array = ptr;                                  \
+  swift_align_information(array, alignment);
+
+/**
+ * @brief Macro to tell the compiler that a given number is 0 modulo a given
+ * size.
+ *
+ * Note that this turns into a no-op but gives information to the compiler.
+ * GCC does not have the equivalent built-in so defaults to nothing.
+ *
+ * @param var The variable
+ * @param size The modulo of interest.
+ */
+#if defined(__ICC)
+#define swift_assume_size(var, size) __assume(var % size == 0);
+#else
+#define swift_assume_size(var, size) ;
+#endif
+
 #endif /* SWIFT_ALIGN_H */
diff --git a/src/approx_math.h b/src/approx_math.h
index ad07adeb4f3b1b54ca5f33d80eabb6a004d2a3aa..48319ddfd7a86c132a1cd18b4a08fa849a36a15a 100644
--- a/src/approx_math.h
+++ b/src/approx_math.h
@@ -36,4 +36,17 @@ __attribute__((always_inline)) INLINE static float approx_expf(float x) {
   return 1.f + x * (1.f + x * (0.5f + x * (1.f / 6.f + 1.f / 24.f * x)));
 }
 
+/**
+ * @brief Approximate version of expf(x) using a 6th order Taylor expansion
+ *
+ */
+__attribute__((always_inline)) INLINE static float good_approx_expf(float x) {
+  return 1.f +
+         x * (1.f +
+              x * (0.5f +
+                   x * ((1.f / 6.f) +
+                        x * ((1.f / 24.f) +
+                             x * ((1.f / 120.f) + (1.f / 720.f) * x)))));
+}
+
 #endif /* SWIFT_APPROX_MATH_H */
diff --git a/src/cache.h b/src/cache.h
index 6739c2020e897d54e6586c9d121490aaab5661bc..70c63f72a45d730c826f039f535e7e8c5d467f64 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -23,6 +23,7 @@
 #include "../config.h"
 
 /* Local headers */
+#include "align.h"
 #include "cell.h"
 #include "error.h"
 #include "part.h"
@@ -30,9 +31,7 @@
 #include "vector.h"
 
 #define NUM_VEC_PROC 2
-#define CACHE_ALIGN 64
 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)
-#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE
 
 #ifdef WITH_VECTORIZATION
 /* Cache struct to hold a local copy of a cells' particle
@@ -40,46 +39,46 @@
 struct cache {
 
   /* Particle x position. */
-  float *restrict x __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict x SWIFT_CACHE_ALIGN;
 
   /* Particle y position. */
-  float *restrict y __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict y SWIFT_CACHE_ALIGN;
 
   /* Particle z position. */
-  float *restrict z __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict z SWIFT_CACHE_ALIGN;
 
   /* Particle smoothing length. */
-  float *restrict h __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict h SWIFT_CACHE_ALIGN;
 
   /* Particle mass. */
-  float *restrict m __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict m SWIFT_CACHE_ALIGN;
 
   /* Particle x velocity. */
-  float *restrict vx __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vx SWIFT_CACHE_ALIGN;
 
   /* Particle y velocity. */
-  float *restrict vy __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vy SWIFT_CACHE_ALIGN;
 
   /* Particle z velocity. */
-  float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vz SWIFT_CACHE_ALIGN;
 
+  /* Maximum index into neighbouring cell for particles that are in range. */
+  int *restrict max_index SWIFT_CACHE_ALIGN;
+  
   /* Particle density. */
-  float *restrict rho __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict rho SWIFT_CACHE_ALIGN;
 
   /* Particle smoothing length gradient. */
-  float *restrict grad_h __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict grad_h SWIFT_CACHE_ALIGN;
 
   /* Pressure over density squared. */
-  float *restrict pOrho2 __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict pOrho2 SWIFT_CACHE_ALIGN;
 
   /* Balsara switch. */
-  float *restrict balsara __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict balsara SWIFT_CACHE_ALIGN;
 
   /* Particle sound speed. */
-  float *restrict soundspeed __attribute__((aligned(CACHE_ALIGN)));
-
-  /* Maximum distance of particles into neighbouring cell. */
-  float *restrict max_d __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict soundspeed SWIFT_CACHE_ALIGN;
 
   /* Cache size. */
   int count;
@@ -90,46 +89,46 @@ struct cache {
 struct c2_cache {
 
   /* Separation between two particles squared. */
-  float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x separation between two particles. */
-  float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y separation between two particles. */
-  float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z separation between two particles. */
-  float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Mass of particle pj. */
-  float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x velocity of particle pj. */
-  float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y velocity of particle pj. */
-  float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z velocity of particle pj. */
-  float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Density of particle pj. */
-  float rhoq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float rhoq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Smoothing length gradient of particle pj. */
-  float grad_hq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float grad_hq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Pressure over density squared of particle pj. */
-  float pOrho2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float pOrho2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Balsara switch of particle pj. */
-  float balsaraq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float balsaraq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Sound speed of particle pj. */
-  float soundspeedq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float soundspeedq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Inverse smoothing length of particle pj. */
-  float h_invq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float h_invq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 };
 
 /**
@@ -144,9 +143,10 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
   /* Align cache on correct byte boundary and pad cache size to be a multiple of
    * the vector size
    * and include 2 vector lengths for remainder operations. */
-  unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
+  size_t pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
   if (rem > 0) pad += VEC_SIZE - rem;
-  unsigned int sizeBytes = (count + pad) * sizeof(float);
+  size_t sizeBytes = (count + pad) * sizeof(float);
+  size_t sizeIntBytes = (count + pad) * sizeof(int);
   int error = 0;
 
   /* Free memory if cache has already been allocated. */
@@ -159,28 +159,29 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
     free(c->vy);
     free(c->vz);
     free(c->h);
+    free(c->max_index);
     free(c->rho);
     free(c->grad_h);
     free(c->pOrho2);
     free(c->balsara);
     free(c->soundspeed);
-    free(c->max_d);
   }
 
-  error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->rho, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->grad_h, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->pOrho2, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->balsara, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->soundspeed, CACHE_ALIGN, sizeBytes);
+  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT,
+                          sizeIntBytes);
+  error += posix_memalign((void **)&c->rho, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->grad_h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->pOrho2, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->balsara, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->soundspeed, SWIFT_CACHE_ALIGNMENT, sizeBytes);
 
   if (error != 0)
     error("Couldn't allocate cache, no. of particles: %d", (int)count);
@@ -194,162 +195,100 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
  * @param ci_cache The cache.
  */
 __attribute__((always_inline)) INLINE void cache_read_particles(
-    const struct cell *const ci, struct cache *const ci_cache) {
+    const struct cell *restrict const ci,
+    struct cache *restrict const ci_cache) {
 
 #if defined(GADGET2_SPH)
 
-/* Shift the particles positions to a local frame so single precision can be
- * used instead of double precision. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
+
+  const struct part *restrict parts = ci->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
+  /* Shift the particles positions to a local frame so single precision can be
+   * used instead of double precision. */
   for (int i = 0; i < ci->count; i++) {
-    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
-    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
-    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2];
-    ci_cache->h[i] = ci->parts[i].h;
-
-    ci_cache->m[i] = ci->parts[i].mass;
-    ci_cache->vx[i] = ci->parts[i].v[0];
-    ci_cache->vy[i] = ci->parts[i].v[1];
-    ci_cache->vz[i] = ci->parts[i].v[2];
-
-    ci_cache->rho[i] = ci->parts[i].rho;
-    ci_cache->grad_h[i] = ci->parts[i].force.f;
-    ci_cache->pOrho2[i] = ci->parts[i].force.P_over_rho2;
-    ci_cache->balsara[i] = ci->parts[i].force.balsara;
-    ci_cache->soundspeed[i] = ci->parts[i].force.soundspeed;
+    x[i] = (float)(parts[i].x[0] - loc[0]);
+    y[i] = (float)(parts[i].x[1] - loc[1]);
+    z[i] = (float)(parts[i].x[2] - loc[2]);
+    h[i] = parts[i].h;
+
+    m[i] = parts[i].mass;
+    vx[i] = parts[i].v[0];
+    vy[i] = parts[i].v[1];
+    vz[i] = parts[i].v[2];
   }
 
 #endif
 }
 
 /**
- * @brief Populate cache by reading in the particles from two cells in unsorted
- * order.
+ * @brief Populate cache by reading in the particles in unsorted order.
  *
- * @param ci The i #cell.
- * @param cj The j #cell.
- * @param ci_cache The cache for cell ci.
- * @param cj_cache The cache for cell cj.
- * @param shift The amount to shift the particle positions to account for BCs
+ * @param ci The #cell.
+ * @param ci_cache The cache.
  */
-__attribute__((always_inline)) INLINE void cache_read_two_cells(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
-    const double *const shift) {
-
-  /* Shift the particles positions to a local frame (ci frame) so single
-   * precision can be
-   * used instead of double precision. Also shift the cell ci, particles
-   * positions due to BCs but leave cell cj. */
-  for (int i = 0; i < ci->count; i++) {
-    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[i] = ci->parts[i].h;
-
-    ci_cache->m[i] = ci->parts[i].mass;
-    ci_cache->vx[i] = ci->parts[i].v[0];
-    ci_cache->vy[i] = ci->parts[i].v[1];
-    ci_cache->vz[i] = ci->parts[i].v[2];
-  }
-
-  for (int i = 0; i < cj->count; i++) {
-    cj_cache->x[i] = cj->parts[i].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[i].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[i].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[i].h;
-
-    cj_cache->m[i] = cj->parts[i].mass;
-    cj_cache->vx[i] = cj->parts[i].v[0];
-    cj_cache->vy[i] = cj->parts[i].v[1];
-    cj_cache->vz[i] = cj->parts[i].v[2];
-  }
-}
-
-__attribute__((always_inline)) INLINE void cache_read_cell_sorted(
-    const struct cell *const ci, struct cache *const ci_cache,
-    const struct entry *restrict sort_i, double *const loc,
-    double *const shift) {
-
-  int idx;
-/* Shift the particles positions to a local frame (ci frame) so single precision
- * can be
- * used instead of double precision. Also shift the cell ci, particles positions
- * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
-#endif
-  for (int i = 0; i < ci->count; i++) {
-    idx = sort_i[i].i;
-
-    ci_cache->x[i] = ci->parts[idx].x[0] - loc[0] - shift[0];
-    ci_cache->y[i] = ci->parts[idx].x[1] - loc[1] - shift[1];
-    ci_cache->z[i] = ci->parts[idx].x[2] - loc[2] - shift[2];
-    ci_cache->h[i] = ci->parts[idx].h;
-
-    ci_cache->m[i] = ci->parts[idx].mass;
-    ci_cache->vx[i] = ci->parts[idx].v[0];
-    ci_cache->vy[i] = ci->parts[idx].v[1];
-    ci_cache->vz[i] = ci->parts[idx].v[2];
-  }
-}
+__attribute__((always_inline)) INLINE void cache_read_force_particles(
+    const struct cell *restrict const ci,
+    struct cache *restrict const ci_cache) {
 
-/**
- * @brief Populate cache by reading in the particles from two cells in sorted
- * order.
- *
- * @param ci The i #cell.
- * @param cj The j #cell.
- * @param ci_cache The #cache for cell ci.
- * @param cj_cache The #cache for cell cj.
- * @param sort_i The array of sorted particle indices for cell ci.
- * @param sort_j The array of sorted particle indices for cell ci.
- * @param shift The amount to shift the particle positions to account for BCs
- */
-__attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
-    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
-    const double *const shift) {
+#if defined(GADGET2_SPH)
 
-  int idx;
-/* Shift the particles positions to a local frame (ci frame) so single precision
- * can be
- * used instead of double precision. Also shift the cell ci, particles positions
- * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
-#endif
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, balsara, ci_cache->balsara, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed, SWIFT_CACHE_ALIGNMENT);
+
+  const struct part *restrict parts = ci->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
+  /* Shift the particles positions to a local frame so single precision can be
+   * used instead of double precision. */
   for (int i = 0; i < ci->count; i++) {
-    idx = sort_i[i].i;
-    ci_cache->x[i] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[i] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[i] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[i] = ci->parts[idx].h;
-
-    ci_cache->m[i] = ci->parts[idx].mass;
-    ci_cache->vx[i] = ci->parts[idx].v[0];
-    ci_cache->vy[i] = ci->parts[idx].v[1];
-    ci_cache->vz[i] = ci->parts[idx].v[2];
+    x[i] = (float)(parts[i].x[0] - loc[0]);
+    y[i] = (float)(parts[i].x[1] - loc[1]);
+    z[i] = (float)(parts[i].x[2] - loc[2]);
+    h[i] = parts[i].h;
+
+    m[i] = parts[i].mass;
+    vx[i] = parts[i].v[0];
+    vy[i] = parts[i].v[1];
+    vz[i] = parts[i].v[2];
+    
+    rho[i] = parts[i].rho;
+    grad_h[i] = parts[i].force.f;
+    pOrho2[i] = parts[i].force.P_over_rho2;
+    balsara[i] = parts[i].force.balsara;
+    soundspeed[i] = parts[i].force.soundspeed;
   }
 
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma simd
 #endif
-  for (int i = 0; i < cj->count; i++) {
-    idx = sort_j[i].i;
-    cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[idx].h;
-
-    cj_cache->m[i] = cj->parts[idx].mass;
-    cj_cache->vx[i] = cj->parts[idx].v[0];
-    cj_cache->vy[i] = cj->parts[idx].v[1];
-    cj_cache->vz[i] = cj->parts[idx].v[2];
-  }
 }
 
 /**
@@ -370,13 +309,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
  * interaction.
  */
 __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
-    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
-    const double *const shift, int *first_pi, int *last_pj,
-    const int num_vec_proc) {
+    const struct cell *restrict const ci, const struct cell *restrict const cj,
+    struct cache *restrict const ci_cache,
+    struct cache *restrict const cj_cache, const struct entry *restrict sort_i,
+    const struct entry *restrict sort_j, const double *restrict const shift,
+    int *first_pi, int *last_pj, const int num_vec_proc) {
 
-  int idx, ci_cache_idx;
+  int idx;
   /* Pad number of particles read to the vector size. */
   int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
   if (rem != 0) {
@@ -394,74 +333,97 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
 
   int first_pi_align = *first_pi;
   int last_pj_align = *last_pj;
-
-/* Shift the particles positions to a local frame (ci frame) so single precision
- * can be
- * used instead of double precision. Also shift the cell ci, particles positions
- * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
-  for (int i = first_pi_align; i < ci->count; i++) {
-    /* Make sure ci_cache is filled from the first element. */
-    ci_cache_idx = i - first_pi_align;
-    idx = sort_i[i].i;
-    ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
-
-    ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
-    ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
-    ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
-    ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
+  const struct part *restrict parts_i = ci->parts;
+  const struct part *restrict parts_j = cj->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
+
+  int ci_cache_count = ci->count - first_pi_align;
+  /* Shift the particles positions to a local frame (ci frame) so
+   * single precision can be used instead of double precision.
+   * Also shift the cell ci particles positions due to BCs, but
+   * leave cell cj alone.
+   * Keeping positions relative to ci->loc keeps the values small
+   * enough to be represented accurately in single precision. */
+  for (int i = 0; i < ci_cache_count; i++) {
+    idx = sort_i[i + first_pi_align].i;
+    x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]);
+    y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]);
+    z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]);
+    h[i] = parts_i[idx].h;
+
+    m[i] = parts_i[idx].mass;
+    vx[i] = parts_i[idx].v[0];
+    vy[i] = parts_i[idx].v[1];
+    vz[i] = parts_i[idx].v[2];
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
-  float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0];
+  float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0];
   for (int i = ci->count - first_pi_align;
        i < ci->count - first_pi_align + VEC_SIZE; i++) {
-    ci_cache->x[i] = fake_pix;
-    ci_cache->y[i] = 1.f;
-    ci_cache->z[i] = 1.f;
-    ci_cache->h[i] = 1.f;
-
-    ci_cache->m[i] = 1.f;
-    ci_cache->vx[i] = 1.f;
-    ci_cache->vy[i] = 1.f;
-    ci_cache->vz[i] = 1.f;
+    x[i] = fake_pix;
+    y[i] = 1.f;
+    z[i] = 1.f;
+    h[i] = 1.f;
+
+    m[i] = 1.f;
+    vx[i] = 1.f;
+    vy[i] = 1.f;
+    vz[i] = 1.f;
   }
 
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
+
   for (int i = 0; i <= last_pj_align; i++) {
     idx = sort_j[i].i;
-    cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[idx].h;
-
-    cj_cache->m[i] = cj->parts[idx].mass;
-    cj_cache->vx[i] = cj->parts[idx].v[0];
-    cj_cache->vy[i] = cj->parts[idx].v[1];
-    cj_cache->vz[i] = cj->parts[idx].v[2];
+    xj[i] = (float)(parts_j[idx].x[0] - loc[0]);
+    yj[i] = (float)(parts_j[idx].x[1] - loc[1]);
+    zj[i] = (float)(parts_j[idx].x[2] - loc[2]);
+    hj[i] = parts_j[idx].h;
+
+    mj[i] = parts_j[idx].mass;
+    vxj[i] = parts_j[idx].v[0];
+    vyj[i] = parts_j[idx].v[1];
+    vzj[i] = parts_j[idx].v[2];
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
   float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
   for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
-    cj_cache->x[i] = fake_pjx;
-    cj_cache->y[i] = 1.f;
-    cj_cache->z[i] = 1.f;
-    cj_cache->h[i] = 1.f;
-
-    cj_cache->m[i] = 1.f;
-    cj_cache->vx[i] = 1.f;
-    cj_cache->vy[i] = 1.f;
-    cj_cache->vz[i] = 1.f;
+    xj[i] = fake_pjx;
+    yj[i] = 1.f;
+    zj[i] = 1.f;
+    hj[i] = 1.f;
+
+    mj[i] = 1.f;
+    vxj[i] = 1.f;
+    vyj[i] = 1.f;
+    vzj[i] = 1.f;
   }
 }
 
@@ -479,7 +441,7 @@ static INLINE void cache_clean(struct cache *c) {
     free(c->vy);
     free(c->vz);
     free(c->h);
-    free(c->max_d);
+    free(c->max_index);
   }
 }
 
diff --git a/src/cell.c b/src/cell.c
index dbccfd2f42cabf38417cd87de0450489240884be..4502f5d265dc68540e16ed0e51e681cf5733f842 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -941,53 +941,52 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset,
  * @brief Sanitizes the smoothing length values of cells by setting large
  * outliers to more sensible values.
  *
- * We compute the mean and standard deviation of the smoothing lengths in
- * logarithmic space and limit values to mean + 4 sigma.
+ * Each cell with <1000 particles will be processed. We limit h to be the size of
+ * the cell and replace 0s with a good estimate.
  *
  * @param c The cell.
+ * @param treated Has the cell already been sanitized at this level ?
  */
-void cell_sanitize(struct cell *c) {
+void cell_sanitize(struct cell *c, int treated) {
 
   const int count = c->count;
   struct part *parts = c->parts;
+  float h_max = 0.f;
 
-  /* First collect some statistics */
-  float h_mean = 0.f, h_mean2 = 0.f;
-  float h_min = FLT_MAX, h_max = 0.f;
-  for (int i = 0; i < count; ++i) {
+  /* Treat cells with <1000 particles */
+  if (count < 1000 && !treated) {
 
-    const float h = logf(parts[i].h);
-    h_mean += h;
-    h_mean2 += h * h;
-    h_max = max(h_max, h);
-    h_min = min(h_min, h);
-  }
-  h_mean /= count;
-  h_mean2 /= count;
-  const float h_var = h_mean2 - h_mean * h_mean;
-  const float h_std = (h_var > 0.f) ? sqrtf(h_var) : 0.1f * h_mean;
-
-  /* Choose a cut */
-  const float h_limit = expf(h_mean + 4.f * h_std);
-
-  /* Be verbose this is not innocuous */
-  message("Cell properties: h_min= %f h_max= %f geometric mean= %f.",
-          expf(h_min), expf(h_max), expf(h_mean));
+    /* Get an upper bound on h */
+    const float upper_h_max = c->dmin / (1.2f * kernel_gamma);
 
-  if (c->h_max > h_limit) {
+    /* Apply it */
+    for (int i = 0; i < count; ++i) {
+      if (parts[i].h == 0.f || parts[i].h > upper_h_max)
+        parts[i].h = upper_h_max;
+    }
+  }
 
-    message("Smoothing lengths will be limited to (mean + 4sigma)= %f.",
-            h_limit);
+  /* Recurse and gather the new h_max values */
+  if (c->split) {
 
-    /* Apply the cut */
-    for (int i = 0; i < count; ++i) parts->h = min(parts[i].h, h_limit);
+    for (int k = 0; k < 8; ++k) {
+      if (c->progeny[k] != NULL) {
 
-    c->h_max = h_limit;
+        /* Recurse */
+        cell_sanitize(c->progeny[k], (count < 1000));
 
+        /* And collect */
+        h_max = max(h_max, c->progeny[k]->h_max);
+      }
+    }
   } else {
 
-    message("Smoothing lengths will not be limited.");
+    /* Get the new value of h_max */
+    for (int i = 0; i < count; ++i) h_max = max(h_max, parts[i].h);
   }
+
+  /* Record the change */
+  c->h_max = h_max;
 }
 
 /**
@@ -1280,7 +1279,11 @@ void cell_check_multipole(struct cell *c, void *data) {
  */
 void cell_clean(struct cell *c) {
 
-  free(c->sort);
+  for (int i = 0; i < 13; i++)
+    if (c->sort[i] != NULL) {
+      free(c->sort[i]);
+      c->sort[i] = NULL;
+    }
 
   /* Recurse */
   for (int k = 0; k < 8; k++)
@@ -1316,6 +1319,355 @@ int cell_is_drift_needed(struct cell *c, const struct engine *e) {
   return 0;
 }
 
+/**
+ * @brief Clear the drift flags on the given cell.
+ */
+void cell_clear_drift_flags(struct cell *c, void *data) {
+  c->do_drift = 0;
+  c->do_sub_drift = 0;
+}
+
+/**
+ * @brief Activate the drifts on the given cell.
+ */
+void cell_activate_drift_part(struct cell *c, struct scheduler *s) {
+
+  /* If this cell is already marked for drift, quit early. */
+  if (c->do_drift) return;
+
+  /* Mark this cell for drifting. */
+  c->do_drift = 1;
+
+  /* Set the do_sub_drifts all the way up and activate the super drift
+     if this has not yet been done. */
+  if (c == c->super) {
+    scheduler_activate(s, c->drift_part);
+  } else {
+    for (struct cell *parent = c->parent;
+         parent != NULL && !parent->do_sub_drift; parent = parent->parent) {
+      parent->do_sub_drift = 1;
+      if (parent == c->super) {
+        scheduler_activate(s, parent->drift_part);
+        break;
+      }
+    }
+  }
+}
+
+/**
+ * @brief Activate the sorts up a cell hierarchy.
+ */
+
+void cell_activate_sorts_up(struct cell *c, struct scheduler *s) {
+  if (c == c->super) {
+    scheduler_activate(s, c->sorts);
+    if (c->nodeID == engine_rank) cell_activate_drift_part(c, s);
+  } else {
+    for (struct cell *parent = c->parent;
+         parent != NULL && !parent->do_sub_sort; parent = parent->parent) {
+      parent->do_sub_sort = 1;
+      if (parent == c->super) {
+        scheduler_activate(s, parent->sorts);
+        if (parent->nodeID == engine_rank) cell_activate_drift_part(parent, s);
+        break;
+      }
+    }
+  }
+}
+
+/**
+ * @brief Activate the sorts on a given cell, if needed.
+ */
+void cell_activate_sorts(struct cell *c, int sid, struct scheduler *s) {
+
+  /* Do we need to re-sort? */
+  if (c->dx_max_sort > space_maxreldx * c->dmin) {
+
+    /* Climb up the tree to activate the sorts in that direction */
+    for (struct cell *finger = c; finger != NULL; finger = finger->parent) {
+      if (finger->requires_sorts) {
+        atomic_or(&finger->do_sort, finger->requires_sorts);
+        cell_activate_sorts_up(finger, s);
+      }
+      finger->sorted = 0;
+    }
+  }
+
+  /* Has this cell been sorted at all for the given sid? */
+  if (!(c->sorted & (1 << sid)) || c->nodeID != engine_rank) {
+    atomic_or(&c->do_sort, (1 << sid));
+    cell_activate_sorts_up(c, s);
+  }
+}
+
+/**
+ * @brief Traverse a sub-cell task and activate the sort tasks along the way.
+ */
+void cell_activate_subcell_tasks(struct cell *ci, struct cell *cj,
+                                 struct scheduler *s) {
+  const struct engine *e = s->space->e;
+
+  /* Store the current dx_max and h_max values. */
+  ci->dx_max_old = ci->dx_max_part;
+  ci->h_max_old = ci->h_max;
+  if (cj != NULL) {
+    cj->dx_max_old = cj->dx_max_part;
+    cj->h_max_old = cj->h_max;
+  }
+
+  /* Self interaction? */
+  if (cj == NULL) {
+    /* Do anything? */
+    if (!cell_is_active(ci, e)) return;
+
+    /* Recurse? */
+    if (cell_can_recurse_in_self_task(ci)) {
+
+      /* Loop over all progenies and pairs of progenies */
+      for (int j = 0; j < 8; j++) {
+        if (ci->progeny[j] != NULL) {
+          cell_activate_subcell_tasks(ci->progeny[j], NULL, s);
+          for (int k = j + 1; k < 8; k++)
+            if (ci->progeny[k] != NULL)
+              cell_activate_subcell_tasks(ci->progeny[j], ci->progeny[k], s);
+        }
+      }
+    } else {
+
+      /* We have reached the bottom of the tree: activate drift */
+      cell_activate_drift_part(ci, s);
+    }
+  }
+
+  /* Otherwise, pair interaction, recurse? */
+  else if (cell_can_recurse_in_pair_task(ci) &&
+           cell_can_recurse_in_pair_task(cj)) {
+
+    /* Get the type of pair if not specified explicitly. */
+    double shift[3];
+    int sid = space_getsid(s->space, &ci, &cj, shift);
+
+    /* Different types of flags. */
+    switch (sid) {
+
+      /* Regular sub-cell interactions of a single cell. */
+      case 0: /* (  1 ,  1 ,  1 ) */
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s);
+        break;
+
+      case 1: /* (  1 ,  1 ,  0 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[0], s);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[1], s);
+        break;
+
+      case 2: /* (  1 ,  1 , -1 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s);
+        break;
+
+      case 3: /* (  1 ,  0 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[0], s);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[2], s);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[2], s);
+        break;
+
+      case 4: /* (  1 ,  0 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[0], s);
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[1], s);
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[2], s);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[3], s);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[0], s);
+        if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[1], s);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[2], s);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[3], s);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[0], s);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s);
+        if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[2], s);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[3], s);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[1], s);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[2], s);
+        if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[3], s);
+        break;
+
+      case 5: /* (  1 ,  0 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[1], s);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[3], s);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[3], s);
+        break;
+
+      case 6: /* (  1 , -1 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[2], s);
+        break;
+
+      case 7: /* (  1 , -1 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[2], s);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[3], s);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[2], s);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[3], s);
+        break;
+
+      case 8: /* (  1 , -1 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[3], s);
+        break;
+
+      case 9: /* (  0 ,  1 ,  1 ) */
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[0], s);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[4], s);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[4], s);
+        break;
+
+      case 10: /* (  0 ,  1 ,  0 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[0], s);
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[1], s);
+        if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[4], s);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[5], s);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[0], s);
+        if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[1], s);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[4], s);
+        if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[5], s);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[0], s);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s);
+        if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[4], s);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[5], s);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[1], s);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[4], s);
+        if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[5], s);
+        break;
+
+      case 11: /* (  0 ,  1 , -1 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[1], s);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[5], s);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[5], s);
+        break;
+
+      case 12: /* (  0 ,  0 ,  1 ) */
+        if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[1], cj->progeny[0], s);
+        if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[1], cj->progeny[2], s);
+        if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[1], cj->progeny[4], s);
+        if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[1], cj->progeny[6], s);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[0], s);
+        if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[2], s);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[4], s);
+        if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[6], s);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[0], s);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[2], s);
+        if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[4], s);
+        if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[6], s);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[2], s);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[4], s);
+        if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
+          cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[6], s);
+        break;
+    }
+
+  }
+
+  /* Otherwise, activate the sorts and drifts. */
+  else if (cell_is_active(ci, e) || cell_is_active(cj, e)) {
+
+    /* Get the type of pair if not specified explicitly. */
+    double shift[3];
+    int sid = space_getsid(s->space, &ci, &cj, shift);
+
+    /* We are going to interact this pair, so store some values. */
+    atomic_or(&ci->requires_sorts, 1 << sid);
+    atomic_or(&cj->requires_sorts, 1 << sid);
+    ci->dx_max_sort_old = ci->dx_max_sort;
+    cj->dx_max_sort_old = cj->dx_max_sort;
+
+    /* Activate the drifts if the cells are local. */
+    if (ci->nodeID == engine_rank) cell_activate_drift_part(ci, s);
+    if (cj->nodeID == engine_rank) cell_activate_drift_part(cj, s);
+
+    /* Do we need to sort the cells? */
+    cell_activate_sorts(ci, sid, s);
+    cell_activate_sorts(cj, sid, s);
+  }
+}
+
 /**
  * @brief Un-skips all the tasks associated with a given cell and checks
  * if the space needs to be rebuilt.
@@ -1327,10 +1679,7 @@ int cell_is_drift_needed(struct cell *c, const struct engine *e) {
  */
 int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
 
-#ifdef WITH_MPI
   struct engine *e = s->space->e;
-#endif
-
   int rebuild = 0;
 
   /* Un-skip the density tasks involved with this cell. */
@@ -1338,33 +1687,31 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
     struct task *t = l->t;
     struct cell *ci = t->ci;
     struct cell *cj = t->cj;
-    scheduler_activate(s, t);
 
-    /* Set the correct sorting flags */
-    if (t->type == task_type_pair) {
-      if (ci->dx_max_sort > space_maxreldx * ci->dmin) {
-        for (struct cell *finger = ci; finger != NULL; finger = finger->parent)
-          finger->sorted = 0;
-      }
-      if (cj->dx_max_sort > space_maxreldx * cj->dmin) {
-        for (struct cell *finger = cj; finger != NULL; finger = finger->parent)
-          finger->sorted = 0;
-      }
-      if (!(ci->sorted & (1 << t->flags))) {
-#ifdef SWIFT_DEBUG_CHECKS
-        if (!(ci->sorts->flags & (1 << t->flags)))
-          error("bad flags in sort task.");
-#endif
-        scheduler_activate(s, ci->sorts);
-        if (ci->nodeID == engine_rank) scheduler_activate(s, ci->drift_part);
+    /* Only activate tasks that involve a local active cell. */
+    if ((cell_is_active(ci, e) && ci->nodeID == engine_rank) ||
+        (cj != NULL && cell_is_active(cj, e) && cj->nodeID == engine_rank)) {
+      scheduler_activate(s, t);
+
+      /* Set the correct sorting flags */
+      if (t->type == task_type_pair) {
+        /* Store some values. */
+        atomic_or(&ci->requires_sorts, 1 << t->flags);
+        atomic_or(&cj->requires_sorts, 1 << t->flags);
+        ci->dx_max_sort_old = ci->dx_max_sort;
+        cj->dx_max_sort_old = cj->dx_max_sort;
+
+        /* Activate the drift tasks. */
+        if (ci->nodeID == engine_rank) cell_activate_drift_part(ci, s);
+        if (cj->nodeID == engine_rank) cell_activate_drift_part(cj, s);
+
+        /* Check the sorts and activate them if needed. */
+        cell_activate_sorts(ci, t->flags, s);
+        cell_activate_sorts(cj, t->flags, s);
       }
-      if (!(cj->sorted & (1 << t->flags))) {
-#ifdef SWIFT_DEBUG_CHECKS
-        if (!(cj->sorts->flags & (1 << t->flags)))
-          error("bad flags in sort task.");
-#endif
-        scheduler_activate(s, cj->sorts);
-        if (cj->nodeID == engine_rank) scheduler_activate(s, cj->drift_part);
+      /* Store current values of dx_max and h_max. */
+      else if (t->type == task_type_sub_pair || t->type == task_type_sub_self) {
+        cell_activate_subcell_tasks(t->ci, t->cj, s);
       }
     }
 
@@ -1373,55 +1720,60 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
 
       /* Check whether there was too much particle motion, i.e. the
          cell neighbour conditions were violated. */
-      if (max(ci->h_max, cj->h_max) + ci->dx_max_part + cj->dx_max_part >
-          cj->dmin)
-        rebuild = 1;
+      if (cell_need_rebuild_for_pair(ci, cj)) rebuild = 1;
 
 #ifdef WITH_MPI
-      /* Activate the send/recv flags. */
+      /* Activate the send/recv tasks. */
       if (ci->nodeID != engine_rank) {
 
-        /* Activate the tasks to recv foreign cell ci's data. */
-        scheduler_activate(s, ci->recv_xv);
-        if (cell_is_active(ci, e)) {
-          scheduler_activate(s, ci->recv_rho);
+        /* If the local cell is active, receive data from the foreign cell. */
+        if (cell_is_active(cj, e)) {
+          scheduler_activate(s, ci->recv_xv);
+          if (cell_is_active(ci, e)) {
+            scheduler_activate(s, ci->recv_rho);
 #ifdef EXTRA_HYDRO_LOOP
-          scheduler_activate(s, ci->recv_gradient);
+            scheduler_activate(s, ci->recv_gradient);
 #endif
-          scheduler_activate(s, ci->recv_ti);
+          }
         }
 
-        /* Look for the local cell cj's send tasks. */
-        struct link *l = NULL;
-        for (l = cj->send_xv; l != NULL && l->t->cj->nodeID != ci->nodeID;
-             l = l->next)
-          ;
-        if (l == NULL) error("Missing link to send_xv task.");
-        scheduler_activate(s, l->t);
-
-        /* Drift both cells, the foreign one at the level which it is sent. */
-        if (l->t->ci->drift_part)
-          scheduler_activate(s, l->t->ci->drift_part);
-        else
-          error("Drift task missing !");
-        if (t->type == task_type_pair) scheduler_activate(s, cj->drift_part);
-
-        if (cell_is_active(cj, e)) {
+        /* If the foreign cell is active, we want its ti_end values. */
+        if (cell_is_active(ci, e)) scheduler_activate(s, ci->recv_ti);
 
-          for (l = cj->send_rho; l != NULL && l->t->cj->nodeID != ci->nodeID;
+        /* Look for the local cell cj's send tasks. */
+        if (cell_is_active(ci, e)) {
+          struct link *l = NULL;
+          for (l = cj->send_xv; l != NULL && l->t->cj->nodeID != ci->nodeID;
                l = l->next)
             ;
-          if (l == NULL) error("Missing link to send_rho task.");
+          if (l == NULL) error("Missing link to send_xv task.");
           scheduler_activate(s, l->t);
 
+          /* Drift the cell which will be sent; note that not all sent
+             particles will be drifted, only those that are needed. */
+          cell_activate_drift_part(cj, s);
+
+          if (cell_is_active(cj, e)) {
+            struct link *l = NULL;
+            for (l = cj->send_rho; l != NULL && l->t->cj->nodeID != ci->nodeID;
+                 l = l->next)
+              ;
+            if (l == NULL) error("Missing link to send_rho task.");
+            scheduler_activate(s, l->t);
+
 #ifdef EXTRA_HYDRO_LOOP
-          for (l = cj->send_gradient;
-               l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next)
-            ;
-          if (l == NULL) error("Missing link to send_gradient task.");
-          scheduler_activate(s, l->t);
+            for (l = cj->send_gradient;
+                 l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next)
+              ;
+            if (l == NULL) error("Missing link to send_gradient task.");
+            scheduler_activate(s, l->t);
 #endif
+          }
+        }
 
+        /* If the local cell is active, send its ti_end values. */
+        if (cell_is_active(cj, e)) {
+          struct link *l = NULL;
           for (l = cj->send_ti; l != NULL && l->t->cj->nodeID != ci->nodeID;
                l = l->next)
             ;
@@ -1431,87 +1783,92 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
 
       } else if (cj->nodeID != engine_rank) {
 
-        /* Activate the tasks to recv foreign cell cj's data. */
-        scheduler_activate(s, cj->recv_xv);
-        if (cell_is_active(cj, e)) {
-          scheduler_activate(s, cj->recv_rho);
+        /* If the local cell is active, receive data from the foreign cell. */
+        if (cell_is_active(ci, e)) {
+          scheduler_activate(s, cj->recv_xv);
+          if (cell_is_active(cj, e)) {
+            scheduler_activate(s, cj->recv_rho);
 #ifdef EXTRA_HYDRO_LOOP
-          scheduler_activate(s, cj->recv_gradient);
+            scheduler_activate(s, cj->recv_gradient);
 #endif
-          scheduler_activate(s, cj->recv_ti);
+          }
         }
 
-        /* Look for the local cell ci's send tasks. */
-        struct link *l = NULL;
-        for (l = ci->send_xv; l != NULL && l->t->cj->nodeID != cj->nodeID;
-             l = l->next)
-          ;
-        if (l == NULL) error("Missing link to send_xv task.");
-        scheduler_activate(s, l->t);
-
-        /* Drift both cells, the foreign one at the level which it is sent. */
-        if (l->t->ci->drift_part)
-          scheduler_activate(s, l->t->ci->drift_part);
-        else
-          error("Drift task missing !");
-        if (t->type == task_type_pair) scheduler_activate(s, ci->drift_part);
-
-        if (cell_is_active(ci, e)) {
+        /* If the foreign cell is active, we want its ti_end values. */
+        if (cell_is_active(cj, e)) scheduler_activate(s, cj->recv_ti);
 
-          for (l = ci->send_rho; l != NULL && l->t->cj->nodeID != cj->nodeID;
+        /* Look for the local cell ci's send tasks. */
+        if (cell_is_active(cj, e)) {
+          struct link *l = NULL;
+          for (l = ci->send_xv; l != NULL && l->t->cj->nodeID != cj->nodeID;
                l = l->next)
             ;
-          if (l == NULL) error("Missing link to send_rho task.");
+          if (l == NULL) error("Missing link to send_xv task.");
           scheduler_activate(s, l->t);
 
+          /* Drift the cell which will be sent; note that not all sent
+             particles will be drifted, only those that are needed. */
+          cell_activate_drift_part(ci, s);
+
+          if (cell_is_active(ci, e)) {
+
+            struct link *l = NULL;
+            for (l = ci->send_rho; l != NULL && l->t->cj->nodeID != cj->nodeID;
+                 l = l->next)
+              ;
+            if (l == NULL) error("Missing link to send_rho task.");
+            scheduler_activate(s, l->t);
+
 #ifdef EXTRA_HYDRO_LOOP
-          for (l = ci->send_gradient;
-               l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next)
-            ;
-          if (l == NULL) error("Missing link to send_gradient task.");
-          scheduler_activate(s, l->t);
+            for (l = ci->send_gradient;
+                 l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next)
+              ;
+            if (l == NULL) error("Missing link to send_gradient task.");
+            scheduler_activate(s, l->t);
 #endif
+          }
+        }
 
+        /* If the local cell is active, send its ti_end values. */
+        if (cell_is_active(ci, e)) {
+          struct link *l = NULL;
           for (l = ci->send_ti; l != NULL && l->t->cj->nodeID != cj->nodeID;
                l = l->next)
             ;
           if (l == NULL) error("Missing link to send_ti task.");
           scheduler_activate(s, l->t);
         }
-      } else if (t->type == task_type_pair) {
-        scheduler_activate(s, ci->drift_part);
-        scheduler_activate(s, cj->drift_part);
-      }
-#else
-      if (t->type == task_type_pair) {
-        scheduler_activate(s, ci->drift_part);
-        scheduler_activate(s, cj->drift_part);
       }
 #endif
     }
   }
 
   /* Unskip all the other task types. */
-  for (struct link *l = c->gradient; l != NULL; l = l->next)
-    scheduler_activate(s, l->t);
-  for (struct link *l = c->force; l != NULL; l = l->next)
-    scheduler_activate(s, l->t);
-  for (struct link *l = c->grav; l != NULL; l = l->next)
-    scheduler_activate(s, l->t);
-  if (c->extra_ghost != NULL) scheduler_activate(s, c->extra_ghost);
-  if (c->ghost != NULL) scheduler_activate(s, c->ghost);
-  if (c->init_grav != NULL) scheduler_activate(s, c->init_grav);
-  if (c->drift_part != NULL) scheduler_activate(s, c->drift_part);
-  if (c->drift_gpart != NULL) scheduler_activate(s, c->drift_gpart);
-  if (c->kick1 != NULL) scheduler_activate(s, c->kick1);
-  if (c->kick2 != NULL) scheduler_activate(s, c->kick2);
-  if (c->timestep != NULL) scheduler_activate(s, c->timestep);
-  if (c->grav_ghost[0] != NULL) scheduler_activate(s, c->grav_ghost[0]);
-  if (c->grav_ghost[1] != NULL) scheduler_activate(s, c->grav_ghost[1]);
-  if (c->grav_down != NULL) scheduler_activate(s, c->grav_down);
-  if (c->grav_long_range != NULL) scheduler_activate(s, c->grav_long_range);
-  if (c->cooling != NULL) scheduler_activate(s, c->cooling);
-  if (c->sourceterms != NULL) scheduler_activate(s, c->sourceterms);
+  if (c->nodeID == engine_rank && cell_is_active(c, e)) {
+
+    for (struct link *l = c->gradient; l != NULL; l = l->next)
+      scheduler_activate(s, l->t);
+    for (struct link *l = c->force; l != NULL; l = l->next)
+      scheduler_activate(s, l->t);
+    for (struct link *l = c->grav; l != NULL; l = l->next)
+      scheduler_activate(s, l->t);
+
+    if (c->extra_ghost != NULL) scheduler_activate(s, c->extra_ghost);
+    if (c->ghost_in != NULL) scheduler_activate(s, c->ghost_in);
+    if (c->ghost_out != NULL) scheduler_activate(s, c->ghost_out);
+    if (c->ghost != NULL) scheduler_activate(s, c->ghost);
+    if (c->init_grav != NULL) scheduler_activate(s, c->init_grav);
+    if (c->drift_gpart != NULL) scheduler_activate(s, c->drift_gpart);
+    if (c->kick1 != NULL) scheduler_activate(s, c->kick1);
+    if (c->kick2 != NULL) scheduler_activate(s, c->kick2);
+    if (c->timestep != NULL) scheduler_activate(s, c->timestep);
+    if (c->grav_ghost[0] != NULL) scheduler_activate(s, c->grav_ghost[0]);
+    if (c->grav_ghost[1] != NULL) scheduler_activate(s, c->grav_ghost[1]);
+    if (c->grav_down != NULL) scheduler_activate(s, c->grav_down);
+    if (c->grav_long_range != NULL) scheduler_activate(s, c->grav_long_range);
+    if (c->cooling != NULL) scheduler_activate(s, c->cooling);
+    if (c->sourceterms != NULL) scheduler_activate(s, c->sourceterms);
+  }
 
   return rebuild;
 }
@@ -1536,13 +1893,21 @@ void cell_set_super(struct cell *c, struct cell *super) {
       if (c->progeny[k] != NULL) cell_set_super(c->progeny[k], super);
 }
 
+void cell_set_super_mapper(void *map_data, int num_elements, void *extra_data) {
+  for (int ind = 0; ind < num_elements; ind++) {
+    struct cell *c = &((struct cell *)map_data)[ind];
+    cell_set_super(c, NULL);
+  }
+}
+
 /**
  * @brief Recursively drifts the #part in a cell hierarchy.
  *
  * @param c The #cell.
  * @param e The #engine (to get ti_current).
+ * @param force Drift the particles irrespective of the #cell flags.
  */
-void cell_drift_part(struct cell *c, const struct engine *e) {
+void cell_drift_part(struct cell *c, const struct engine *e, int force) {
 
   const float hydro_h_max = e->hydro_properties->h_max;
   const double timeBase = e->timeBase;
@@ -1557,11 +1922,19 @@ void cell_drift_part(struct cell *c, const struct engine *e) {
   float dx_max_sort = 0.0f, dx2_max_sort = 0.f;
   float cell_h_max = 0.f;
 
+  /* Drift irrespective of cell flags? */
+  force |= c->do_drift;
+
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that we only drift local cells. */
+  if (c->nodeID != engine_rank) error("Drifting a foreign cell is nope.");
+
   /* Check that we are actually going to move forward. */
   if (ti_current < ti_old_part) error("Attempt to drift to the past");
+#endif  // SWIFT_DEBUG_CHECKS
 
   /* Are we not in a leaf ? */
-  if (c->split) {
+  if (c->split && (force || c->do_sub_drift)) {
 
     /* Loop over the progeny and collect their data. */
     for (int k = 0; k < 8; k++)
@@ -1569,7 +1942,7 @@ void cell_drift_part(struct cell *c, const struct engine *e) {
         struct cell *cp = c->progeny[k];
 
         /* Collect */
-        cell_drift_part(cp, e);
+        cell_drift_part(cp, e, force);
 
         /* Update */
         dx_max = max(dx_max, cp->dx_max_part);
@@ -1577,7 +1950,15 @@ void cell_drift_part(struct cell *c, const struct engine *e) {
         cell_h_max = max(cell_h_max, cp->h_max);
       }
 
-  } else if (ti_current > ti_old_part) {
+    /* Store the values */
+    c->h_max = cell_h_max;
+    c->dx_max_part = dx_max;
+    c->dx_max_sort = dx_max_sort;
+
+    /* Update the time of the last drift */
+    c->ti_old_part = ti_current;
+
+  } else if (!c->split && force && ti_current > ti_old_part) {
 
     /* Loop over all the gas particles in the cell */
     const size_t nr_parts = c->count;
@@ -1616,20 +1997,18 @@ void cell_drift_part(struct cell *c, const struct engine *e) {
     dx_max = sqrtf(dx2_max);
     dx_max_sort = sqrtf(dx2_max_sort);
 
-  } else {
+    /* Store the values */
+    c->h_max = cell_h_max;
+    c->dx_max_part = dx_max;
+    c->dx_max_sort = dx_max_sort;
 
-    cell_h_max = c->h_max;
-    dx_max = c->dx_max_part;
-    dx_max_sort = c->dx_max_sort;
+    /* Update the time of the last drift */
+    c->ti_old_part = ti_current;
   }
 
-  /* Store the values */
-  c->h_max = cell_h_max;
-  c->dx_max_part = dx_max;
-  c->dx_max_sort = dx_max_sort;
-
-  /* Update the time of the last drift */
-  c->ti_old_part = ti_current;
+  /* Clear the drift flags. */
+  c->do_drift = 0;
+  c->do_sub_drift = 0;
 }
 
 /**
diff --git a/src/cell.h b/src/cell.h
index 2e32533402110040310be88629d0fb33f0128c62..e97400623dbb7a66aee981d21883fe4d8f73406a 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -31,15 +31,16 @@
 
 /* Local includes. */
 #include "align.h"
+#include "kernel_hydro.h"
 #include "lock.h"
 #include "multipole.h"
 #include "part.h"
+#include "space.h"
 #include "task.h"
 #include "timeline.h"
 
 /* Avoid cyclic inclusions */
 struct engine;
-struct space;
 struct scheduler;
 
 /* Max tag size set to 2^29 to take into account some MPI implementations
@@ -122,7 +123,7 @@ struct cell {
   struct spart *sparts;
 
   /*! Pointer for the sorted indices. */
-  struct entry *sort;
+  struct entry *sort[13];
 
   /*! Pointers to the next level of cells. */
   struct cell *progeny[8];
@@ -151,7 +152,9 @@ struct cell {
   /*! The multipole initialistation task */
   struct task *init_grav;
 
-  /*! The ghost task */
+  /*! The ghost tasks */
+  struct task *ghost_in;
+  struct task *ghost_out;
   struct task *ghost;
 
   /*! The extra ghost task for complex hydro schemes */
@@ -236,9 +239,6 @@ struct cell {
   /*! Maximum beginning of (integer) time step in this cell. */
   integertime_t ti_beg_max;
 
-  /*! Last (integer) time the cell's sort arrays were updated. */
-  integertime_t ti_sort;
-
   /*! Last (integer) time the cell's part were drifted forward in time. */
   integertime_t ti_old_part;
 
@@ -269,9 +269,6 @@ struct cell {
   /*! Nr of #spart in this cell. */
   int scount;
 
-  /*! The size of the sort array */
-  int sortsize;
-
   /*! Bit-mask indicating the sorted directions */
   unsigned int sorted;
 
@@ -326,7 +323,30 @@ struct cell {
   /*! The maximal depth of this cell and its progenies */
   char maxdepth;
 
+  /*! Values of dx_max and h_max before the drifts, used for sub-cell tasks. */
+  float dx_max_old;
+  float h_max_old;
+  float dx_max_sort_old;
+
+  /*! Bit mask of sort directions that will be needed in the next timestep. */
+  unsigned int requires_sorts;
+
+  /*! Does this cell need to be drifted? */
+  char do_drift;
+
+  /*! Do any of this cell's sub-cells need to be drifted? */
+  char do_sub_drift;
+
+  /*! Bit mask of sorts that need to be computed for this cell. */
+  unsigned int do_sort;
+
+  /*! Do any of this cell's sub-cells need to be sorted? */
+  char do_sub_sort;
+
 #ifdef SWIFT_DEBUG_CHECKS
+  /*! Last (integer) time the cell's sort arrays were updated. */
+  integertime_t ti_sort;
+
   /*! The list of tasks that have been executed on this cell */
   char tasks_executed[64];
 
@@ -344,7 +364,7 @@ struct cell {
 void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset,
                 struct cell_buff *buff, struct cell_buff *sbuff,
                 struct cell_buff *gbuff);
-void cell_sanitize(struct cell *c);
+void cell_sanitize(struct cell *c, int treated);
 int cell_locktree(struct cell *c);
 void cell_unlocktree(struct cell *c);
 int cell_glocktree(struct cell *c);
@@ -373,10 +393,103 @@ void cell_reset_task_counters(struct cell *c);
 int cell_is_drift_needed(struct cell *c, const struct engine *e);
 int cell_unskip_tasks(struct cell *c, struct scheduler *s);
 void cell_set_super(struct cell *c, struct cell *super);
-void cell_drift_part(struct cell *c, const struct engine *e);
+void cell_drift_part(struct cell *c, const struct engine *e, int force);
 void cell_drift_gpart(struct cell *c, const struct engine *e);
 void cell_drift_multipole(struct cell *c, const struct engine *e);
 void cell_drift_all_multipoles(struct cell *c, const struct engine *e);
 void cell_check_timesteps(struct cell *c);
+void cell_store_pre_drift_values(struct cell *c);
+void cell_activate_subcell_tasks(struct cell *ci, struct cell *cj,
+                                 struct scheduler *s);
+void cell_activate_drift_part(struct cell *c, struct scheduler *s);
+void cell_activate_sorts(struct cell *c, int sid, struct scheduler *s);
+void cell_clear_drift_flags(struct cell *c, void *data);
+void cell_set_super_mapper(void *map_data, int num_elements, void *extra_data);
+
+/* Inlined functions (for speed). */
+
+/**
+ * @brief Can a sub-pair hydro task recurse to a lower level based
+ * on the status of the particles in the cell.
+ *
+ * @param c The #cell.
+ */
+__attribute__((always_inline)) INLINE static int cell_can_recurse_in_pair_task(
+    const struct cell *c) {
+
+  /* Is the cell split ? */
+  /* If so, is the cut-off radius plus the max distance the parts have moved */
+  /* smaller than the sub-cell sizes ? */
+  /* Note: We use the _old values as these might have been updated by a drift */
+  return c->split &&
+         ((kernel_gamma * c->h_max_old + c->dx_max_old) < 0.5f * c->dmin);
+}
+
+/**
+ * @brief Can a sub-self hydro task recurse to a lower level based
+ * on the status of the particles in the cell.
+ *
+ * @param c The #cell.
+ */
+__attribute__((always_inline)) INLINE static int cell_can_recurse_in_self_task(
+    const struct cell *c) {
+
+  /* Is the cell split ? */
+  /* Note: No need for more checks here as all the sub-pairs and sub-self */
+  /* operations will be executed. So there is no need for the particle */
+  /* to be at exactly */
+  /* the right place. */
+  return c->split;
+}
+
+/**
+ * @brief Can a pair task associated with a cell be split into smaller
+ * sub-tasks.
+ *
+ * @param c The #cell.
+ */
+__attribute__((always_inline)) INLINE static int cell_can_split_pair_task(
+    const struct cell *c) {
+
+  /* Is the cell split ? */
+  /* If so, is the cut-off radius with some leeway smaller than */
+  /* the sub-cell sizes ? */
+  /* Note that since tasks are created after a rebuild no need to take */
+  /* into account any part motion (i.e. dx_max == 0 here) */
+  return c->split && (space_stretch * kernel_gamma * c->h_max < 0.5f * c->dmin);
+}
+
+/**
+ * @brief Can a self task associated with a cell be split into smaller
+ * sub-tasks.
+ *
+ * @param c The #cell.
+ */
+__attribute__((always_inline)) INLINE static int cell_can_split_self_task(
+    const struct cell *c) {
+
+  /* Is the cell split ? */
+  /* Note: No need for more checks here as all the sub-pairs and sub-self */
+  /* tasks will be created. So no need to check for h_max */
+  return c->split && (space_stretch * kernel_gamma * c->h_max < 0.5f * c->dmin);
+}
+
+/**
+ * @brief Have particles in a pair of cells moved too much, requiring a
+ * rebuild?
+ *
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+__attribute__((always_inline)) INLINE static int cell_need_rebuild_for_pair(
+    const struct cell *ci, const struct cell *cj) {
+
+  /* Is the cut-off radius plus the max distance the parts in both cells have */
+  /* moved larger than the cell size ? */
+  /* Note ci->dmin == cj->dmin */
+  return (kernel_gamma * max(ci->h_max, cj->h_max) + ci->dx_max_part +
+              cj->dx_max_part >
+          cj->dmin);
+}
 
 #endif /* SWIFT_CELL_H */
diff --git a/src/collectgroup.c b/src/collectgroup.c
index 0b4ddc405772a45a1e444ef48b65fcb7d37a248f..b7e5486b59a2ec5e47b7b864071a2bb1e5ce1850 100644
--- a/src/collectgroup.c
+++ b/src/collectgroup.c
@@ -170,7 +170,7 @@ static void doreduce1(struct mpicollectgroup1 *mpigrp11,
 }
 
 /**
- * @brief MPI reduce operator for #mpicollectgroup structures.
+ * @brief MPI reduce operator for #mpicollectgroup1 structures.
  */
 static void mpicollectgroup1_reduce(void *in, void *inout, int *len,
                                     MPI_Datatype *datatype) {
diff --git a/src/const.h b/src/const.h
index 141eb48acc633542aa98655caa8debdd2dbce530..c8060a2be51468a791e192a65a74f1a4d9bc8e30 100644
--- a/src/const.h
+++ b/src/const.h
@@ -37,7 +37,7 @@
 #define const_max_u_change 0.1f
 
 /* Thermal energy per unit mass used as a constant for the isothermal EoS */
-#define const_isothermal_internal_energy 20.2615290634f
+#define const_isothermal_internal_energy 20.2678457288f
 
 /* Type of gradients to use (GIZMO_SPH only) */
 /* If no option is chosen, no gradients are used (first order scheme) */
@@ -49,6 +49,9 @@
 #define SLOPE_LIMITER_PER_FACE
 #define SLOPE_LIMITER_CELL_WIDE
 
+/* Types of flux limiter to use (GIZMO_SPH only) */
+#define GIZMO_FLUX_LIMITER
+
 /* Options to control the movement of particles for GIZMO_SPH. */
 /* This option disables particle movement */
 //#define GIZMO_FIX_PARTICLES
diff --git a/src/debug.c b/src/debug.c
index 601f63d6e11bbbf95f62eaef1ec6ec7ec06d3ad9..903d7e5a2e30bca8980078991c5155830f5e4c43 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -26,6 +26,7 @@
 /* Some standard headers. */
 #include <float.h>
 #include <stdio.h>
+#include <unistd.h>
 
 /* This object's header. */
 #include "debug.h"
@@ -450,3 +451,69 @@ void dumpCellRanks(const char *prefix, struct cell *cells_top, int nr_cells) {
 }
 
 #endif /* HAVE_MPI */
+
+/**
+ * @brief parse the process /proc/self/statm file to get the process
+ *        memory use (in KB). Top field in ().
+ *
+ * @param size     total virtual memory (VIRT)
+ * @param resident resident non-swapped memory (RES)
+ * @param share    shared (mmap'd) memory  (SHR)
+ * @param trs      text (exe) resident set (CODE)
+ * @param lrs      library resident set
+ * @param drs      data+stack resident set (DATA)
+ * @param dt       dirty pages (nDRT)
+ */
+void getProcMemUse(long *size, long *resident, long *share, long *trs,
+                   long *lrs, long *drs, long *dt) {
+
+  /* Open the file. */
+  FILE *file = fopen("/proc/self/statm", "r");
+  if (file != NULL) {
+    int nscan = fscanf(file, "%ld %ld %ld %ld %ld %ld %ld", size, resident,
+                       share, trs, lrs, drs, dt);
+
+    if (nscan == 7) {
+      /* Convert pages into bytes. Usually 4096, but could be 512 on some
+       * systems so take care in conversion to KB. */
+      long sz = sysconf(_SC_PAGESIZE);
+      *size *= sz;
+      *resident *= sz;
+      *share *= sz;
+      *trs *= sz;
+      *lrs *= sz;
+      *drs *= sz;
+      *dt *= sz;
+
+      *size /= 1024;
+      *resident /= 1024;
+      *share /= 1024;
+      *trs /= 1024;
+      *lrs /= 1024;
+      *drs /= 1024;
+      *dt /= 1024;
+    } else {
+      error("Failed to read sufficient fields from /proc/self/statm");
+    }
+    fclose(file);
+  } else {
+    error("Failed to open /proc/self/statm");
+  }
+}
+
+/**
+ * @brief Print the current memory use of the process. A la "top".
+ */
+void printProcMemUse() {
+  long size;
+  long resident;
+  long share;
+  long trs;
+  long lrs;
+  long drs;
+  long dt;
+  getProcMemUse(&size, &resident, &share, &trs, &lrs, &drs, &dt);
+  printf("## VIRT = %ld , RES = %ld , SHR = %ld , CODE = %ld, DATA = %ld\n",
+         size, resident, share, trs, drs);
+  fflush(stdout);
+}
diff --git a/src/debug.h b/src/debug.h
index 7422a6f7f9815490966f08415e0312876ce0123f..7dca848b6bf4e44de5f40fa8e1c0849e8ee3d0b4 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -44,4 +44,7 @@ void dumpMETISGraph(const char *prefix, idx_t nvtxs, idx_t ncon, idx_t *xadj,
 void dumpCellRanks(const char *prefix, struct cell *cells_top, int nr_cells);
 #endif
 
+void getProcMemUse(long *size, long *resident, long *share, long *trs,
+                   long *lrs, long *drs, long *dt);
+void printProcMemUse();
 #endif /* SWIFT_DEBUG_H */
diff --git a/src/dimension.h b/src/dimension.h
index 60c5208d846f9beebd7a1fd3e183fc771fbc5f91..0b2093d718a61c6ce850db1970412af3e2e462b9 100644
--- a/src/dimension.h
+++ b/src/dimension.h
@@ -118,6 +118,34 @@ __attribute__((always_inline)) INLINE static float pow_dimension_plus_one(
 #endif
 }
 
+/**
+ * @brief Returns the argument to the power given by the dimension minus one
+ *
+ * Computes \f$x^{d-1}\f$.
+ */
+__attribute__((always_inline)) INLINE static float pow_dimension_minus_one(
+    float x) {
+
+#if defined(HYDRO_DIMENSION_3D)
+
+  return x * x;
+
+#elif defined(HYDRO_DIMENSION_2D)
+
+  return x;
+
+#elif defined(HYDRO_DIMENSION_1D)
+
+  return 1.f;
+
+#else
+
+  error("The dimension is not defined !");
+  return 0.f;
+
+#endif
+}
+
 /**
  * @brief Inverts the given dimension by dimension matrix (in place)
  *
diff --git a/src/engine.c b/src/engine.c
index 417c9f626d7e2f8d96d49d8d2bed942102b96e4f..93481aca3d25fd9755b7c7f69ef25ddb4d9d9d06 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -57,6 +57,7 @@
 #include "error.h"
 #include "gravity.h"
 #include "hydro.h"
+#include "map.h"
 #include "minmax.h"
 #include "parallel_io.h"
 #include "part.h"
@@ -76,22 +77,23 @@
 /* Particle cache size. */
 #define CACHE_SIZE 512
 
-const char *engine_policy_names[16] = {"none",
-                                       "rand",
-                                       "steal",
-                                       "keep",
-                                       "block",
-                                       "cpu_tight",
-                                       "mpi",
-                                       "numa_affinity",
-                                       "hydro",
-                                       "self_gravity",
-                                       "external_gravity",
-                                       "cosmology_integration",
-                                       "drift_all",
-                                       "cooling",
-                                       "sourceterms",
-                                       "stars"};
+const char *engine_policy_names[] = {"none",
+                                     "rand",
+                                     "steal",
+                                     "keep",
+                                     "block",
+                                     "cpu_tight",
+                                     "mpi",
+                                     "numa_affinity",
+                                     "hydro",
+                                     "self_gravity",
+                                     "external_gravity",
+                                     "cosmology_integration",
+                                     "drift_all",
+                                     "reconstruct_mpoles",
+                                     "cooling",
+                                     "sourceterms",
+                                     "stars"};
 
 /** The rank of the engine as a global variable (for messages). */
 int engine_rank;
@@ -119,6 +121,24 @@ void engine_addlink(struct engine *e, struct link **l, struct task *t) {
   res->next = atomic_swap(l, res);
 }
 
+/**
+ * @brief Recursively add non-implicit ghost tasks to a cell hierarchy.
+ */
+void engine_add_ghosts(struct engine *e, struct cell *c, struct task *ghost_in,
+                       struct task *ghost_out) {
+  if (!c->split || c->count < engine_max_parts_per_ghost) {
+    struct scheduler *s = &e->sched;
+    c->ghost =
+        scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, 0, c, NULL);
+    scheduler_addunlock(s, ghost_in, c->ghost);
+    scheduler_addunlock(s, c->ghost, ghost_out);
+  } else {
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL)
+        engine_add_ghosts(e, c->progeny[k], ghost_in, ghost_out);
+  }
+}
+
 /**
  * @brief Generate the hydro hierarchical tasks for a hierarchy of cells -
  * i.e. all the O(Npart) tasks.
@@ -134,7 +154,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
 
   struct scheduler *s = &e->sched;
   const int periodic = e->s->periodic;
-  const int is_hydro = (e->policy & engine_policy_hydro);
+  const int is_with_hydro = (e->policy & engine_policy_hydro);
   const int is_self_gravity = (e->policy & engine_policy_self_gravity);
   const int is_with_cooling = (e->policy & engine_policy_cooling);
   const int is_with_sourceterms = (e->policy & engine_policy_sourceterms);
@@ -142,9 +162,21 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
   /* Are we in a super-cell ? */
   if (c->super == c) {
 
+    /* Add the sort task. */
+    if (is_with_hydro) {
+      c->sorts = scheduler_addtask(s, task_type_sort, task_subtype_none, 0, 0,
+                                   c, NULL);
+    }
+
     /* Local tasks only... */
     if (c->nodeID == e->nodeID) {
 
+      /* Add the drift task. */
+      if (is_with_hydro) {
+        c->drift_part = scheduler_addtask(s, task_type_drift_part,
+                                          task_subtype_none, 0, 0, c, NULL);
+      }
+
       /* Add the two half kicks */
       c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0, 0,
                                    c, NULL);
@@ -179,17 +211,22 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
         scheduler_addunlock(s, c->grav_down, c->kick2);
       }
 
-      /* Generate the ghost task. */
-      if (is_hydro)
-        c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
-                                     0, c, NULL);
+      /* Generate the ghost tasks. */
+      if (is_with_hydro) {
+        c->ghost_in =
+            scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
+                              /* implicit = */ 1, c, NULL);
+        c->ghost_out =
+            scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
+                              /* implicit = */ 1, c, NULL);
+        engine_add_ghosts(e, c, c->ghost_in, c->ghost_out);
 
 #ifdef EXTRA_HYDRO_LOOP
-      /* Generate the extra ghost task. */
-      if (is_hydro)
+        /* Generate the extra ghost task. */
         c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost,
                                            task_subtype_none, 0, 0, c, NULL);
 #endif
+      }
 
       /* Cooling task */
       if (is_with_cooling) {
@@ -220,6 +257,145 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
   }
 }
 
+void engine_make_hierarchical_tasks_mapper(void *map_data, int num_elements,
+                                           void *extra_data) {
+  struct engine *e = (struct engine *)extra_data;
+
+  for (int ind = 0; ind < num_elements; ind++) {
+    struct cell *c = &((struct cell *)map_data)[ind];
+    engine_make_hierarchical_tasks(e, c);
+  }
+}
+
+#ifdef WITH_MPI
+/**
+ * Do the exchange of one type of particles with all the other nodes.
+ *
+ * @param counts 2D array with the counts of particles to exchange with
+ *               each other node.
+ * @param parts the particle data to exchange
+ * @param new_nr_parts the number of particles this node will have after all
+ *                     exchanges have completed.
+ * @param sizeofparts sizeof the particle struct.
+ * @param alignsize the memory alignment required for this particle type.
+ * @param mpi_type the MPI_Datatype for these particles.
+ * @param nr_nodes the number of nodes to exchange with.
+ * @param nodeID the id of this node.
+ *
+ * @result new particle data constructed from all the exchanges with the
+ *         given alignment.
+ */
+static void *engine_do_redistribute(int *counts, char *parts,
+                                    size_t new_nr_parts, size_t sizeofparts,
+                                    size_t alignsize, MPI_Datatype mpi_type,
+                                    int nr_nodes, int nodeID) {
+
+  /* Allocate a new particle array with some extra margin */
+  char *parts_new = NULL;
+  if (posix_memalign(
+          (void **)&parts_new, alignsize,
+          sizeofparts * new_nr_parts * engine_redistribute_alloc_margin) != 0)
+    error("Failed to allocate new particle data.");
+
+  /* Prepare MPI requests for the asynchronous communications */
+  MPI_Request *reqs;
+  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 2 * nr_nodes)) ==
+      NULL)
+    error("Failed to allocate MPI request list.");
+
+  /* Send and receive only "chunk" particles per request. So we need to
+   * loop as many times as necessary here. Chunk is INT_MAX/sizeofparts,
+   * so each message stays below 2GB. */
+  const int chunk = INT_MAX / sizeofparts;
+  int sent = 0;
+  int recvd = 0;
+
+  int activenodes = 1;
+  while (activenodes) {
+
+    for (int k = 0; k < 2 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;
+
+    /* Emit the sends and recvs for the data. */
+    size_t offset_send = sent;
+    size_t offset_recv = recvd;
+    activenodes = 0;
+
+    for (int k = 0; k < nr_nodes; k++) {
+
+      /* Indices in the count arrays of the node of interest */
+      const int ind_send = nodeID * nr_nodes + k;
+      const int ind_recv = k * nr_nodes + nodeID;
+
+      /* Are we sending any data this loop? */
+      int sending = counts[ind_send] - sent;
+      if (sending > 0) {
+        activenodes++;
+        if (sending > chunk) sending = chunk;
+
+        /* If the send and receive is local then just copy. */
+        if (k == nodeID) {
+          int receiving = counts[ind_recv] - recvd;
+          if (receiving > chunk) receiving = chunk;
+          memcpy(&parts_new[offset_recv * sizeofparts],
+                 &parts[offset_send * sizeofparts], sizeofparts * receiving);
+        } else {
+          /* Otherwise send it. */
+          int res =
+              MPI_Isend(&parts[offset_send * sizeofparts], sending, mpi_type, k,
+                        ind_send, MPI_COMM_WORLD, &reqs[2 * k + 0]);
+          if (res != MPI_SUCCESS)
+            mpi_error(res, "Failed to isend parts to node %i.", k);
+        }
+      }
+
+      /* If we're sending to this node, then move past it to next. */
+      if (counts[ind_send] > 0) offset_send += counts[ind_send];
+
+      /* Are we receiving any data from this node? Note already done if coming
+       * from this node. */
+      if (k != nodeID) {
+        int receiving = counts[ind_recv] - recvd;
+        if (receiving > 0) {
+          activenodes++;
+          if (receiving > chunk) receiving = chunk;
+          int res = MPI_Irecv(&parts_new[offset_recv * sizeofparts], receiving,
+                              mpi_type, k, ind_recv, MPI_COMM_WORLD,
+                              &reqs[2 * k + 1]);
+          if (res != MPI_SUCCESS)
+            mpi_error(res, "Failed to emit irecv of parts from node %i.", k);
+        }
+      }
+
+      /* If we're receiving from this node, then move past it to next. */
+      if (counts[ind_recv] > 0) offset_recv += counts[ind_recv];
+    }
+
+    /* Wait for all the sends and recvs to tumble in. */
+    MPI_Status stats[2 * nr_nodes];
+    int res;
+    if ((res = MPI_Waitall(2 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
+      for (int k = 0; k < 2 * nr_nodes; k++) {
+        char buff[MPI_MAX_ERROR_STRING];
+        MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
+        message("request from source %i, tag %i has error '%s'.",
+                stats[k].MPI_SOURCE, stats[k].MPI_TAG, buff);
+      }
+      error("Failed during waitall for part data.");
+    }
+
+    /* Move to next chunks. */
+    sent += chunk;
+    recvd += chunk;
+  }
+
+  /* Free temps. */
+  free(reqs);
+
+  /* And return new memory. */
+  return parts_new;
+}
+#endif
+
 /**
  * @brief Redistribute the particles amongst the nodes according
  *      to their cell's node IDs.
@@ -249,32 +425,20 @@ void engine_redistribute(struct engine *e) {
   const double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]};
   const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
   struct part *parts = s->parts;
-  struct xpart *xparts = s->xparts;
   struct gpart *gparts = s->gparts;
   struct spart *sparts = s->sparts;
   ticks tic = getticks();
 
   /* Allocate temporary arrays to store the counts of particles to be sent
-     and the destination of each particle */
-  int *counts, *g_counts, *s_counts;
+   * and the destination of each particle */
+  int *counts;
   if ((counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
     error("Failed to allocate counts temporary buffer.");
-  if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
-    error("Failed to allocate g_gcount temporary buffer.");
-  if ((s_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
-    error("Failed to allocate s_counts temporary buffer.");
   bzero(counts, sizeof(int) * nr_nodes * nr_nodes);
-  bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);
-  bzero(s_counts, sizeof(int) * nr_nodes * nr_nodes);
 
-  /* Allocate the destination index arrays. */
-  int *dest, *g_dest, *s_dest;
+  int *dest;
   if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
     error("Failed to allocate dest temporary buffer.");
-  if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
-    error("Failed to allocate g_dest temporary buffer.");
-  if ((s_dest = (int *)malloc(sizeof(int) * s->nr_sparts)) == NULL)
-    error("Failed to allocate s_dest temporary buffer.");
 
   /* Get destination of each particle */
   for (size_t k = 0; k < s->nr_parts; k++) {
@@ -356,8 +520,18 @@ void engine_redistribute(struct engine *e) {
       }
     }
   }
+  free(dest);
 
   /* Get destination of each s-particle */
+  int *s_counts;
+  if ((s_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
+    error("Failed to allocate s_counts temporary buffer.");
+  bzero(s_counts, sizeof(int) * nr_nodes * nr_nodes);
+
+  int *s_dest;
+  if ((s_dest = (int *)malloc(sizeof(int) * s->nr_sparts)) == NULL)
+    error("Failed to allocate s_dest temporary buffer.");
+
   for (size_t k = 0; k < s->nr_sparts; k++) {
 
     /* Periodic boundary conditions */
@@ -372,7 +546,7 @@ void engine_redistribute(struct engine *e) {
                    sparts[k].x[2] * iwidth[2]);
 #ifdef SWIFT_DEBUG_CHECKS
     if (cid < 0 || cid >= s->nr_cells)
-      error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k,
+      error("Bad cell id %i for spart %zu at [%.3e,%.3e,%.3e].", cid, k,
             sparts[k].x[0], sparts[k].x[1], sparts[k].x[2]);
 #endif
 
@@ -438,7 +612,18 @@ void engine_redistribute(struct engine *e) {
     }
   }
 
+  free(s_dest);
+
   /* Get destination of each g-particle */
+  int *g_counts;
+  if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
+    error("Failed to allocate g_gcount temporary buffer.");
+  bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);
+
+  int *g_dest;
+  if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
+    error("Failed to allocate g_dest temporary buffer.");
+
   for (size_t k = 0; k < s->nr_gparts; k++) {
 
     /* Periodic boundary conditions */
@@ -453,7 +638,7 @@ void engine_redistribute(struct engine *e) {
                    gparts[k].x[2] * iwidth[2]);
 #ifdef SWIFT_DEBUG_CHECKS
     if (cid < 0 || cid >= s->nr_cells)
-      error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k,
+      error("Bad cell id %i for gpart %zu at [%.3e,%.3e,%.3e].", cid, k,
             gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]);
 #endif
 
@@ -482,7 +667,8 @@ void engine_redistribute(struct engine *e) {
     const int new_node = c->nodeID;
 
     if (g_dest[k] != new_node)
-      error("gpart's new node index not matching sorted index.");
+      error("gpart's new node index not matching sorted index (%d != %d).",
+            g_dest[k], new_node);
 
     if (gp->x[0] < c->loc[0] || gp->x[0] > c->loc[0] + c->width[0] ||
         gp->x[1] < c->loc[1] || gp->x[1] > c->loc[1] + c->width[1] ||
@@ -491,6 +677,8 @@ void engine_redistribute(struct engine *e) {
   }
 #endif
 
+  free(g_dest);
+
   /* Get all the counts from all the nodes. */
   if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
                     MPI_COMM_WORLD) != MPI_SUCCESS)
@@ -538,10 +726,9 @@ void engine_redistribute(struct engine *e) {
     }
   }
 
-  /* Each node knows how many parts, sparts and gparts will be transferred
-     to every other node. We can start preparing to receive data */
-
-  /* Get the new number of parts and gparts for this node */
+  /* Now each node knows how many parts, sparts and gparts will be transferred
+   * to every other node.
+   * Get the new numbers of particles for this node. */
   size_t nr_parts = 0, nr_gparts = 0, nr_sparts = 0;
   for (int k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID];
   for (int k = 0; k < nr_nodes; k++)
@@ -549,162 +736,42 @@ void engine_redistribute(struct engine *e) {
   for (int k = 0; k < nr_nodes; k++)
     nr_sparts += s_counts[k * nr_nodes + nodeID];
 
-  /* Allocate the new arrays with some extra margin */
-  struct part *parts_new = NULL;
-  struct xpart *xparts_new = NULL;
-  struct gpart *gparts_new = NULL;
-  struct spart *sparts_new = NULL;
-  if (posix_memalign((void **)&parts_new, part_align,
-                     sizeof(struct part) * nr_parts *
-                         engine_redistribute_alloc_margin) != 0)
-    error("Failed to allocate new part data.");
-  if (posix_memalign((void **)&xparts_new, xpart_align,
-                     sizeof(struct xpart) * nr_parts *
-                         engine_redistribute_alloc_margin) != 0)
-    error("Failed to allocate new xpart data.");
-  if (posix_memalign((void **)&gparts_new, gpart_align,
-                     sizeof(struct gpart) * nr_gparts *
-                         engine_redistribute_alloc_margin) != 0)
-    error("Failed to allocate new gpart data.");
-  if (posix_memalign((void **)&sparts_new, spart_align,
-                     sizeof(struct spart) * nr_sparts *
-                         engine_redistribute_alloc_margin) != 0)
-    error("Failed to allocate new spart data.");
-
-  /* Prepare MPI requests for the asynchronous communications */
-  MPI_Request *reqs;
-  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 8 * nr_nodes)) ==
-      NULL)
-    error("Failed to allocate MPI request list.");
-  for (int k = 0; k < 8 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;
-
-  /* Emit the sends and recvs for the particle and gparticle data. */
-  size_t offset_send = 0, offset_recv = 0;
-  size_t g_offset_send = 0, g_offset_recv = 0;
-  size_t s_offset_send = 0, s_offset_recv = 0;
-  for (int k = 0; k < nr_nodes; k++) {
-
-    /* Indices in the count arrays of the node of interest */
-    const int ind_send = nodeID * nr_nodes + k;
-    const int ind_recv = k * nr_nodes + nodeID;
-
-    /* Are we sending any part/xpart ? */
-    if (counts[ind_send] > 0) {
-
-      /* message("Sending %d part to node %d", counts[ind_send], k); */
-
-      /* If the send is to the same node, just copy */
-      if (k == nodeID) {
-        memcpy(&parts_new[offset_recv], &s->parts[offset_send],
-               sizeof(struct part) * counts[ind_recv]);
-        memcpy(&xparts_new[offset_recv], &s->xparts[offset_send],
-               sizeof(struct xpart) * counts[ind_recv]);
-        offset_send += counts[ind_send];
-        offset_recv += counts[ind_recv];
-
-        /* Else, emit some communications */
-      } else {
-        if (MPI_Isend(&s->parts[offset_send], counts[ind_send], part_mpi_type,
-                      k, 4 * ind_send + 0, MPI_COMM_WORLD,
-                      &reqs[8 * k + 0]) != MPI_SUCCESS)
-          error("Failed to isend parts to node %i.", k);
-        if (MPI_Isend(&s->xparts[offset_send], counts[ind_send], xpart_mpi_type,
-                      k, 4 * ind_send + 1, MPI_COMM_WORLD,
-                      &reqs[8 * k + 1]) != MPI_SUCCESS)
-          error("Failed to isend xparts to node %i.", k);
-        offset_send += counts[ind_send];
-      }
-    }
-
-    /* Are we sending any gpart ? */
-    if (g_counts[ind_send] > 0) {
-
-      /* message("Sending %d gpart to node %d", g_counts[ind_send], k); */
-
-      /* If the send is to the same node, just copy */
-      if (k == nodeID) {
-        memcpy(&gparts_new[g_offset_recv], &s->gparts[g_offset_send],
-               sizeof(struct gpart) * g_counts[ind_recv]);
-        g_offset_send += g_counts[ind_send];
-        g_offset_recv += g_counts[ind_recv];
-
-        /* Else, emit some communications */
-      } else {
-        if (MPI_Isend(&s->gparts[g_offset_send], g_counts[ind_send],
-                      gpart_mpi_type, k, 4 * ind_send + 2, MPI_COMM_WORLD,
-                      &reqs[8 * k + 2]) != MPI_SUCCESS)
-          error("Failed to isend gparts to node %i.", k);
-        g_offset_send += g_counts[ind_send];
-      }
-    }
-
-    /* Are we sending any spart ? */
-    if (s_counts[ind_send] > 0) {
-
-      /* message("Sending %d spart to node %d", s_counts[ind_send], k); */
+  /* Now exchange the particles, type by type to keep the memory required
+   * under control. */
 
-      /* If the send is to the same node, just copy */
-      if (k == nodeID) {
-        memcpy(&sparts_new[s_offset_recv], &s->sparts[s_offset_send],
-               sizeof(struct spart) * s_counts[ind_recv]);
-        s_offset_send += s_counts[ind_send];
-        s_offset_recv += s_counts[ind_recv];
-
-        /* Else, emit some communications */
-      } else {
-        if (MPI_Isend(&s->sparts[s_offset_send], s_counts[ind_send],
-                      spart_mpi_type, k, 4 * ind_send + 3, MPI_COMM_WORLD,
-                      &reqs[8 * k + 3]) != MPI_SUCCESS)
-          error("Failed to isend gparts to node %i.", k);
-        s_offset_send += s_counts[ind_send];
-      }
-    }
-
-    /* Now emit the corresponding Irecv() */
-
-    /* Are we receiving any part/xpart from this node ? */
-    if (k != nodeID && counts[ind_recv] > 0) {
-      if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], part_mpi_type, k,
-                    4 * ind_recv + 0, MPI_COMM_WORLD,
-                    &reqs[8 * k + 4]) != MPI_SUCCESS)
-        error("Failed to emit irecv of parts from node %i.", k);
-      if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv], xpart_mpi_type,
-                    k, 4 * ind_recv + 1, MPI_COMM_WORLD,
-                    &reqs[8 * k + 5]) != MPI_SUCCESS)
-        error("Failed to emit irecv of xparts from node %i.", k);
-      offset_recv += counts[ind_recv];
-    }
+  /* SPH particles. */
+  void *new_parts = engine_do_redistribute(counts, (char *)s->parts, nr_parts,
+                                           sizeof(struct part), part_align,
+                                           part_mpi_type, nr_nodes, nodeID);
+  free(s->parts);
+  s->parts = (struct part *)new_parts;
+  s->nr_parts = nr_parts;
+  s->size_parts = engine_redistribute_alloc_margin * nr_parts;
 
-    /* Are we receiving any gpart from this node ? */
-    if (k != nodeID && g_counts[ind_recv] > 0) {
-      if (MPI_Irecv(&gparts_new[g_offset_recv], g_counts[ind_recv],
-                    gpart_mpi_type, k, 4 * ind_recv + 2, MPI_COMM_WORLD,
-                    &reqs[8 * k + 6]) != MPI_SUCCESS)
-        error("Failed to emit irecv of gparts from node %i.", k);
-      g_offset_recv += g_counts[ind_recv];
-    }
+  /* Extra SPH particle properties. */
+  new_parts = engine_do_redistribute(counts, (char *)s->xparts, nr_parts,
+                                     sizeof(struct xpart), xpart_align,
+                                     xpart_mpi_type, nr_nodes, nodeID);
+  free(s->xparts);
+  s->xparts = (struct xpart *)new_parts;
 
-    /* Are we receiving any spart from this node ? */
-    if (k != nodeID && s_counts[ind_recv] > 0) {
-      if (MPI_Irecv(&sparts_new[s_offset_recv], s_counts[ind_recv],
-                    spart_mpi_type, k, 4 * ind_recv + 3, MPI_COMM_WORLD,
-                    &reqs[8 * k + 7]) != MPI_SUCCESS)
-        error("Failed to emit irecv of sparts from node %i.", k);
-      s_offset_recv += s_counts[ind_recv];
-    }
-  }
+  /* Gravity particles. */
+  new_parts = engine_do_redistribute(g_counts, (char *)s->gparts, nr_gparts,
+                                     sizeof(struct gpart), gpart_align,
+                                     gpart_mpi_type, nr_nodes, nodeID);
+  free(s->gparts);
+  s->gparts = (struct gpart *)new_parts;
+  s->nr_gparts = nr_gparts;
+  s->size_gparts = engine_redistribute_alloc_margin * nr_gparts;
 
-  /* Wait for all the sends and recvs to tumble in. */
-  MPI_Status stats[8 * nr_nodes];
-  int res;
-  if ((res = MPI_Waitall(8 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
-    for (int k = 0; k < 8 * nr_nodes; k++) {
-      char buff[MPI_MAX_ERROR_STRING];
-      MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
-      message("request %i has error '%s'.", k, buff);
-    }
-    error("Failed during waitall for part data.");
-  }
+  /* Star particles. */
+  new_parts = engine_do_redistribute(s_counts, (char *)s->sparts, nr_sparts,
+                                     sizeof(struct spart), spart_align,
+                                     spart_mpi_type, nr_nodes, nodeID);
+  free(s->sparts);
+  s->sparts = (struct spart *)new_parts;
+  s->nr_sparts = nr_sparts;
+  s->size_sparts = engine_redistribute_alloc_margin * nr_sparts;
 
   /* All particles have now arrived. Time for some final operations on the
      stuff we just received */
@@ -722,25 +789,25 @@ void engine_redistribute(struct engine *e) {
     for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) {
 
       /* Does this gpart have a gas partner ? */
-      if (gparts_new[k].type == swift_type_gas) {
+      if (s->gparts[k].type == swift_type_gas) {
 
         const ptrdiff_t partner_index =
-            offset_parts - gparts_new[k].id_or_neg_offset;
+            offset_parts - s->gparts[k].id_or_neg_offset;
 
         /* Re-link */
-        gparts_new[k].id_or_neg_offset = -partner_index;
-        parts_new[partner_index].gpart = &gparts_new[k];
+        s->gparts[k].id_or_neg_offset = -partner_index;
+        s->parts[partner_index].gpart = &s->gparts[k];
       }
 
       /* Does this gpart have a star partner ? */
-      if (gparts_new[k].type == swift_type_star) {
+      if (s->gparts[k].type == swift_type_star) {
 
         const ptrdiff_t partner_index =
-            offset_sparts - gparts_new[k].id_or_neg_offset;
+            offset_sparts - s->gparts[k].id_or_neg_offset;
 
         /* Re-link */
-        gparts_new[k].id_or_neg_offset = -partner_index;
-        sparts_new[partner_index].gpart = &gparts_new[k];
+        s->gparts[k].id_or_neg_offset = -partner_index;
+        s->sparts[partner_index].gpart = &s->gparts[k];
       }
     }
 
@@ -749,59 +816,43 @@ void engine_redistribute(struct engine *e) {
     offset_sparts += count_sparts;
   }
 
+  /* Clean up the counts now we are done. */
+  free(counts);
+  free(g_counts);
+  free(s_counts);
+
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify that all parts are in the right place. */
   for (size_t k = 0; k < nr_parts; k++) {
-    const int cid = cell_getid(cdim, parts_new[k].x[0] * iwidth[0],
-                               parts_new[k].x[1] * iwidth[1],
-                               parts_new[k].x[2] * iwidth[2]);
+    const int cid =
+        cell_getid(cdim, s->parts[k].x[0] * iwidth[0],
+                   s->parts[k].x[1] * iwidth[1], s->parts[k].x[2] * iwidth[2]);
     if (cells[cid].nodeID != nodeID)
       error("Received particle (%zu) that does not belong here (nodeID=%i).", k,
             cells[cid].nodeID);
   }
   for (size_t k = 0; k < nr_gparts; k++) {
-    const int cid = cell_getid(cdim, gparts_new[k].x[0] * iwidth[0],
-                               gparts_new[k].x[1] * iwidth[1],
-                               gparts_new[k].x[2] * iwidth[2]);
+    const int cid = cell_getid(cdim, s->gparts[k].x[0] * iwidth[0],
+                               s->gparts[k].x[1] * iwidth[1],
+                               s->gparts[k].x[2] * iwidth[2]);
     if (cells[cid].nodeID != nodeID)
       error("Received g-particle (%zu) that does not belong here (nodeID=%i).",
             k, cells[cid].nodeID);
   }
   for (size_t k = 0; k < nr_sparts; k++) {
-    const int cid = cell_getid(cdim, sparts_new[k].x[0] * iwidth[0],
-                               sparts_new[k].x[1] * iwidth[1],
-                               sparts_new[k].x[2] * iwidth[2]);
+    const int cid = cell_getid(cdim, s->sparts[k].x[0] * iwidth[0],
+                               s->sparts[k].x[1] * iwidth[1],
+                               s->sparts[k].x[2] * iwidth[2]);
     if (cells[cid].nodeID != nodeID)
       error("Received s-particle (%zu) that does not belong here (nodeID=%i).",
             k, cells[cid].nodeID);
   }
 
   /* Verify that the links are correct */
-  part_verify_links(parts_new, gparts_new, sparts_new, nr_parts, nr_gparts,
+  part_verify_links(s->parts, s->gparts, s->sparts, nr_parts, nr_gparts,
                     nr_sparts, e->verbose);
 #endif
 
-  /* Set the new part data, free the old. */
-  free(parts);
-  free(xparts);
-  free(gparts);
-  free(sparts);
-  s->parts = parts_new;
-  s->xparts = xparts_new;
-  s->gparts = gparts_new;
-  s->sparts = sparts_new;
-  s->nr_parts = nr_parts;
-  s->nr_gparts = nr_gparts;
-  s->nr_sparts = nr_sparts;
-  s->size_parts = engine_redistribute_alloc_margin * nr_parts;
-  s->size_gparts = engine_redistribute_alloc_margin * nr_gparts;
-  s->size_sparts = engine_redistribute_alloc_margin * nr_sparts;
-
-  /* Clean up the temporary stuff. */
-  free(reqs);
-  free(counts);
-  free(dest);
-
   /* Be verbose about what just happened. */
   if (e->verbose) {
     int my_cells = 0;
@@ -851,6 +902,16 @@ void engine_repartition(struct engine *e) {
   partition_repartition(e->reparttype, e->nodeID, e->nr_nodes, e->s,
                         e->sched.tasks, e->sched.nr_tasks);
 
+  /* Partitioning requires copies of the particles, so we need to reduce the
+   * memory in use to the minimum, we can free the sorting indices and the
+   * tasks as these will be regenerated at the next rebuild. */
+
+  /* Sorting indices. */
+  if (e->s->cells_top != NULL) space_free_cells(e->s);
+
+  /* Task arrays. */
+  scheduler_free_tasks(&e->sched);
+
   /* Now comes the tricky part: Exchange particles between all nodes.
      This is done in two steps, first allreducing a matrix of
      how many particles go from where to where, then re-allocating
@@ -870,7 +931,11 @@ void engine_repartition(struct engine *e) {
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
 #else
-  error("SWIFT was not compiled with MPI and METIS support.");
+  if (e->reparttype->type != REPART_NONE)
+    error("SWIFT was not compiled with MPI and METIS support.");
+
+  /* Clear the repartition flag. */
+  e->forcerepart = 0;
 #endif
 }
 
@@ -885,14 +950,15 @@ void engine_repartition_trigger(struct engine *e) {
 
   /* Do nothing if there have not been enough steps since the last
    * repartition, don't want to repeat this too often or immediately after
-   * a repartition step. */
-  if (e->step - e->last_repartition > 2) {
+   * a repartition step. Also nothing to do when repartitioning is disabled. */
+  if (e->step - e->last_repartition >= 2 &&
+      e->reparttype->type != REPART_NONE) {
 
     /* Old style if trigger is >1 or this is the second step (want an early
      * repartition following the initial repartition). */
     if (e->reparttype->trigger > 1 || e->step == 2) {
       if (e->reparttype->trigger > 1) {
-        if (e->step % (int)e->reparttype->trigger == 2) e->forcerepart = 1;
+        if ((e->step % (int)e->reparttype->trigger) == 0) e->forcerepart = 1;
       } else {
         e->forcerepart = 1;
       }
@@ -947,8 +1013,9 @@ void engine_repartition_trigger(struct engine *e) {
     if (e->forcerepart) e->last_repartition = e->step;
   }
 
-  /* We always reset CPU time for next check. */
-  e->cputime_last_step = clocks_get_cputime_used();
+  /* We always reset CPU time for next check, unless it will not be used. */
+  if (e->reparttype->type != REPART_NONE)
+    e->cputime_last_step = clocks_get_cputime_used();
 #endif
 }
 
@@ -1027,28 +1094,25 @@ void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj,
       scheduler_addunlock(s, t_rho, ci->super->extra_ghost);
 
       /* The send_rho task depends on the cell's ghost task. */
-      scheduler_addunlock(s, ci->super->ghost, t_rho);
+      scheduler_addunlock(s, ci->super->ghost_out, t_rho);
 
       /* The send_xv task should unlock the super-cell's ghost task. */
-      scheduler_addunlock(s, t_xv, ci->super->ghost);
+      scheduler_addunlock(s, t_xv, ci->super->ghost_in);
 
 #else
       /* The send_rho task should unlock the super-cell's kick task. */
       scheduler_addunlock(s, t_rho, ci->super->kick2);
 
       /* The send_rho task depends on the cell's ghost task. */
-      scheduler_addunlock(s, ci->super->ghost, t_rho);
+      scheduler_addunlock(s, ci->super->ghost_out, t_rho);
 
       /* The send_xv task should unlock the super-cell's ghost task. */
-      scheduler_addunlock(s, t_xv, ci->super->ghost);
+      scheduler_addunlock(s, t_xv, ci->super->ghost_in);
 
 #endif
 
       /* Drift before you send */
-      if (ci->drift_part == NULL)
-        ci->drift_part = scheduler_addtask(s, task_type_drift_part,
-                                           task_subtype_none, 0, 0, ci, NULL);
-      scheduler_addunlock(s, ci->drift_part, t_xv);
+      scheduler_addunlock(s, ci->super->drift_part, t_xv);
 
       /* The super-cell's timestep task should unlock the send_ti task. */
       scheduler_addunlock(s, ci->super->timestep, t_ti);
@@ -1634,25 +1698,130 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
 
 /**
  * @brief Constructs the top-level tasks for the short-range gravity
- * interactions.
+ * and long-range gravity interactions.
  *
+ * - One FFT task per MPI rank.
+ * - Multiple gravity ghosts for dependencies.
  * - All top-cells get a self task.
  * - All pairs within range according to the multipole acceptance
  *   criterion get a pair task.
- *
- * @param e The #engine.
  */
-void engine_make_self_gravity_tasks(struct engine *e) {
+void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
+                                           void *extra_data) {
+
+  struct engine *e = ((struct engine **)extra_data)[0];
+  struct task **ghosts = ((struct task ***)extra_data)[1];
 
   struct space *s = e->s;
   struct scheduler *sched = &e->sched;
   const int nodeID = e->nodeID;
   const int periodic = s->periodic;
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
   const int cdim[3] = {s->cdim[0], s->cdim[1], s->cdim[2]};
   const int cdim_ghost[3] = {s->cdim[0] / 4 + 1, s->cdim[1] / 4 + 1,
                              s->cdim[2] / 4 + 1};
   const double theta_crit_inv = e->gravity_properties->theta_crit_inv;
   struct cell *cells = s->cells_top;
+  const int n_ghosts = cdim_ghost[0] * cdim_ghost[1] * cdim_ghost[2] * 2;
+
+  /* Loop through the elements, which are just byte offsets from NULL. */
+  for (int ind = 0; ind < num_elements; ind++) {
+
+    /* Get the cell index. */
+    const int cid = (size_t)(map_data) + ind;
+    const int i = cid / (cdim[1] * cdim[2]);
+    const int j = (cid / cdim[2]) % cdim[1];
+    const int k = cid % cdim[2];
+
+    /* Get the cell */
+    struct cell *ci = &cells[cid];
+
+    /* Skip cells without gravity particles */
+    if (ci->gcount == 0) continue;
+
+    /* Is that cell local ? */
+    if (ci->nodeID != nodeID) continue;
+
+    /* If the cell is local, build a self-interaction */
+    scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL);
+
+    /* Deal with periodicity FFT task dependencies */
+    const int ghost_id = cell_getid(cdim_ghost, i / 4, j / 4, k / 4);
+    if (ghost_id > n_ghosts) error("Invalid ghost_id");
+    if (periodic) {
+      ci->grav_ghost[0] = ghosts[2 * ghost_id + 0];
+      ci->grav_ghost[1] = ghosts[2 * ghost_id + 1];
+    }
+
+    /* Recover the multipole information */
+    struct gravity_tensors *const multi_i = ci->multipole;
+    const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]};
+
+    /* Loop over every other cell */
+    for (int ii = 0; ii < cdim[0]; ii++) {
+      for (int jj = 0; jj < cdim[1]; jj++) {
+        for (int kk = 0; kk < cdim[2]; kk++) {
+
+          /* Get the cell */
+          const int cjd = cell_getid(cdim, ii, jj, kk);
+          struct cell *cj = &cells[cjd];
+
+          /* Avoid duplicates */
+          if (cid <= cjd) continue;
+
+          /* Skip cells without gravity particles */
+          if (cj->gcount == 0) continue;
+
+          /* Is that neighbour local ? */
+          if (cj->nodeID != nodeID) continue;  // MATTHIEU
+
+          /* Recover the multipole information */
+          struct gravity_tensors *const multi_j = cj->multipole;
+
+          /* Get the distance between the CoMs */
+          double dx = CoM_i[0] - multi_j->CoM[0];
+          double dy = CoM_i[1] - multi_j->CoM[1];
+          double dz = CoM_i[2] - multi_j->CoM[2];
+
+          /* Apply BC */
+          if (periodic) {
+            dx = nearest(dx, dim[0]);
+            dy = nearest(dy, dim[1]);
+            dz = nearest(dz, dim[2]);
+          }
+          const double r2 = dx * dx + dy * dy + dz * dz;
+
+          /* Are the cells too close for a MM interaction ? */
+          if (!gravity_multipole_accept_rebuild(multi_i, multi_j,
+                                                theta_crit_inv, r2)) {
+
+            /* Ok, we need to add a direct pair calculation */
+            scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0, 0,
+                              ci, cj);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * @brief Constructs the top-level tasks for the short-range gravity
+ * interactions.
+ *
+ * - All top-cells get a self task.
+ * - All pairs within range according to the multipole acceptance
+ *   criterion get a pair task.
+ *
+ * @param e The #engine.
+ */
+void engine_make_self_gravity_tasks(struct engine *e) {
+
+  struct space *s = e->s;
+  struct scheduler *sched = &e->sched;
+  const int periodic = s->periodic;
+  const int cdim_ghost[3] = {s->cdim[0] / 4 + 1, s->cdim[1] / 4 + 1,
+                             s->cdim[2] / 4 + 1};
   struct task **ghosts = NULL;
   const int n_ghosts = cdim_ghost[0] * cdim_ghost[1] * cdim_ghost[2] * 2;
 
@@ -1680,67 +1849,20 @@ void engine_make_self_gravity_tasks(struct engine *e) {
     }
   }
 
-  /* Run through the higher level cells */
-  for (int i = 0; i < cdim[0]; i++) {
-    for (int j = 0; j < cdim[1]; j++) {
-      for (int k = 0; k < cdim[2]; k++) {
-
-        /* Get the cell */
-        const int cid = cell_getid(cdim, i, j, k);
-        struct cell *ci = &cells[cid];
-
-        /* Skip cells without gravity particles */
-        if (ci->gcount == 0) continue;
-
-        /* Is that cell local ? */
-        if (ci->nodeID != nodeID) continue;
-
-        /* If the cells is local build a self-interaction */
-        scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci,
-                          NULL);
-
-        /* Deal with periodicity dependencies */
-        const int ghost_id = cell_getid(cdim_ghost, i / 4, j / 4, k / 4);
-        if (ghost_id > n_ghosts) error("Invalid ghost_id");
-        if (periodic) {
-          ci->grav_ghost[0] = ghosts[2 * ghost_id + 0];
-          ci->grav_ghost[1] = ghosts[2 * ghost_id + 1];
-        }
-
-        /* Loop over every other cell */
-        for (int ii = 0; ii < cdim[0]; ii++) {
-          for (int jj = 0; jj < cdim[1]; jj++) {
-            for (int kk = 0; kk < cdim[2]; kk++) {
-
-              /* Get the cell */
-              const int cjd = cell_getid(cdim, ii, jj, kk);
-              struct cell *cj = &cells[cjd];
-
-              /* Avoid duplicates */
-              if (cid <= cjd) continue;
-
-              /* Skip cells without gravity particles */
-              if (cj->gcount == 0) continue;
+  /* Create the multipole self and pair tasks. */
+  void *extra_data[2] = {e, ghosts};
+  threadpool_map(&e->threadpool, engine_make_self_gravity_tasks_mapper, NULL,
+                 s->nr_cells, 1, 0, extra_data);
 
-              /* Is that neighbour local ? */
-              if (cj->nodeID != nodeID) continue;  // MATTHIEU
-
-              /* Are the cells to close for a MM interaction ? */
-              if (!gravity_multipole_accept(ci->multipole, cj->multipole,
-                                            theta_crit_inv, 1)) {
-
-                scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0,
-                                  0, ci, cj);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
+  /* Clean up. */
   if (periodic) free(ghosts);
 }
 
+/**
+ * @brief Constructs the top-level tasks for the external gravity.
+ *
+ * @param e The #engine.
+ */
 void engine_make_external_gravity_tasks(struct engine *e) {
 
   struct space *s = e->s;
@@ -1774,9 +1896,15 @@ void engine_make_external_gravity_tasks(struct engine *e) {
  * Additional loop over neighbours can later be added by simply duplicating
  * all the tasks created by this function.
  *
- * @param e The #engine.
+ * @param map_data Offset of first two indices disguised as a pointer.
+ * @param num_elements Number of cells to traverse.
+ * @param extra_data The #engine.
  */
-void engine_make_hydroloop_tasks(struct engine *e) {
+void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements,
+                                        void *extra_data) {
+
+  /* Extract the engine pointer. */
+  struct engine *e = (struct engine *)extra_data;
 
   struct space *s = e->s;
   struct scheduler *sched = &e->sched;
@@ -1784,53 +1912,53 @@ void engine_make_hydroloop_tasks(struct engine *e) {
   const int *cdim = s->cdim;
   struct cell *cells = s->cells_top;
 
-  /* Run through the highest level of cells and add pairs. */
-  for (int i = 0; i < cdim[0]; i++) {
-    for (int j = 0; j < cdim[1]; j++) {
-      for (int k = 0; k < cdim[2]; k++) {
-
-        /* Get the cell */
-        const int cid = cell_getid(cdim, i, j, k);
-        struct cell *ci = &cells[cid];
-
-        /* Skip cells without hydro particles */
-        if (ci->count == 0) continue;
-
-        /* If the cells is local build a self-interaction */
-        if (ci->nodeID == nodeID)
-          scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0,
-                            ci, NULL);
-
-        /* Now loop over all the neighbours of this cell */
-        for (int ii = -1; ii < 2; ii++) {
-          int iii = i + ii;
-          if (!s->periodic && (iii < 0 || iii >= cdim[0])) continue;
-          iii = (iii + cdim[0]) % cdim[0];
-          for (int jj = -1; jj < 2; jj++) {
-            int jjj = j + jj;
-            if (!s->periodic && (jjj < 0 || jjj >= cdim[1])) continue;
-            jjj = (jjj + cdim[1]) % cdim[1];
-            for (int kk = -1; kk < 2; kk++) {
-              int kkk = k + kk;
-              if (!s->periodic && (kkk < 0 || kkk >= cdim[2])) continue;
-              kkk = (kkk + cdim[2]) % cdim[2];
-
-              /* Get the neighbouring cell */
-              const int cjd = cell_getid(cdim, iii, jjj, kkk);
-              struct cell *cj = &cells[cjd];
-
-              /* Is that neighbour local and does it have particles ? */
-              if (cid >= cjd || cj->count == 0 ||
-                  (ci->nodeID != nodeID && cj->nodeID != nodeID))
-                continue;
-
-              /* Construct the pair task */
-              const int sid =
-                  sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))];
-              scheduler_addtask(sched, task_type_pair, task_subtype_density,
-                                sid, 0, ci, cj);
-            }
-          }
+  /* Loop through the elements, which are just byte offsets from NULL. */
+  for (int ind = 0; ind < num_elements; ind++) {
+
+    /* Get the cell index. */
+    const int cid = (size_t)(map_data) + ind;
+    const int i = cid / (cdim[1] * cdim[2]);
+    const int j = (cid / cdim[2]) % cdim[1];
+    const int k = cid % cdim[2];
+
+    /* Get the cell */
+    struct cell *ci = &cells[cid];
+
+    /* Skip cells without hydro particles */
+    if (ci->count == 0) continue;
+
+    /* If the cell is local, build a self-interaction */
+    if (ci->nodeID == nodeID)
+      scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci,
+                        NULL);
+
+    /* Now loop over all the neighbours of this cell */
+    for (int ii = -1; ii < 2; ii++) {
+      int iii = i + ii;
+      if (!s->periodic && (iii < 0 || iii >= cdim[0])) continue;
+      iii = (iii + cdim[0]) % cdim[0];
+      for (int jj = -1; jj < 2; jj++) {
+        int jjj = j + jj;
+        if (!s->periodic && (jjj < 0 || jjj >= cdim[1])) continue;
+        jjj = (jjj + cdim[1]) % cdim[1];
+        for (int kk = -1; kk < 2; kk++) {
+          int kkk = k + kk;
+          if (!s->periodic && (kkk < 0 || kkk >= cdim[2])) continue;
+          kkk = (kkk + cdim[2]) % cdim[2];
+
+          /* Get the neighbouring cell */
+          const int cjd = cell_getid(cdim, iii, jjj, kkk);
+          struct cell *cj = &cells[cjd];
+
+          /* Is that neighbour local and does it have particles ? */
+          if (cid >= cjd || cj->count == 0 ||
+              (ci->nodeID != nodeID && cj->nodeID != nodeID))
+            continue;
+
+          /* Construct the pair task */
+          const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))];
+          scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0,
+                            ci, cj);
         }
       }
     }
@@ -1843,41 +1971,24 @@ void engine_make_hydroloop_tasks(struct engine *e) {
  * For each hydrodynamic and gravity task, construct the links with
  * the corresponding cell.  Similarly, construct the dependencies for
  * all the sorting tasks.
- *
- * @param e The #engine.
  */
-void engine_count_and_link_tasks(struct engine *e) {
+void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
+                                        void *extra_data) {
 
+  struct engine *e = (struct engine *)extra_data;
   struct scheduler *const sched = &e->sched;
-  const int nr_tasks = sched->nr_tasks;
 
-  for (int ind = 0; ind < nr_tasks; ind++) {
+  for (int ind = 0; ind < num_elements; ind++) {
+    struct task *const t = &((struct task *)map_data)[ind];
 
-    struct task *const t = &sched->tasks[ind];
     struct cell *const ci = t->ci;
     struct cell *const cj = t->cj;
 
-    /* Link sort tasks to the next-higher sort task. */
+    /* Link sort tasks to all the higher sort task. */
     if (t->type == task_type_sort) {
-      struct cell *finger = t->ci->parent;
-      while (finger != NULL && finger->sorts == NULL) finger = finger->parent;
-      if (finger != NULL) scheduler_addunlock(sched, t, finger->sorts);
-    }
-
-    /* Link drift tasks to the next-higher drift task. */
-    else if (t->type == task_type_drift_part) {
-      struct cell *finger = ci->parent;
-      while (finger != NULL && finger->drift_part == NULL)
-        finger = finger->parent;
-      if (finger != NULL) scheduler_addunlock(sched, t, finger->drift_part);
-    }
-
-    /* Link drift tasks to the next-higher drift task. */
-    else if (t->type == task_type_drift_gpart) {
-      struct cell *finger = ci->parent;
-      while (finger != NULL && finger->drift_gpart == NULL)
-        finger = finger->parent;
-      if (finger != NULL) scheduler_addunlock(sched, t, finger->drift_gpart);
+      for (struct cell *finger = t->ci->parent; finger != NULL;
+           finger = finger->parent)
+        if (finger->sorts != NULL) scheduler_addunlock(sched, t, finger->sorts);
     }
 
     /* Link self tasks to cells. */
@@ -2072,8 +2183,8 @@ static inline void engine_make_hydro_loops_dependencies(
 
   /* density loop --> ghost --> gradient loop --> extra_ghost */
   /* extra_ghost --> force loop  */
-  scheduler_addunlock(sched, density, c->super->ghost);
-  scheduler_addunlock(sched, c->super->ghost, gradient);
+  scheduler_addunlock(sched, density, c->super->ghost_in);
+  scheduler_addunlock(sched, c->super->ghost_out, gradient);
   scheduler_addunlock(sched, gradient, c->super->extra_ghost);
   scheduler_addunlock(sched, c->super->extra_ghost, force);
 
@@ -2103,8 +2214,8 @@ static inline void engine_make_hydro_loops_dependencies(struct scheduler *sched,
                                                         struct cell *c,
                                                         int with_cooling) {
   /* density loop --> ghost --> force loop */
-  scheduler_addunlock(sched, density, c->super->ghost);
-  scheduler_addunlock(sched, c->super->ghost, force);
+  scheduler_addunlock(sched, density, c->super->ghost_in);
+  scheduler_addunlock(sched, c->super->ghost_out, force);
 
   if (with_cooling) {
     /* force loop --> cooling (--> kick2)  */
@@ -2125,32 +2236,32 @@ static inline void engine_make_hydro_loops_dependencies(struct scheduler *sched,
  * corresponding to the second hydro loop over neighbours.
  * With all the relevant tasks for a given cell available, we construct
  * all the dependencies for that cell.
- *
- * @param e The #engine.
  */
-void engine_make_extra_hydroloop_tasks(struct engine *e) {
+void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
+                                              void *extra_data) {
 
+  struct engine *e = (struct engine *)extra_data;
   struct scheduler *sched = &e->sched;
-  const int nr_tasks = sched->nr_tasks;
   const int nodeID = e->nodeID;
   const int with_cooling = (e->policy & engine_policy_cooling);
 
-  for (int ind = 0; ind < nr_tasks; ind++) {
-    struct task *t = &sched->tasks[ind];
+  for (int ind = 0; ind < num_elements; ind++) {
+    struct task *t = &((struct task *)map_data)[ind];
 
     /* Sort tasks depend on the drift of the cell. */
     if (t->type == task_type_sort && t->ci->nodeID == engine_rank) {
-      scheduler_addunlock(sched, t->ci->drift_part, t);
+      scheduler_addunlock(sched, t->ci->super->drift_part, t);
     }
 
     /* Self-interaction? */
     else if (t->type == task_type_self && t->subtype == task_subtype_density) {
 
-      /* Make all density tasks depend on the drift. */
-      scheduler_addunlock(sched, t->ci->drift_part, t);
+      /* Make all density tasks depend on the drift and the sorts. */
+      scheduler_addunlock(sched, t->ci->super->drift_part, t);
+      scheduler_addunlock(sched, t->ci->super->sorts, t);
 
 #ifdef EXTRA_HYDRO_LOOP
-      /* Start by constructing the task for the second  and third hydro loop */
+      /* Start by constructing the task for the second and third hydro loop. */
       struct task *t2 = scheduler_addtask(
           sched, task_type_self, task_subtype_gradient, 0, 0, t->ci, NULL);
       struct task *t3 = scheduler_addtask(
@@ -2181,11 +2292,15 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
     /* Otherwise, pair interaction? */
     else if (t->type == task_type_pair && t->subtype == task_subtype_density) {
 
-      /* Make all density tasks depend on the drift. */
+      /* Make all density tasks depend on the drift and the sorts. */
       if (t->ci->nodeID == engine_rank)
-        scheduler_addunlock(sched, t->ci->drift_part, t);
-      if (t->cj->nodeID == engine_rank)
-        scheduler_addunlock(sched, t->cj->drift_part, t);
+        scheduler_addunlock(sched, t->ci->super->drift_part, t);
+      scheduler_addunlock(sched, t->ci->super->sorts, t);
+      if (t->ci->super != t->cj->super) {
+        if (t->cj->nodeID == engine_rank)
+          scheduler_addunlock(sched, t->cj->super->drift_part, t);
+        scheduler_addunlock(sched, t->cj->super->sorts, t);
+      }
 
 #ifdef EXTRA_HYDRO_LOOP
       /* Start by constructing the task for the second and third hydro loop */
@@ -2238,6 +2353,10 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
     else if (t->type == task_type_sub_self &&
              t->subtype == task_subtype_density) {
 
+      /* Make all density tasks depend on the drift and sorts. */
+      scheduler_addunlock(sched, t->ci->super->drift_part, t);
+      scheduler_addunlock(sched, t->ci->super->sorts, t);
+
 #ifdef EXTRA_HYDRO_LOOP
 
       /* Start by constructing the task for the second and third hydro loop */
@@ -2280,6 +2399,16 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
     else if (t->type == task_type_sub_pair &&
              t->subtype == task_subtype_density) {
 
+      /* Make all density tasks depend on the drift and the sorts. */
+      if (t->ci->nodeID == engine_rank)
+        scheduler_addunlock(sched, t->ci->super->drift_part, t);
+      scheduler_addunlock(sched, t->ci->super->sorts, t);
+      if (t->ci->super != t->cj->super) {
+        if (t->cj->nodeID == engine_rank)
+          scheduler_addunlock(sched, t->cj->super->drift_part, t);
+        scheduler_addunlock(sched, t->cj->super->sorts, t);
+      }
+
 #ifdef EXTRA_HYDRO_LOOP
 
       /* Start by constructing the task for the second and third hydro loop */
@@ -2364,21 +2493,6 @@ void engine_make_gravityrecursive_tasks(struct engine *e) {
   /* } */
 }
 
-void engine_check_sort_tasks(struct engine *e, struct cell *c) {
-
-  /* Find the parent sort task, if any, and copy its flags. */
-  if (c->sorts != NULL) {
-    struct cell *parent = c->parent;
-    while (parent != NULL && parent->sorts == NULL) parent = parent->parent;
-    if (parent != NULL) c->sorts->flags |= parent->sorts->flags;
-  }
-
-  /* Recurse? */
-  if (c->split)
-    for (int k = 0; k < 8; k++)
-      if (c->progeny[k] != NULL) engine_check_sort_tasks(e, c->progeny[k]);
-}
-
 /**
  * @brief Fill the #space's task list.
  *
@@ -2396,7 +2510,10 @@ void engine_maketasks(struct engine *e) {
   scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell);
 
   /* Construct the firt hydro loop over neighbours */
-  if (e->policy & engine_policy_hydro) engine_make_hydroloop_tasks(e);
+  if (e->policy & engine_policy_hydro) {
+    threadpool_map(&e->threadpool, engine_make_hydroloop_tasks_mapper, NULL,
+                   s->nr_cells, 1, 0, e);
+  }
 
   /* Add the self gravity tasks. */
   if (e->policy & engine_policy_self_gravity) engine_make_self_gravity_tasks(e);
@@ -2411,17 +2528,31 @@ void engine_maketasks(struct engine *e) {
   /* Split the tasks. */
   scheduler_splittasks(sched);
 
-  /* Allocate the list of cell-task links. The maximum number of links is the
-   * number of cells (s->tot_cells) times the number of neighbours (26) times
-   * the number of interaction types, so 26 * 3 (density, force, grav) pairs
-   * and 4 (density, force, grav, ext_grav) self.
-   */
+  /* Free the old list of cell-task links. */
   if (e->links != NULL) free(e->links);
+  e->size_links = 0;
+
+/* The maximum number of links is the
+ * number of cells (s->tot_cells) times the number of neighbours (26) plus
+ * the cell itself (27 in total) times the number of interaction types,
+ * i.e. 27 * 3 (density, gradient, force) or 27 * 2 (density, force).
+ */
 #ifdef EXTRA_HYDRO_LOOP
-  e->size_links = s->tot_cells * (26 * 4 + 4);
+  const int hydro_tasks_per_cell = 27 * 3;
 #else
-  e->size_links = s->tot_cells * (26 * 3 + 4);
+  const int hydro_tasks_per_cell = 27 * 2;
 #endif
+  const int self_grav_tasks_per_cell = 27 * 2;
+  const int ext_grav_tasks_per_cell = 1;
+
+  if (e->policy & engine_policy_hydro)
+    e->size_links += s->tot_cells * hydro_tasks_per_cell;
+  if (e->policy & engine_policy_external_gravity)
+    e->size_links += s->tot_cells * ext_grav_tasks_per_cell;
+  if (e->policy & engine_policy_self_gravity)
+    e->size_links += s->tot_cells * self_grav_tasks_per_cell;
+
+  /* Allocate the new list */
   if ((e->links = malloc(sizeof(struct link) * e->size_links)) == NULL)
     error("Failed to allocate cell-task links.");
   e->nr_links = 0;
@@ -2433,23 +2564,23 @@ void engine_maketasks(struct engine *e) {
   /* Count the number of tasks associated with each cell and
      store the density tasks in each cell, and make each sort
      depend on the sorts of its progeny. */
-  engine_count_and_link_tasks(e);
+  threadpool_map(&e->threadpool, engine_count_and_link_tasks_mapper,
+                 sched->tasks, sched->nr_tasks, sizeof(struct task), 0, e);
 
   /* Now that the self/pair tasks are at the right level, set the super
    * pointers. */
-  for (int k = 0; k < nr_cells; k++) cell_set_super(&cells[k], NULL);
-
-  /* Append hierarchical tasks to each cell. */
-  for (int k = 0; k < nr_cells; k++)
-    engine_make_hierarchical_tasks(e, &cells[k]);
+  threadpool_map(&e->threadpool, cell_set_super_mapper, cells, nr_cells,
+                 sizeof(struct cell), 0, NULL);
 
   /* Append hierarchical tasks to each cell. */
-  for (int k = 0; k < nr_cells; k++) engine_check_sort_tasks(e, &cells[k]);
+  threadpool_map(&e->threadpool, engine_make_hierarchical_tasks_mapper, cells,
+                 nr_cells, sizeof(struct cell), 0, e);
 
   /* Run through the tasks and make force tasks for each density task.
      Each force task depends on the cell ghosts and unlocks the kick task
      of its super-cell. */
-  if (e->policy & engine_policy_hydro) engine_make_extra_hydroloop_tasks(e);
+  threadpool_map(&e->threadpool, engine_make_extra_hydroloop_tasks_mapper,
+                 sched->tasks, sched->nr_tasks, sizeof(struct task), 0, e);
 
   /* Add the dependencies for the gravity stuff */
   if (e->policy & (engine_policy_self_gravity | engine_policy_external_gravity))
@@ -2523,6 +2654,11 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
 
       /* Set this task's skip. */
       if (cell_is_active(t->ci, e)) scheduler_activate(s, t);
+
+      /* Store current values of dx_max and h_max. */
+      if (t->type == task_type_sub_self && t->subtype == task_subtype_density) {
+        cell_activate_subcell_tasks(t->ci, NULL, s);
+      }
     }
 
     /* Pair? */
@@ -2532,168 +2668,169 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
       struct cell *ci = t->ci;
       struct cell *cj = t->cj;
 
-      /* Set this task's skip, otherwise nothing to do. */
-      if (cell_is_active(t->ci, e) || cell_is_active(t->cj, e))
+      /* If this task does not involve any active cells, skip it. */
+      if (!cell_is_active(t->ci, e) && !cell_is_active(t->cj, e)) continue;
+
+      /* Only activate tasks that involve a local active cell. */
+      if ((cell_is_active(ci, e) && ci->nodeID == engine_rank) ||
+          (cj != NULL && cell_is_active(cj, e) && cj->nodeID == engine_rank)) {
         scheduler_activate(s, t);
-      else
-        continue;
-
-      /* If this is not a density task, we don't have to do any of the below. */
-      if (t->subtype != task_subtype_density) continue;
-
-      /* Too much particle movement? */
-      if (max(ci->h_max, cj->h_max) + ci->dx_max_part + cj->dx_max_part >
-          cj->dmin)
-        *rebuild_space = 1;
-
-      /* Set the correct sorting flags */
-      if (t->type == task_type_pair) {
-        if (ci->dx_max_sort > space_maxreldx * ci->dmin) {
-          for (struct cell *finger = ci; finger != NULL;
-               finger = finger->parent)
-            finger->sorted = 0;
-        }
-        if (cj->dx_max_sort > space_maxreldx * cj->dmin) {
-          for (struct cell *finger = cj; finger != NULL;
-               finger = finger->parent)
-            finger->sorted = 0;
-        }
-        if (!(ci->sorted & (1 << t->flags))) {
-#ifdef SWIFT_DEBUG_CHECKS
-          if (!(ci->sorts->flags & (1 << t->flags)))
-            error("bad flags in sort task.");
-#endif
-          scheduler_activate(s, ci->sorts);
-          if (ci->nodeID == engine_rank) scheduler_activate(s, ci->drift_part);
+
+        /* Set the correct sorting flags */
+        if (t->type == task_type_pair && t->subtype == task_subtype_density) {
+          /* Store some values. */
+          atomic_or(&ci->requires_sorts, 1 << t->flags);
+          atomic_or(&cj->requires_sorts, 1 << t->flags);
+          ci->dx_max_sort_old = ci->dx_max_sort;
+          cj->dx_max_sort_old = cj->dx_max_sort;
+
+          /* Activate the drift tasks. */
+          if (ci->nodeID == engine_rank) cell_activate_drift_part(ci, s);
+          if (cj->nodeID == engine_rank) cell_activate_drift_part(cj, s);
+
+          /* Check the sorts and activate them if needed. */
+          cell_activate_sorts(ci, t->flags, s);
+          cell_activate_sorts(cj, t->flags, s);
         }
-        if (!(cj->sorted & (1 << t->flags))) {
-#ifdef SWIFT_DEBUG_CHECKS
-          if (!(cj->sorts->flags & (1 << t->flags)))
-            error("bad flags in sort task.");
-#endif
-          scheduler_activate(s, cj->sorts);
-          if (cj->nodeID == engine_rank) scheduler_activate(s, cj->drift_part);
+        /* Store current values of dx_max and h_max. */
+        else if (t->type == task_type_sub_pair &&
+                 t->subtype == task_subtype_density) {
+          cell_activate_subcell_tasks(t->ci, t->cj, s);
         }
       }
 
-#ifdef WITH_MPI
-      /* Activate the send/recv flags. */
-      if (ci->nodeID != engine_rank) {
+      /* Only interested in density tasks as of here. */
+      if (t->subtype == task_subtype_density) {
+
+        /* Too much particle movement? */
+        if (cell_need_rebuild_for_pair(ci, cj)) *rebuild_space = 1;
 
-        /* Activate the tasks to recv foreign cell ci's data. */
-        scheduler_activate(s, ci->recv_xv);
-        if (cell_is_active(ci, e)) {
-          scheduler_activate(s, ci->recv_rho);
+#ifdef WITH_MPI
+        /* Activate the send/recv tasks. */
+        if (ci->nodeID != engine_rank) {
+
+          /* If the local cell is active, receive data from the foreign cell. */
+          if (cell_is_active(cj, e)) {
+            scheduler_activate(s, ci->recv_xv);
+            if (cell_is_active(ci, e)) {
+              scheduler_activate(s, ci->recv_rho);
 #ifdef EXTRA_HYDRO_LOOP
-          scheduler_activate(s, ci->recv_gradient);
+              scheduler_activate(s, ci->recv_gradient);
 #endif
-          scheduler_activate(s, ci->recv_ti);
-        }
+            }
+          }
 
-        /* Look for the local cell cj's send tasks. */
-        struct link *l = NULL;
-        for (l = cj->send_xv; l != NULL && l->t->cj->nodeID != ci->nodeID;
-             l = l->next)
-          ;
-        if (l == NULL) error("Missing link to send_xv task.");
-        scheduler_activate(s, l->t);
-
-        /* Drift both cells, the foreign one at the level which it is sent. */
-        if (l->t->ci->drift_part)
-          scheduler_activate(s, l->t->ci->drift_part);
-        else
-          error("Drift task missing !");
-        if (t->type == task_type_pair) scheduler_activate(s, cj->drift_part);
-
-        if (cell_is_active(cj, e)) {
-
-          for (l = cj->send_rho; l != NULL && l->t->cj->nodeID != ci->nodeID;
-               l = l->next)
-            ;
-          if (l == NULL) error("Missing link to send_rho task.");
-          scheduler_activate(s, l->t);
+          /* If the foreign cell is active, we want its ti_end values. */
+          if (cell_is_active(ci, e)) scheduler_activate(s, ci->recv_ti);
+
+          /* Look for the local cell cj's send tasks. */
+          if (cell_is_active(ci, e)) {
+            struct link *l = NULL;
+            for (l = cj->send_xv; l != NULL && l->t->cj->nodeID != ci->nodeID;
+                 l = l->next)
+              ;
+            if (l == NULL) error("Missing link to send_xv task.");
+            scheduler_activate(s, l->t);
+
+            /* Drift the cell which will be sent at the level at which it is
+               sent, i.e. drift the cell specified in the send task (l->t)
+               itself. */
+            cell_activate_drift_part(l->t->ci, s);
+
+            if (cell_is_active(cj, e)) {
+              struct link *l = NULL;
+              for (l = cj->send_rho;
+                   l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next)
+                ;
+              if (l == NULL) error("Missing link to send_rho task.");
+              scheduler_activate(s, l->t);
 
 #ifdef EXTRA_HYDRO_LOOP
-          for (l = cj->send_gradient;
-               l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next)
-            ;
-          if (l == NULL) error("Missing link to send_gradient task.");
-          scheduler_activate(s, l->t);
+              for (l = cj->send_gradient;
+                   l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next)
+                ;
+              if (l == NULL) error("Missing link to send_gradient task.");
+              scheduler_activate(s, l->t);
 #endif
+            }
+          }
 
-          for (l = cj->send_ti; l != NULL && l->t->cj->nodeID != ci->nodeID;
-               l = l->next)
-            ;
-          if (l == NULL) error("Missing link to send_ti task.");
-          scheduler_activate(s, l->t);
-        }
+          /* If the local cell is active, send its ti_end values. */
+          if (cell_is_active(cj, e)) {
+            struct link *l = NULL;
+            for (l = cj->send_ti; l != NULL && l->t->cj->nodeID != ci->nodeID;
+                 l = l->next)
+              ;
+            if (l == NULL) error("Missing link to send_ti task.");
+            scheduler_activate(s, l->t);
+          }
 
-      } else if (cj->nodeID != engine_rank) {
+        } else if (cj->nodeID != engine_rank) {
 
-        /* Activate the tasks to recv foreign cell cj's data. */
-        scheduler_activate(s, cj->recv_xv);
-        if (cell_is_active(cj, e)) {
-          scheduler_activate(s, cj->recv_rho);
+          /* If the local cell is active, receive data from the foreign cell. */
+          if (cell_is_active(ci, e)) {
+            scheduler_activate(s, cj->recv_xv);
+            if (cell_is_active(cj, e)) {
+              scheduler_activate(s, cj->recv_rho);
 #ifdef EXTRA_HYDRO_LOOP
-          scheduler_activate(s, cj->recv_gradient);
+              scheduler_activate(s, cj->recv_gradient);
 #endif
-          scheduler_activate(s, cj->recv_ti);
-        }
+            }
+          }
+
+          /* If the foreign cell is active, we want its ti_end values. */
+          if (cell_is_active(cj, e)) scheduler_activate(s, cj->recv_ti);
+
+          /* Look for the local cell ci's send tasks. */
+          if (cell_is_active(cj, e)) {
+            struct link *l = NULL;
+            for (l = ci->send_xv; l != NULL && l->t->cj->nodeID != cj->nodeID;
+                 l = l->next)
+              ;
+            if (l == NULL) error("Missing link to send_xv task.");
+            scheduler_activate(s, l->t);
+
+            /* Drift the cell which will be sent at the level at which it is
+               sent, i.e. drift the cell specified in the send task (l->t)
+               itself. */
+            cell_activate_drift_part(l->t->ci, s);
 
-        /* Look for the local cell ci's send tasks. */
-        struct link *l = NULL;
-        for (l = ci->send_xv; l != NULL && l->t->cj->nodeID != cj->nodeID;
-             l = l->next)
-          ;
-        if (l == NULL) error("Missing link to send_xv task.");
-        scheduler_activate(s, l->t);
-
-        /* Drift both cells, the foreign one at the level which it is sent. */
-        if (l->t->ci->drift_part)
-          scheduler_activate(s, l->t->ci->drift_part);
-        else
-          error("Drift task missing !");
-        if (t->type == task_type_pair) scheduler_activate(s, ci->drift_part);
-
-        if (cell_is_active(ci, e)) {
-          for (l = ci->send_rho; l != NULL && l->t->cj->nodeID != cj->nodeID;
-               l = l->next)
-            ;
-          if (l == NULL) error("Missing link to send_rho task.");
-          scheduler_activate(s, l->t);
+            if (cell_is_active(ci, e)) {
+
+              struct link *l = NULL;
+              for (l = ci->send_rho;
+                   l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next)
+                ;
+              if (l == NULL) error("Missing link to send_rho task.");
+              scheduler_activate(s, l->t);
 
 #ifdef EXTRA_HYDRO_LOOP
-          for (l = ci->send_gradient;
-               l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next)
-            ;
-          if (l == NULL) error("Missing link to send_gradient task.");
-          scheduler_activate(s, l->t);
+              for (l = ci->send_gradient;
+                   l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next)
+                ;
+              if (l == NULL) error("Missing link to send_gradient task.");
+              scheduler_activate(s, l->t);
 #endif
+            }
+          }
 
-          for (l = ci->send_ti; l != NULL && l->t->cj->nodeID != cj->nodeID;
-               l = l->next)
-            ;
-          if (l == NULL) error("Missing link to send_ti task.");
-          scheduler_activate(s, l->t);
+          /* If the local cell is active, send its ti_end values. */
+          if (cell_is_active(ci, e)) {
+            struct link *l = NULL;
+            for (l = ci->send_ti; l != NULL && l->t->cj->nodeID != cj->nodeID;
+                 l = l->next)
+              ;
+            if (l == NULL) error("Missing link to send_ti task.");
+            scheduler_activate(s, l->t);
+          }
         }
-
-      } else if (t->type == task_type_pair) {
-        scheduler_activate(s, ci->drift_part);
-        scheduler_activate(s, cj->drift_part);
-      }
-#else
-      if (t->type == task_type_pair) {
-        scheduler_activate(s, ci->drift_part);
-        scheduler_activate(s, cj->drift_part);
-      }
 #endif
+      }
     }
 
     /* Kick/Drift/init ? */
-    else if (t->type == task_type_kick1 || t->type == task_type_kick2 ||
-             t->type == task_type_drift_part ||
-             t->type == task_type_drift_gpart ||
-             t->type == task_type_init_grav) {
+    if (t->type == task_type_kick1 || t->type == task_type_kick2 ||
+        t->type == task_type_drift_gpart || t->type == task_type_init_grav) {
       if (cell_is_active(t->ci, e)) scheduler_activate(s, t);
     }
 
@@ -2733,7 +2870,7 @@ int engine_marktasks(struct engine *e) {
   /* Run through the tasks and mark as skip or not. */
   size_t extra_data[3] = {(size_t)e, rebuild_space, (size_t)&e->sched};
   threadpool_map(&e->threadpool, engine_marktasks_mapper, s->tasks, s->nr_tasks,
-                 sizeof(struct task), 10000, extra_data);
+                 sizeof(struct task), 0, extra_data);
   rebuild_space = extra_data[1];
 
   if (e->verbose)
@@ -2790,8 +2927,10 @@ void engine_print_task_counts(struct engine *e) {
  * @brief Rebuild the space and tasks.
  *
  * @param e The #engine.
+ * @param clean_h_values Are we cleaning up the values of h before building
+ * the tasks?
  */
-void engine_rebuild(struct engine *e) {
+void engine_rebuild(struct engine *e, int clean_h_values) {
 
   const ticks tic = getticks();
 
@@ -2802,7 +2941,7 @@ void engine_rebuild(struct engine *e) {
   space_rebuild(e->s, e->verbose);
 
   /* Initial cleaning up session ? */
-  if (e->s->sanitized == 0) space_sanitize(e->s);
+  if (clean_h_values) space_sanitize(e->s);
 
 /* If in parallel, exchange the cell structure. */
 #ifdef WITH_MPI
@@ -2856,7 +2995,7 @@ void engine_prepare(struct engine *e) {
   if (e->forcerepart) engine_repartition(e);
 
   /* Do we need rebuilding ? */
-  if (e->forcerebuild) engine_rebuild(e);
+  if (e->forcerebuild) engine_rebuild(e, 0);
 
   /* Unskip active tasks and check for rebuild */
   engine_unskip(e);
@@ -2878,39 +3017,14 @@ void engine_prepare(struct engine *e) {
  * @brief Implements a barrier for the #runner threads.
  *
  * @param e The #engine.
- * @param tid The thread ID
  */
-void engine_barrier(struct engine *e, int tid) {
-
-  /* First, get the barrier mutex. */
-  if (pthread_mutex_lock(&e->barrier_mutex) != 0)
-    error("Failed to get barrier mutex.");
-
-  /* This thread is no longer running. */
-  e->barrier_running -= 1;
+void engine_barrier(struct engine *e) {
 
-  /* If all threads are in, send a signal... */
-  if (e->barrier_running == 0)
-    if (pthread_cond_broadcast(&e->barrier_cond) != 0)
-      error("Failed to broadcast barrier full condition.");
+  /* Wait at the wait barrier. */
+  pthread_barrier_wait(&e->wait_barrier);
 
-  /* Wait for the barrier to open. */
-  while (e->barrier_launch == 0 || tid >= e->barrier_launchcount)
-    if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0)
-      error("Error waiting for barrier to close.");
-
-  /* This thread has been launched. */
-  e->barrier_running += 1;
-  e->barrier_launch -= 1;
-
-  /* If I'm the last one out, signal the condition again. */
-  if (e->barrier_launch == 0)
-    if (pthread_cond_broadcast(&e->barrier_cond) != 0)
-      error("Failed to broadcast empty barrier condition.");
-
-  /* Last but not least, release the mutex. */
-  if (pthread_mutex_unlock(&e->barrier_mutex) != 0)
-    error("Failed to get unlock the barrier mutex.");
+  /* Wait at the run barrier. */
+  pthread_barrier_wait(&e->run_barrier);
 }
 
 /**
@@ -3145,6 +3259,9 @@ void engine_skip_force_and_kick(struct engine *e) {
         t->type == task_type_cooling || t->type == task_type_sourceterms)
       t->skip = 1;
   }
+
+  /* Run through the cells and clear some flags. */
+  space_map_cells_pre(e->s, 1, cell_clear_drift_flags, NULL);
 }
 
 /**
@@ -3161,19 +3278,20 @@ void engine_skip_drift(struct engine *e) {
 
     struct task *t = &tasks[i];
 
-    /* Skip everything that moves the particles */
-    if (t->type == task_type_drift_part || t->type == task_type_drift_gpart)
-      t->skip = 1;
+    /* Skip everything that updates the particles */
+    if (t->type == task_type_drift_part) t->skip = 1;
   }
+
+  /* Run through the cells and clear some flags. */
+  space_map_cells_pre(e->s, 1, cell_clear_drift_flags, NULL);
 }
 
 /**
  * @brief Launch the runners.
  *
  * @param e The #engine.
- * @param nr_runners The number of #runner to let loose.
  */
-void engine_launch(struct engine *e, int nr_runners) {
+void engine_launch(struct engine *e) {
 
   const ticks tic = getticks();
 
@@ -3186,15 +3304,10 @@ void engine_launch(struct engine *e, int nr_runners) {
   atomic_inc(&e->sched.waiting);
 
   /* Cry havoc and let loose the dogs of war. */
-  e->barrier_launch = nr_runners;
-  e->barrier_launchcount = nr_runners;
-  if (pthread_cond_broadcast(&e->barrier_cond) != 0)
-    error("Failed to broadcast barrier open condition.");
+  pthread_barrier_wait(&e->run_barrier);
 
   /* Load the tasks. */
-  pthread_mutex_unlock(&e->barrier_mutex);
   scheduler_start(&e->sched);
-  pthread_mutex_lock(&e->barrier_mutex);
 
   /* Remove the safeguard. */
   pthread_mutex_lock(&e->sched.sleep_mutex);
@@ -3203,9 +3316,7 @@ void engine_launch(struct engine *e, int nr_runners) {
   pthread_mutex_unlock(&e->sched.sleep_mutex);
 
   /* Sit back and wait for the runners to come home. */
-  while (e->barrier_launch || e->barrier_running)
-    if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0)
-      error("Error while waiting for barrier.");
+  pthread_barrier_wait(&e->wait_barrier);
 
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -3218,9 +3329,12 @@ void engine_launch(struct engine *e, int nr_runners) {
  *
  * @param e The #engine
  * @param flag_entropy_ICs Did the 'Internal Energy' of the particles actually
- *contain entropy ?
+ * contain entropy ?
+ * @param clean_h_values Are we cleaning up the values of h before building
+ * the tasks?
  */
-void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
+void engine_init_particles(struct engine *e, int flag_entropy_ICs,
+                           int clean_h_values) {
 
   struct space *s = e->s;
 
@@ -3237,7 +3351,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
   }
 
   /* Construct all cells and tasks to start everything */
-  engine_rebuild(e);
+  engine_rebuild(e, clean_h_values);
 
   /* No time integration. We just want the density and ghosts */
   engine_skip_force_and_kick(e);
@@ -3252,7 +3366,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
 
   /* Now, launch the calculation */
   TIMER_TIC;
-  engine_launch(e, e->nr_threads);
+  engine_launch(e);
   TIMER_TOC(timer_runners);
 
   /* Apply some conversions (e.g. internal energy -> entropy) */
@@ -3268,7 +3382,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
     if (hydro_need_extra_init_loop) {
       engine_marktasks(e);
       engine_skip_force_and_kick(e);
-      engine_launch(e, e->nr_threads);
+      engine_launch(e);
     }
   }
 
@@ -3310,7 +3424,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
 #endif
 
   /* Run the 0th time-step */
-  engine_launch(e, e->nr_threads);
+  engine_launch(e);
 
 #ifdef SWIFT_GRAVITY_FORCE_CHECKS
   /* Check the accuracy of the gravity calculation */
@@ -3321,6 +3435,69 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
   /* Recover the (integer) end of the next time-step */
   engine_collect_timestep_and_rebuild(e, 1);
 
+  /* Check if any particles have the same position. This is not
+   * allowed (/0) so we abort.*/
+  if (s->nr_parts > 0) {
+
+    /* Sorting should put the same positions next to each other... */
+    int failed = 0;
+    double *prev_x = s->parts[0].x;
+    for (size_t k = 1; k < s->nr_parts; k++) {
+      if (prev_x[0] == s->parts[k].x[0] && prev_x[1] == s->parts[k].x[1] &&
+          prev_x[2] == s->parts[k].x[2]) {
+        if (e->verbose)
+          message("Two particles occupy location: %f %f %f", prev_x[0],
+                  prev_x[1], prev_x[2]);
+        failed++;
+      }
+      prev_x = s->parts[k].x;
+    }
+    if (failed > 0)
+      error(
+          "Have %d particle pairs with the same locations.\n"
+          "Cannot continue",
+          failed);
+  }
+
+  /* Also check any gparts. This is not supposed to be fatal so only warn. */
+  if (s->nr_gparts > 0) {
+    int failed = 0;
+    double *prev_x = s->gparts[0].x;
+    for (size_t k = 1; k < s->nr_gparts; k++) {
+      if (prev_x[0] == s->gparts[k].x[0] && prev_x[1] == s->gparts[k].x[1] &&
+          prev_x[2] == s->gparts[k].x[2]) {
+        if (e->verbose)
+          message("Two gparts occupy location: %f %f %f / %f %f %f", prev_x[0],
+                  prev_x[1], prev_x[2], s->gparts[k].x[0], s->gparts[k].x[1],
+                  s->gparts[k].x[2]);
+        failed++;
+      }
+      prev_x = s->gparts[k].x;
+    }
+    if (failed > 0)
+      message(
+          "WARNING: found %d gpart pairs at the same location. "
+          "That is not optimal",
+          failed);
+  }
+
+  /* Check the top-level cell h_max matches the particles as these can be
+   * updated in the the ghost tasks (only a problem if the ICs estimates for h
+   * are too small). Note this must be followed by a rebuild as sub-cells will
+   * not be updated until that is done. */
+  if (s->cells_top != NULL && s->nr_parts > 0) {
+    for (int i = 0; i < s->nr_cells; i++) {
+      struct cell *c = &s->cells_top[i];
+      if (c->nodeID == engine_rank && c->count > 0) {
+        float part_h_max = c->parts[0].h;
+        for (int k = 1; k < c->count; k++) {
+          if (c->parts[k].h > part_h_max) part_h_max = c->parts[k].h;
+        }
+        c->h_max = max(part_h_max, c->h_max);
+      }
+    }
+  }
+
   clocks_gettime(&time2);
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -3421,7 +3598,7 @@ void engine_step(struct engine *e) {
 
   /* Start all the tasks. */
   TIMER_TIC;
-  engine_launch(e, e->nr_threads);
+  engine_launch(e);
   TIMER_TOC(timer_runners);
 
 #ifdef SWIFT_GRAVITY_FORCE_CHECKS
@@ -3537,7 +3714,7 @@ void engine_do_drift_all_mapper(void *map_data, int num_elements,
     struct cell *c = &cells[ind];
     if (c != NULL && c->nodeID == e->nodeID) {
       /* Drift all the particles */
-      cell_drift_part(c, e);
+      cell_drift_part(c, e, 1);
 
       /* Drift all the g-particles */
       cell_drift_gpart(c, e);
@@ -3564,7 +3741,7 @@ void engine_drift_all(struct engine *e) {
 #endif
 
   threadpool_map(&e->threadpool, engine_do_drift_all_mapper, e->s->cells_top,
-                 e->s->nr_cells, sizeof(struct cell), 1, e);
+                 e->s->nr_cells, sizeof(struct cell), 0, e);
 
   /* Synchronize particle positions */
   space_synchronize_particle_positions(e->s);
@@ -3616,7 +3793,7 @@ void engine_drift_top_multipoles(struct engine *e) {
   const ticks tic = getticks();
 
   threadpool_map(&e->threadpool, engine_do_drift_top_multipoles_mapper,
-                 e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 10, e);
+                 e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 0, e);
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Check that all cells have been drifted to the current time. */
@@ -3654,7 +3831,7 @@ void engine_reconstruct_multipoles(struct engine *e) {
   const ticks tic = getticks();
 
   threadpool_map(&e->threadpool, engine_do_reconstruct_multipoles_mapper,
-                 e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 10, e);
+                 e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 0, e);
 
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -4054,7 +4231,7 @@ void engine_init(struct engine *e, struct space *s,
   e->parameter_file = params;
 #ifdef WITH_MPI
   e->cputime_last_step = 0;
-  e->last_repartition = -1;
+  e->last_repartition = 0;
 #endif
   engine_rank = nodeID;
 
@@ -4228,7 +4405,7 @@ void engine_init(struct engine *e, struct space *s,
             "Version: %s \n# "
             "Number of threads: %d\n# Number of MPI ranks: %d\n# Hydrodynamic "
             "scheme: %s\n# Hydrodynamic kernel: %s\n# No. of neighbours: %.2f "
-            "+/- %.2f\n# Eta: %f\n",
+            "+/- %.4f\n# Eta: %f\n",
             hostname(), git_branch(), git_revision(), compiler_name(),
             compiler_version(), e->nr_threads, e->nr_nodes, SPH_IMPLEMENTATION,
             kernel_name, e->hydro_properties->target_neighbours,
@@ -4322,20 +4499,14 @@ void engine_init(struct engine *e, struct space *s,
   threadpool_init(&e->threadpool, e->nr_threads);
 
   /* First of all, init the barrier and lock it. */
-  if (pthread_mutex_init(&e->barrier_mutex, NULL) != 0)
-    error("Failed to initialize barrier mutex.");
-  if (pthread_cond_init(&e->barrier_cond, NULL) != 0)
-    error("Failed to initialize barrier condition variable.");
-  if (pthread_mutex_lock(&e->barrier_mutex) != 0)
-    error("Failed to lock barrier mutex.");
-  e->barrier_running = 0;
-  e->barrier_launch = 0;
-  e->barrier_launchcount = 0;
+  if (pthread_barrier_init(&e->wait_barrier, NULL, e->nr_threads + 1) != 0 ||
+      pthread_barrier_init(&e->run_barrier, NULL, e->nr_threads + 1) != 0)
+    error("Failed to initialize barrier.");
 
   /* Init the scheduler with enough tasks for the initial sorting tasks. */
   const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads;
-  scheduler_init(&e->sched, e->s, nr_tasks, nr_queues, scheduler_flag_steal,
-                 e->nodeID, &e->threadpool);
+  scheduler_init(&e->sched, e->s, nr_tasks, nr_queues,
+                 (policy & scheduler_flag_steal), e->nodeID, &e->threadpool);
 
   /* Allocate and init the threads. */
   if ((e->runners = (struct runner *)malloc(sizeof(struct runner) *
@@ -4344,7 +4515,6 @@ void engine_init(struct engine *e, struct space *s,
   for (int k = 0; k < e->nr_threads; k++) {
     e->runners[k].id = k;
     e->runners[k].e = e;
-    e->barrier_running += 1;
     if (pthread_create(&e->runners[k].thread, NULL, &runner_main,
                        &e->runners[k]) != 0)
       error("Failed to create runner thread.");
@@ -4380,8 +4550,12 @@ void engine_init(struct engine *e, struct space *s,
       e->runners[k].qid = k * nr_queues / e->nr_threads;
     }
 
-#ifdef WITH_VECTORIZATION
     /* Allocate particle caches. */
+    e->runners[k].ci_gravity_cache.count = 0;
+    e->runners[k].cj_gravity_cache.count = 0;
+    gravity_cache_init(&e->runners[k].ci_gravity_cache, space_splitsize);
+    gravity_cache_init(&e->runners[k].cj_gravity_cache, space_splitsize);
+#ifdef WITH_VECTORIZATION
     e->runners[k].ci_cache.count = 0;
     e->runners[k].cj_cache.count = 0;
     cache_init(&e->runners[k].ci_cache, CACHE_SIZE);
@@ -4407,9 +4581,7 @@ void engine_init(struct engine *e, struct space *s,
 #endif
 
   /* Wait for the runner threads to be in place. */
-  while (e->barrier_running || e->barrier_launch)
-    if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0)
-      error("Error while waiting for runner threads to get in place.");
+  pthread_barrier_wait(&e->wait_barrier);
 }
 
 /**
@@ -4423,7 +4595,7 @@ void engine_print_policy(struct engine *e) {
   if (e->nodeID == 0) {
     printf("[0000] %s engine_policy: engine policies are [ ",
            clocks_get_timesincestart());
-    for (int k = 1; k < 32; k++)
+    for (int k = 0; k <= engine_maxpolicy; k++)
       if (e->policy & (1 << k)) printf(" %s ", engine_policy_names[k + 1]);
     printf(" ]\n");
     fflush(stdout);
@@ -4431,7 +4603,7 @@ void engine_print_policy(struct engine *e) {
 #else
   printf("%s engine_policy: engine policies are [ ",
          clocks_get_timesincestart());
-  for (int k = 1; k < 31; k++)
+  for (int k = 0; k <= engine_maxpolicy; k++)
     if (e->policy & (1 << k)) printf(" %s ", engine_policy_names[k + 1]);
   printf(" ]\n");
   fflush(stdout);
@@ -4474,8 +4646,12 @@ void engine_compute_next_snapshot_time(struct engine *e) {
 void engine_clean(struct engine *e) {
 
 #ifdef WITH_VECTORIZATION
-  for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].ci_cache);
-  for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].cj_cache);
+  for (int i = 0; i < e->nr_threads; ++i) {
+    cache_clean(&e->runners[i].ci_cache);
+    cache_clean(&e->runners[i].cj_cache);
+    gravity_cache_clean(&e->runners[i].ci_gravity_cache);
+    gravity_cache_clean(&e->runners[i].cj_gravity_cache);
+  }
 #endif
   free(e->runners);
   free(e->snapshotUnits);
diff --git a/src/engine.h b/src/engine.h
index e62b12332d3ac1b985b8f6d7181ea66824ec4f13..47a30a99b696304365a0ddf31d4499628a649a37 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -71,17 +71,18 @@ enum engine_policy {
   engine_policy_sourceterms = (1 << 14),
   engine_policy_stars = (1 << 15)
 };
-
+#define engine_maxpolicy 15
 extern const char *engine_policy_names[];
 
 #define engine_queue_scale 1.2
 #define engine_maxtaskspercell 96
 #define engine_maxproxies 64
-#define engine_tasksreweight 10
+#define engine_tasksreweight 1
 #define engine_parts_size_grow 1.05
 #define engine_redistribute_alloc_margin 1.2
 #define engine_default_energy_file_name "energy"
 #define engine_default_timesteps_file_name "timesteps"
+#define engine_max_parts_per_ghost 1000
 
 /* The rank of the engine as a global variable (for messages). */
 extern int engine_rank;
@@ -156,7 +157,7 @@ struct engine {
   double timeFirstSnapshot;
   double deltaTimeSnapshot;
   integertime_t ti_nextSnapshot;
-  char snapshotBaseName[200];
+  char snapshotBaseName[PARSER_MAX_LINE_SIZE];
   int snapshotCompression;
   struct unit_system *snapshotUnits;
 
@@ -175,9 +176,8 @@ struct engine {
   int count_step;
 
   /* Data for the threads' barrier. */
-  pthread_mutex_t barrier_mutex;
-  pthread_cond_t barrier_cond;
-  volatile int barrier_running, barrier_launch, barrier_launchcount;
+  pthread_barrier_t wait_barrier;
+  pthread_barrier_t run_barrier;
 
   /* ID of the node this engine lives on. */
   int nr_nodes, nodeID;
@@ -252,7 +252,7 @@ struct engine {
 };
 
 /* Function prototypes. */
-void engine_barrier(struct engine *e, int tid);
+void engine_barrier(struct engine *e);
 void engine_compute_next_snapshot_time(struct engine *e);
 void engine_unskip(struct engine *e);
 void engine_drift_all(struct engine *e);
@@ -270,9 +270,10 @@ void engine_init(struct engine *e, struct space *s,
                  const struct external_potential *potential,
                  const struct cooling_function_data *cooling_func,
                  struct sourceterms *sourceterms);
-void engine_launch(struct engine *e, int nr_runners);
+void engine_launch(struct engine *e);
 void engine_prepare(struct engine *e);
-void engine_init_particles(struct engine *e, int flag_entropy_ICs);
+void engine_init_particles(struct engine *e, int flag_entropy_ICs,
+                           int clean_h_values);
 void engine_step(struct engine *e);
 void engine_maketasks(struct engine *e);
 void engine_split(struct engine *e, struct partition *initial_partition);
@@ -281,7 +282,7 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
                             int *ind_gpart, size_t *Ngpart,
                             size_t offset_sparts, int *ind_spart,
                             size_t *Nspart);
-void engine_rebuild(struct engine *e);
+void engine_rebuild(struct engine *e, int clean_h_values);
 void engine_repartition(struct engine *e);
 void engine_repartition_trigger(struct engine *e);
 void engine_makeproxies(struct engine *e);
diff --git a/src/gravity.c b/src/gravity.c
index 97b2955b32e1513c3d86d1d1f4da2169130feb77..f58bc1b7456bc5dfc95b4c976ebda8e1999ff3e0 100644
--- a/src/gravity.c
+++ b/src/gravity.c
@@ -21,9 +21,15 @@
 #include "../config.h"
 
 /* Some standard headers. */
+#include <float.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <unistd.h>
 
+#ifdef HAVE_HDF5
+#include <hdf5.h>
+#endif
+
 /* This object's header. */
 #include "gravity.h"
 
@@ -39,6 +45,256 @@ struct exact_force_data {
   double const_G;
 };
 
+#ifdef SWIFT_GRAVITY_FORCE_CHECKS
+
+/* Size of the Ewald table */
+#define Newald 64
+
+/* Components of the Ewald correction */
+static float fewald_x[Newald + 1][Newald + 1][Newald + 1];
+static float fewald_y[Newald + 1][Newald + 1][Newald + 1];
+static float fewald_z[Newald + 1][Newald + 1][Newald + 1];
+
+/* Factor used to normalize the access to the Ewald table */
+float ewald_fac;
+#endif
+
+/**
+ * @brief Allocates the memory and computes one octant of the
+ * Ewald correction table.
+ *
+ * We follow Hernquist, Bouchet & Suto, 1991, ApJS, Volume 75, p.231-240,
+ * equations (2.14a) and (2.14b) with alpha = 2. We consider all terms with
+ * |x - nL| < 4L and |h|^2 < 16.
+ *
+ * @param boxSize The side-length (L) of the volume.
+ */
+void gravity_exact_force_ewald_init(double boxSize) {
+
+#ifdef SWIFT_GRAVITY_FORCE_CHECKS
+  const ticks tic = getticks();
+  message("Computing Ewald correction table...");
+
+  /* Level of correction  (Hernquist et al. 1991)*/
+  const float alpha = 2.f;
+
+  /* some useful constants */
+  const float alpha2 = alpha * alpha;
+  const float factor_exp1 = 2.f * alpha / sqrt(M_PI);
+  const float factor_exp2 = -M_PI * M_PI / alpha2;
+  const float factor_sin = 2.f * M_PI;
+  const float boxSize_inv2 = 1.f / (boxSize * boxSize);
+
+  /* Ewald factor to access the table */
+  ewald_fac = (double)(2 * Newald) / boxSize;
+
+  /* Zero everything */
+  bzero(fewald_x, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float));
+  bzero(fewald_y, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float));
+  bzero(fewald_z, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float));
+
+  /* Compute the values in one of the octants */
+  for (int i = 0; i <= Newald; ++i) {
+    for (int j = 0; j <= Newald; ++j) {
+      for (int k = 0; k <= Newald; ++k) {
+
+        if (i == 0 && j == 0 && k == 0) continue;
+
+        /* Distance vector */
+        const float r_x = 0.5f * ((float)i) / Newald;
+        const float r_y = 0.5f * ((float)j) / Newald;
+        const float r_z = 0.5f * ((float)k) / Newald;
+
+        /* Norm of distance vector */
+        const float r2 = r_x * r_x + r_y * r_y + r_z * r_z;
+        const float r_inv = 1.f / sqrtf(r2);
+        const float r_inv3 = r_inv * r_inv * r_inv;
+
+        /* Normal gravity potential term */
+        float f_x = r_x * r_inv3;
+        float f_y = r_y * r_inv3;
+        float f_z = r_z * r_inv3;
+
+        for (int n_i = -4; n_i <= 4; ++n_i) {
+          for (int n_j = -4; n_j <= 4; ++n_j) {
+            for (int n_k = -4; n_k <= 4; ++n_k) {
+
+              const float d_x = r_x - n_i;
+              const float d_y = r_y - n_j;
+              const float d_z = r_z - n_k;
+
+              /* Discretised distance */
+              const float r_tilde2 = d_x * d_x + d_y * d_y + d_z * d_z;
+              const float r_tilde_inv = 1.f / sqrtf(r_tilde2);
+              const float r_tilde = r_tilde_inv * r_tilde2;
+              const float r_tilde_inv3 =
+                  r_tilde_inv * r_tilde_inv * r_tilde_inv;
+
+              const float val =
+                  erfcf(alpha * r_tilde) +
+                  factor_exp1 * r_tilde * expf(-alpha2 * r_tilde2);
+
+              /* First correction term */
+              const float f = val * r_tilde_inv3;
+              f_x -= f * d_x;
+              f_y -= f * d_y;
+              f_z -= f * d_z;
+            }
+          }
+        }
+
+        for (int h_i = -4; h_i <= 4; ++h_i) {
+          for (int h_j = -4; h_j <= 4; ++h_j) {
+            for (int h_k = -4; h_k <= 4; ++h_k) {
+
+              const float h2 = h_i * h_i + h_j * h_j + h_k * h_k;
+
+              const float h2_inv = 1.f / (h2 + FLT_MIN);
+              const float h_dot_x = h_i * r_x + h_j * r_y + h_k * r_z;
+
+              const float val = 2.f * h2_inv * expf(h2 * factor_exp2) *
+                                sinf(factor_sin * h_dot_x);
+
+              /* Second correction term */
+              f_x -= val * h_i;
+              f_y -= val * h_j;
+              f_z -= val * h_k;
+            }
+          }
+        }
+
+        /* Save back to memory */
+        fewald_x[i][j][k] = f_x;
+        fewald_y[i][j][k] = f_y;
+        fewald_z[i][j][k] = f_z;
+      }
+    }
+  }
+
+/* Dump the Ewald table to a file */
+#ifdef HAVE_HDF5
+  hid_t h_file =
+      H5Fcreate("Ewald.hdf5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+  if (h_file < 0) error("Error while opening file for Ewald dump.");
+
+  /* Create dataspace */
+  hsize_t dim[3] = {Newald + 1, Newald + 1, Newald + 1};
+  hid_t h_space = H5Screate_simple(3, dim, NULL);
+  hid_t h_data;
+  h_data = H5Dcreate(h_file, "Ewald_x", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT,
+                     H5P_DEFAULT, H5P_DEFAULT);
+  H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT,
+           &(fewald_x[0][0][0]));
+  H5Dclose(h_data);
+  h_data = H5Dcreate(h_file, "Ewald_y", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT,
+                     H5P_DEFAULT, H5P_DEFAULT);
+  H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT,
+           &(fewald_y[0][0][0]));
+  H5Dclose(h_data);
+  h_data = H5Dcreate(h_file, "Ewald_z", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT,
+                     H5P_DEFAULT, H5P_DEFAULT);
+  H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT,
+           &(fewald_z[0][0][0]));
+  H5Dclose(h_data);
+  H5Sclose(h_space);
+  H5Fclose(h_file);
+#endif
+
+  /* Apply the box-size correction */
+  for (int i = 0; i <= Newald; ++i) {
+    for (int j = 0; j <= Newald; ++j) {
+      for (int k = 0; k <= Newald; ++k) {
+        fewald_x[i][j][k] *= boxSize_inv2;
+        fewald_y[i][j][k] *= boxSize_inv2;
+        fewald_z[i][j][k] *= boxSize_inv2;
+      }
+    }
+  }
+
+  /* Say goodbye */
+  message("Ewald correction table computed (took %.3f %s). ",
+          clocks_from_ticks(getticks() - tic), clocks_getunit());
+#else
+  error("Gravity checking function called without the corresponding flag.");
+#endif
+}
+
+#ifdef SWIFT_GRAVITY_FORCE_CHECKS
+/**
+ * @brief Compute the Ewald correction for a given distance vector r.
+ *
+ * We interpolate the Ewald correction tables using a tri-linear interpolation
+ * similar to a CIC.
+ *
+ * @param rx x-coordinate of distance vector.
+ * @param ry y-coordinate of distance vector.
+ * @param rz z-coordinate of distance vector.
+ * @param corr (return) The Ewald correction.
+ */
+__attribute__((always_inline)) INLINE static void
+gravity_exact_force_ewald_evaluate(double rx, double ry, double rz,
+                                   double corr[3]) {
+
+  const double s_x = (rx < 0.) ? 1. : -1.;
+  const double s_y = (ry < 0.) ? 1. : -1.;
+  const double s_z = (rz < 0.) ? 1. : -1.;
+  rx = fabs(rx);
+  ry = fabs(ry);
+  rz = fabs(rz);
+
+  int i = (int)(rx * ewald_fac);
+  if (i >= Newald) i = Newald - 1;
+  const double dx = rx * ewald_fac - i;
+  const double tx = 1. - dx;
+
+  int j = (int)(ry * ewald_fac);
+  if (j >= Newald) j = Newald - 1;
+  const double dy = ry * ewald_fac - j;
+  const double ty = 1. - dy;
+
+  int k = (int)(rz * ewald_fac);
+  if (k >= Newald) k = Newald - 1;
+  const double dz = rz * ewald_fac - k;
+  const double tz = 1. - dz;
+
+  /* Interpolation in X */
+  corr[0] = 0.;
+  corr[0] += fewald_x[i + 0][j + 0][k + 0] * tx * ty * tz;
+  corr[0] += fewald_x[i + 0][j + 0][k + 1] * tx * ty * dz;
+  corr[0] += fewald_x[i + 0][j + 1][k + 0] * tx * dy * tz;
+  corr[0] += fewald_x[i + 0][j + 1][k + 1] * tx * dy * dz;
+  corr[0] += fewald_x[i + 1][j + 0][k + 0] * dx * ty * tz;
+  corr[0] += fewald_x[i + 1][j + 0][k + 1] * dx * ty * dz;
+  corr[0] += fewald_x[i + 1][j + 1][k + 0] * dx * dy * tz;
+  corr[0] += fewald_x[i + 1][j + 1][k + 1] * dx * dy * dz;
+  corr[0] *= s_x;
+
+  /* Interpolation in Y */
+  corr[1] = 0.;
+  corr[1] += fewald_y[i + 0][j + 0][k + 0] * tx * ty * tz;
+  corr[1] += fewald_y[i + 0][j + 0][k + 1] * tx * ty * dz;
+  corr[1] += fewald_y[i + 0][j + 1][k + 0] * tx * dy * tz;
+  corr[1] += fewald_y[i + 0][j + 1][k + 1] * tx * dy * dz;
+  corr[1] += fewald_y[i + 1][j + 0][k + 0] * dx * ty * tz;
+  corr[1] += fewald_y[i + 1][j + 0][k + 1] * dx * ty * dz;
+  corr[1] += fewald_y[i + 1][j + 1][k + 0] * dx * dy * tz;
+  corr[1] += fewald_y[i + 1][j + 1][k + 1] * dx * dy * dz;
+  corr[1] *= s_y;
+
+  /* Interpolation in Z */
+  corr[2] = 0.;
+  corr[2] += fewald_z[i + 0][j + 0][k + 0] * tx * ty * tz;
+  corr[2] += fewald_z[i + 0][j + 0][k + 1] * tx * ty * dz;
+  corr[2] += fewald_z[i + 0][j + 1][k + 0] * tx * dy * tz;
+  corr[2] += fewald_z[i + 0][j + 1][k + 1] * tx * dy * dz;
+  corr[2] += fewald_z[i + 1][j + 0][k + 0] * dx * ty * tz;
+  corr[2] += fewald_z[i + 1][j + 0][k + 1] * dx * ty * dz;
+  corr[2] += fewald_z[i + 1][j + 1][k + 0] * dx * dy * tz;
+  corr[2] += fewald_z[i + 1][j + 1][k + 1] * dx * dy * dz;
+  corr[2] *= s_z;
+}
+#endif
+
 /**
  * @brief Checks whether the file containing the exact accelerations for
  * the current choice of parameters already exists.
@@ -63,7 +319,7 @@ int gravity_exact_force_file_exits(const struct engine *e) {
     char line[100];
     char dummy1[10], dummy2[10];
     double epsilon, newton_G;
-    int N;
+    int N, periodic;
     /* Reads file header */
     if (fgets(line, 100, file) != line) error("Problem reading title");
     if (fgets(line, 100, file) != line) error("Problem reading G");
@@ -72,10 +328,12 @@ int gravity_exact_force_file_exits(const struct engine *e) {
     sscanf(line, "%s %s %d", dummy1, dummy2, &N);
     if (fgets(line, 100, file) != line) error("Problem reading epsilon");
     sscanf(line, "%s %s %le", dummy1, dummy2, &epsilon);
+    if (fgets(line, 100, file) != line) error("Problem reading BC");
+    sscanf(line, "%s %s %d", dummy1, dummy2, &periodic);
     fclose(file);
 
     /* Check whether it matches the current parameters */
-    if (N == SWIFT_GRAVITY_FORCE_CHECKS &&
+    if (N == SWIFT_GRAVITY_FORCE_CHECKS && periodic == e->s->periodic &&
         (fabs(epsilon - e->gravity_properties->epsilon) / epsilon < 1e-5) &&
         (fabs(newton_G - e->physical_constants->const_newton_G) / newton_G <
          1e-5)) {
@@ -101,6 +359,8 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts,
   struct exact_force_data *data = (struct exact_force_data *)extra_data;
   const struct space *s = data->s;
   const struct engine *e = data->e;
+  const int periodic = s->periodic;
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
   const double const_G = data->const_G;
   int counter = 0;
 
@@ -112,6 +372,12 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts,
     if (gpi->id_or_neg_offset % SWIFT_GRAVITY_FORCE_CHECKS == 0 &&
         gpart_is_active(gpi, e)) {
 
+      /* Get some information about the particle */
+      const double pix[3] = {gpi->x[0], gpi->x[1], gpi->x[2]};
+      const double hi = gpi->epsilon;
+      const double hi_inv = 1. / hi;
+      const double hi_inv3 = hi_inv * hi_inv * hi_inv;
+
       /* Be ready for the calculation */
       double a_grav[3] = {0., 0., 0.};
 
@@ -124,43 +390,53 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts,
         if (gpi == gpj) continue;
 
         /* Compute the pairwise distance. */
-        const double dx[3] = {gpi->x[0] - gpj->x[0],   // x
-                              gpi->x[1] - gpj->x[1],   // y
-                              gpi->x[2] - gpj->x[2]};  // z
-        const double r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
+        double dx = gpj->x[0] - pix[0];
+        double dy = gpj->x[1] - pix[1];
+        double dz = gpj->x[2] - pix[2];
+
+        /* Now apply periodic BC */
+        if (periodic) {
+          dx = nearest(dx, dim[0]);
+          dy = nearest(dy, dim[1]);
+          dz = nearest(dz, dim[2]);
+        }
 
-        const double r = sqrt(r2);
-        const double ir = 1. / r;
+        const double r2 = dx * dx + dy * dy + dz * dz;
+        const double r_inv = 1. / sqrt(r2);
+        const double r = r2 * r_inv;
         const double mj = gpj->mass;
-        const double hi = gpi->epsilon;
         double f;
-        const double f_lr = 1.;
 
         if (r >= hi) {
 
           /* Get Newtonian gravity */
-          f = mj * ir * ir * ir * f_lr;
+          f = mj * r_inv * r_inv * r_inv;
 
         } else {
 
-          const double hi_inv = 1. / hi;
-          const double hi_inv3 = hi_inv * hi_inv * hi_inv;
           const double ui = r * hi_inv;
           double W;
 
           kernel_grav_eval_double(ui, &W);
 
           /* Get softened gravity */
-          f = mj * hi_inv3 * W * f_lr;
-
-          // printf("r=%e hi=%e W=%e fac=%e\n", r, hi, W, f);
+          f = mj * hi_inv3 * W;
         }
 
-        const double fdx[3] = {f * dx[0], f * dx[1], f * dx[2]};
+        a_grav[0] += f * dx;
+        a_grav[1] += f * dy;
+        a_grav[2] += f * dz;
 
-        a_grav[0] -= fdx[0];
-        a_grav[1] -= fdx[1];
-        a_grav[2] -= fdx[2];
+        /* Apply Ewald correction for periodic BC */
+        if (periodic && r > 1e-5 * hi) {
+
+          double corr[3];
+          gravity_exact_force_ewald_evaluate(dx, dy, dz, corr);
+
+          a_grav[0] += mj * corr[0];
+          a_grav[1] += mj * corr[1];
+          a_grav[2] += mj * corr[2];
+        }
       }
 
       /* Store the exact answer */
@@ -207,7 +483,7 @@ void gravity_exact_force_compute(struct space *s, const struct engine *e) {
   data.const_G = e->physical_constants->const_newton_G;
 
   threadpool_map(&s->e->threadpool, gravity_exact_force_compute_mapper,
-                 s->gparts, s->nr_gparts, sizeof(struct gpart), 1000, &data);
+                 s->gparts, s->nr_gparts, sizeof(struct gpart), 0, &data);
 
   message("Computed exact gravity for %d gparts (took %.3f %s). ",
           data.counter_global, clocks_from_ticks(getticks() - tic),
@@ -245,8 +521,9 @@ void gravity_exact_force_check(struct space *s, const struct engine *e,
   fprintf(file_swift, "# Gravity accuracy test - SWIFT FORCES\n");
   fprintf(file_swift, "# G= %16.8e\n", e->physical_constants->const_newton_G);
   fprintf(file_swift, "# N= %d\n", SWIFT_GRAVITY_FORCE_CHECKS);
-  fprintf(file_swift, "# epsilon=%16.8e\n", e->gravity_properties->epsilon);
-  fprintf(file_swift, "# theta=%16.8e\n", e->gravity_properties->theta_crit);
+  fprintf(file_swift, "# epsilon= %16.8e\n", e->gravity_properties->epsilon);
+  fprintf(file_swift, "# periodic= %d\n", s->periodic);
+  fprintf(file_swift, "# theta= %16.8e\n", e->gravity_properties->theta_crit);
   fprintf(file_swift, "# Git Branch: %s\n", git_branch());
   fprintf(file_swift, "# Git Revision: %s\n", git_revision());
   fprintf(file_swift, "# %16s %16s %16s %16s %16s %16s %16s\n", "id", "pos[0]",
diff --git a/src/gravity.h b/src/gravity.h
index 00b930c00fb2558f274feb2991b78e96dc8b990b..85e42370bc456dceb577c42ee609e3f0724e14ea 100644
--- a/src/gravity.h
+++ b/src/gravity.h
@@ -34,6 +34,8 @@
 #include "./gravity/Default/gravity.h"
 #include "./gravity/Default/gravity_iact.h"
 
+void gravity_exact_force_ewald_init(double boxSize);
+void gravity_exact_force_ewald_free();
 void gravity_exact_force_compute(struct space *s, const struct engine *e);
 void gravity_exact_force_check(struct space *s, const struct engine *e,
                                float rel_tol);
diff --git a/src/gravity/Default/gravity_iact.h b/src/gravity/Default/gravity_iact.h
index eca5c2491cbdcf5f0eca01417c8e6b29efc53459..d4a95540de17631ad445075d672d03a1236e34e3 100644
--- a/src/gravity/Default/gravity_iact.h
+++ b/src/gravity/Default/gravity_iact.h
@@ -28,11 +28,11 @@
 #include "vector.h"
 
 /**
- * @brief Gravity forces between particles
+ * @brief Gravity forces between particles truncated by the long-range kernel
  */
-__attribute__((always_inline)) INLINE static void runner_iact_grav_pp(
-    float rlr_inv, float r2, const float *dx, struct gpart *gpi,
-    struct gpart *gpj) {
+__attribute__((always_inline)) INLINE static void runner_iact_grav_pp_truncated(
+    float r2, const float *dx, struct gpart *gpi, struct gpart *gpj,
+    float rlr_inv) {
 
   /* Apply the gravitational acceleration. */
   const float r = sqrtf(r2);
@@ -41,7 +41,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pp(
   const float mj = gpj->mass;
   const float hi = gpi->epsilon;
   const float hj = gpj->epsilon;
-  const float u = r * rlr_inv;
+  const float u_lr = r * rlr_inv;
   float f_lr, fi, fj, W;
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -49,7 +49,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pp(
 #endif
 
   /* Get long-range correction */
-  kernel_long_grav_eval(u, &f_lr);
+  kernel_long_grav_eval(u_lr, &f_lr);
 
   if (r >= hi) {
 
@@ -97,18 +97,84 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pp(
 }
 
 /**
- * @brief Gravity forces between particles (non-symmetric version)
+ * @brief Gravity forces between particles
  */
-__attribute__((always_inline)) INLINE static void runner_iact_grav_pp_nonsym(
-    float rlr_inv, float r2, const float *dx, struct gpart *gpi,
-    const struct gpart *gpj) {
+__attribute__((always_inline)) INLINE static void runner_iact_grav_pp(
+    float r2, const float *dx, struct gpart *gpi, struct gpart *gpj) {
+
+  /* Apply the gravitational acceleration. */
+  const float r = sqrtf(r2);
+  const float ir = 1.f / r;
+  const float mi = gpi->mass;
+  const float mj = gpj->mass;
+  const float hi = gpi->epsilon;
+  const float hj = gpj->epsilon;
+  float fi, fj, W;
+
+#ifdef SWIFT_DEBUG_CHECKS
+  if (r == 0.f) error("Interacting particles with 0 distance");
+#endif
+
+  if (r >= hi) {
+
+    /* Get Newtonian gravity */
+    fi = mj * ir * ir * ir;
+
+  } else {
+
+    const float hi_inv = 1.f / hi;
+    const float hi_inv3 = hi_inv * hi_inv * hi_inv;
+    const float ui = r * hi_inv;
+
+    kernel_grav_eval(ui, &W);
+
+    /* Get softened gravity */
+    fi = mj * hi_inv3 * W;
+  }
+
+  if (r >= hj) {
+
+    /* Get Newtonian gravity */
+    fj = mi * ir * ir * ir;
+
+  } else {
+
+    const float hj_inv = 1.f / hj;
+    const float hj_inv3 = hj_inv * hj_inv * hj_inv;
+    const float uj = r * hj_inv;
+
+    kernel_grav_eval(uj, &W);
+
+    /* Get softened gravity */
+    fj = mi * hj_inv3 * W;
+  }
+
+  const float fidx[3] = {fi * dx[0], fi * dx[1], fi * dx[2]};
+  gpi->a_grav[0] -= fidx[0];
+  gpi->a_grav[1] -= fidx[1];
+  gpi->a_grav[2] -= fidx[2];
+
+  const float fjdx[3] = {fj * dx[0], fj * dx[1], fj * dx[2]};
+  gpj->a_grav[0] += fjdx[0];
+  gpj->a_grav[1] += fjdx[1];
+  gpj->a_grav[2] += fjdx[2];
+}
+
+/**
+ * @brief Gravity forces between particles truncated by the long-range kernel
+ * (non-symmetric version)
+ */
+__attribute__((always_inline)) INLINE static void
+runner_iact_grav_pp_truncated_nonsym(float r2, const float *dx,
+                                     struct gpart *gpi, const struct gpart *gpj,
+                                     float rlr_inv) {
 
   /* Apply the gravitational acceleration. */
   const float r = sqrtf(r2);
   const float ir = 1.f / r;
   const float mj = gpj->mass;
   const float hi = gpi->epsilon;
-  const float u = r * rlr_inv;
+  const float u_lr = r * rlr_inv;
   float f_lr, f, W;
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -116,7 +182,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pp_nonsym(
 #endif
 
   /* Get long-range correction */
-  kernel_long_grav_eval(u, &f_lr);
+  kernel_long_grav_eval(u_lr, &f_lr);
 
   if (r >= hi) {
 
@@ -143,13 +209,44 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pp_nonsym(
 }
 
 /**
- * @brief Gravity forces between particle and multipole
+ * @brief Gravity forces between particles (non-symmetric version)
  */
-__attribute__((always_inline)) INLINE static void runner_iact_grav_pm(
-    float rlr_inv, float r2, const float *dx, struct gpart *gp,
-    const struct multipole *multi) {
+__attribute__((always_inline)) INLINE static void runner_iact_grav_pp_nonsym(
+    float r2, const float *dx, struct gpart *gpi, const struct gpart *gpj) {
+
+  /* Apply the gravitational acceleration. */
+  const float r = sqrtf(r2);
+  const float ir = 1.f / r;
+  const float mj = gpj->mass;
+  const float hi = gpi->epsilon;
+  float f, W;
 
-  error("Dead function");
+#ifdef SWIFT_DEBUG_CHECKS
+  if (r == 0.f) error("Interacting particles with 0 distance");
+#endif
+
+  if (r >= hi) {
+
+    /* Get Newtonian gravity */
+    f = mj * ir * ir * ir;
+
+  } else {
+
+    const float hi_inv = 1.f / hi;
+    const float hi_inv3 = hi_inv * hi_inv * hi_inv;
+    const float ui = r * hi_inv;
+
+    kernel_grav_eval(ui, &W);
+
+    /* Get softened gravity */
+    f = mj * hi_inv3 * W;
+  }
+
+  const float fdx[3] = {f * dx[0], f * dx[1], f * dx[2]};
+
+  gpi->a_grav[0] -= fdx[0];
+  gpi->a_grav[1] -= fdx[1];
+  gpi->a_grav[2] -= fdx[2];
 }
 
 #endif /* SWIFT_DEFAULT_GRAVITY_IACT_H */
diff --git a/src/gravity_cache.h b/src/gravity_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..14b672233aa9958ec39af32a87baead98c0bae04
--- /dev/null
+++ b/src/gravity_cache.h
@@ -0,0 +1,247 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_GRAVITY_CACHE_H
+#define SWIFT_GRAVITY_CACHE_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Local headers */
+#include "align.h"
+#include "error.h"
+#include "gravity.h"
+#include "vector.h"
+
+/**
+ * @brief A SoA object for the #gpart of a cell.
+ *
+ * This is used to help vectorize the leaf-leaf gravity interactions.
+ */
+struct gravity_cache {
+
+  /*! #gpart x position. */
+  float *restrict x SWIFT_CACHE_ALIGN;
+
+  /*! #gpart y position. */
+  float *restrict y SWIFT_CACHE_ALIGN;
+
+  /*! #gpart z position. */
+  float *restrict z SWIFT_CACHE_ALIGN;
+
+  /*! #gpart softening length. */
+  float *restrict epsilon SWIFT_CACHE_ALIGN;
+
+  /*! #gpart mass. */
+  float *restrict m SWIFT_CACHE_ALIGN;
+
+  /*! #gpart x acceleration. */
+  float *restrict a_x SWIFT_CACHE_ALIGN;
+
+  /*! #gpart y acceleration. */
+  float *restrict a_y SWIFT_CACHE_ALIGN;
+
+  /*! #gpart z acceleration. */
+  float *restrict a_z SWIFT_CACHE_ALIGN;
+
+  /*! Cache size */
+  int count;
+};
+
+/**
+ * @brief Frees the memory allocated in a #gravity_cache
+ *
+ * @param c The #gravity_cache to free.
+ */
+static INLINE void gravity_cache_clean(struct gravity_cache *c) {
+
+  if (c->count > 0) {
+    free(c->x);
+    free(c->y);
+    free(c->z);
+    free(c->epsilon);
+    free(c->m);
+    free(c->a_x);
+    free(c->a_y);
+    free(c->a_z);
+  }
+  c->count = 0;
+}
+
+/**
+ * @brief Allocates memory for the #gpart caches used in the leaf-leaf
+ * interactions.
+ *
+ * The cache is padded for the vector size and aligned properly
+ *
+ * @param c The #gravity_cache to allocate.
+ * @param count The number of #gpart to allocate for (space_splitsize is a good
+ * choice).
+ */
+static INLINE void gravity_cache_init(struct gravity_cache *c, int count) {
+
+  /* Size of the gravity cache */
+  const int padded_count = count - (count % VEC_SIZE) + VEC_SIZE;
+  const size_t sizeBytes = padded_count * sizeof(float);
+
+  /* Delete old stuff if any */
+  gravity_cache_clean(c);
+
+  int error = 0;
+  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error +=
+      posix_memalign((void **)&c->epsilon, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->a_x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->a_y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->a_z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+
+  if (error != 0)
+    error("Couldn't allocate gravity cache, size: %d", padded_count);
+
+  c->count = padded_count;
+}
+
+/**
+ * @brief Fills a #gravity_cache structure with some #gpart and shifts them.
+ *
+ * @param c The #gravity_cache to fill.
+ * @param gparts The #gpart array to read from.
+ * @param gcount The number of particles to read.
+ * @param gcount_padded The number of particles to read, padded to the next
+ * multiple of the vector length.
+ * @param shift A shift to apply to all the particles.
+ */
+__attribute__((always_inline)) INLINE void gravity_cache_populate(
+    struct gravity_cache *c, const struct gpart *restrict gparts, int gcount,
+    int gcount_padded, const double shift[3]) {
+
+  /* Make the compiler understand we are in happy vectorization land */
+  float *restrict x = c->x;
+  float *restrict y = c->y;
+  float *restrict z = c->z;
+  float *restrict m = c->m;
+  float *restrict epsilon = c->epsilon;
+  swift_align_information(x, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(y, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(z, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(epsilon, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(m, SWIFT_CACHE_ALIGNMENT);
+  swift_assume_size(gcount_padded, VEC_SIZE);
+
+  /* Fill the input caches */
+  for (int i = 0; i < gcount; ++i) {
+    x[i] = (float)(gparts[i].x[0] - shift[0]);
+    y[i] = (float)(gparts[i].x[1] - shift[1]);
+    z[i] = (float)(gparts[i].x[2] - shift[2]);
+    epsilon[i] = gparts[i].epsilon;
+    m[i] = gparts[i].mass;
+  }
+
+#ifdef SWIFT_DEBUG_CHECKS
+  if (gcount_padded < gcount) error("Padded counter smaller than counter");
+#endif
+
+  /* Pad the caches */
+  for (int i = gcount; i < gcount_padded; ++i) {
+    x[i] = 0.f;
+    y[i] = 0.f;
+    z[i] = 0.f;
+    epsilon[i] = 0.f;
+    m[i] = 0.f;
+  }
+}
+
+/**
+ * @brief Fills a #gravity_cache structure with some #gpart.
+ *
+ * @param c The #gravity_cache to fill.
+ * @param gparts The #gpart array to read from.
+ * @param gcount The number of particles to read.
+ * @param gcount_padded The number of particles to read, padded to the next
+ * multiple of the vector length.
+ */
+__attribute__((always_inline)) INLINE void gravity_cache_populate_no_shift(
+    struct gravity_cache *c, const struct gpart *restrict gparts, int gcount,
+    int gcount_padded) {
+
+  /* Make the compiler understand we are in happy vectorization land */
+  float *restrict x = c->x;
+  float *restrict y = c->y;
+  float *restrict z = c->z;
+  float *restrict m = c->m;
+  float *restrict epsilon = c->epsilon;
+  swift_align_information(x, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(y, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(z, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(epsilon, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(m, SWIFT_CACHE_ALIGNMENT);
+  swift_assume_size(gcount_padded, VEC_SIZE);
+
+  /* Fill the input caches */
+  for (int i = 0; i < gcount; ++i) {
+    x[i] = (float)(gparts[i].x[0]);
+    y[i] = (float)(gparts[i].x[1]);
+    z[i] = (float)(gparts[i].x[2]);
+    epsilon[i] = gparts[i].epsilon;
+    m[i] = gparts[i].mass;
+  }
+
+#ifdef SWIFT_DEBUG_CHECKS
+  if (gcount_padded < gcount) error("Padded counter smaller than counter");
+#endif
+
+  /* Pad the caches */
+  for (int i = gcount; i < gcount_padded; ++i) {
+    x[i] = 0.f;
+    y[i] = 0.f;
+    z[i] = 0.f;
+    epsilon[i] = 0.f;
+    m[i] = 0.f;
+  }
+}
+
+/**
+ * @brief Write the output cache values back to the #gpart.
+ *
+ * @param c The #gravity_cache to read from.
+ * @param gparts The #gpart array to write to.
+ * @param gcount The number of particles to write.
+ */
+__attribute__((always_inline)) INLINE void gravity_cache_write_back(
+    const struct gravity_cache *c, struct gpart *restrict gparts, int gcount) {
+
+  /* Make the compiler understand we are in happy vectorization land */
+  float *restrict a_x = c->a_x;
+  float *restrict a_y = c->a_y;
+  float *restrict a_z = c->a_z;
+  swift_align_information(a_x, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(a_y, SWIFT_CACHE_ALIGNMENT);
+  swift_align_information(a_z, SWIFT_CACHE_ALIGNMENT);
+
+  /* Write stuff back to the particles */
+  for (int i = 0; i < gcount; ++i) {
+    gparts[i].a_grav[0] += a_x[i];
+    gparts[i].a_grav[1] += a_y[i];
+    gparts[i].a_grav[2] += a_z[i];
+  }
+}
+
+#endif /* SWIFT_GRAVITY_CACHE_H */
diff --git a/src/gravity_properties.c b/src/gravity_properties.c
index b1098888b96cdef2205ed513e60a3799c63e8b9f..18cf044434f7840a5a76f483540bb924a2365e26 100644
--- a/src/gravity_properties.c
+++ b/src/gravity_properties.c
@@ -33,7 +33,8 @@
 #include "kernel_gravity.h"
 
 #define gravity_props_default_a_smooth 1.25f
-#define gravity_props_default_r_cut 4.5f
+#define gravity_props_default_r_cut_max 4.5f
+#define gravity_props_default_r_cut_min 0.1f
 
 void gravity_props_init(struct gravity_props *p,
                         const struct swift_params *params) {
@@ -41,8 +42,10 @@ void gravity_props_init(struct gravity_props *p,
   /* Tree-PM parameters */
   p->a_smooth = parser_get_opt_param_float(params, "Gravity:a_smooth",
                                            gravity_props_default_a_smooth);
-  p->r_cut = parser_get_opt_param_float(params, "Gravity:r_cut",
-                                        gravity_props_default_r_cut);
+  p->r_cut_max = parser_get_opt_param_float(params, "Gravity:r_cut_max",
+                                            gravity_props_default_r_cut_max);
+  p->r_cut_min = parser_get_opt_param_float(params, "Gravity:r_cut_min",
+                                            gravity_props_default_r_cut_min);
 
   /* Time integration */
   p->eta = parser_get_param_float(params, "Gravity:eta");
@@ -69,9 +72,10 @@ void gravity_props_print(const struct gravity_props *p) {
   message("Self-gravity softening:    epsilon=%.4f (Plummer equivalent: %.4f)",
           p->epsilon, p->epsilon / 3.);
 
-  message("Self-gravity MM smoothing-scale: a_smooth=%f", p->a_smooth);
+  message("Self-gravity mesh smoothing-scale: a_smooth=%f", p->a_smooth);
 
-  message("Self-gravity MM cut-off: r_cut=%f", p->r_cut);
+  message("Self-gravity tree cut-off: r_cut_max=%f", p->r_cut_max);
+  message("Self-gravity truncation cut-off: r_cut_min=%f", p->r_cut_min);
 }
 
 #if defined(HAVE_HDF5)
@@ -84,7 +88,8 @@ void gravity_props_print_snapshot(hid_t h_grpgrav,
                        p->epsilon / 3.);
   io_write_attribute_f(h_grpgrav, "Opening angle", p->theta_crit);
   io_write_attribute_d(h_grpgrav, "MM order", SELF_GRAVITY_MULTIPOLE_ORDER);
-  io_write_attribute_f(h_grpgrav, "MM a_smooth", p->a_smooth);
-  io_write_attribute_f(h_grpgrav, "MM r_cut", p->r_cut);
+  io_write_attribute_f(h_grpgrav, "Mesh a_smooth", p->a_smooth);
+  io_write_attribute_f(h_grpgrav, "Mesh r_cut_max", p->r_cut_max);
+  io_write_attribute_f(h_grpgrav, "Mesh r_cut_min", p->r_cut_min);
 }
 #endif
diff --git a/src/gravity_properties.h b/src/gravity_properties.h
index be26f0d1d23b8cec71fa3cbbeedac9f61f337b2c..2a5e4cb1e07ea591e2e3821704ec55abe7980360 100644
--- a/src/gravity_properties.h
+++ b/src/gravity_properties.h
@@ -34,9 +34,16 @@
  */
 struct gravity_props {
 
-  /* Tree-PM parameters */
+  /*! Mesh smoothing scale in units of top-level cell size */
   float a_smooth;
-  float r_cut;
+
+  /*! Distance below which the truncated mesh force is Newtonian in units of
+   * a_smooth */
+  float r_cut_min;
+
+  /*! Distance above which the truncated mesh force is negligible in units of
+   * a_smooth */
+  float r_cut_max;
 
   /*! Time integration dimensionless multiplier */
   float eta;
diff --git a/src/hydro/Default/hydro.h b/src/hydro/Default/hydro.h
index 051c22f46b3ecdff5d3de910e0f75409b0e78f02..31f0c4172099479abff9e1ed19487130a0a8938b 100644
--- a/src/hydro/Default/hydro.h
+++ b/src/hydro/Default/hydro.h
@@ -210,9 +210,6 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
 
   const float irho = 1.f / p->rho;
 
-  /* Compute the derivative term */
-  p->rho_dh = 1.f / (1.f + hydro_dimension_inv * p->h * p->rho_dh * irho);
-
   /* Finish calculation of the velocity curl components */
   p->density.rot_v[0] *= h_inv_dim_plus_one * irho;
   p->density.rot_v[1] *= h_inv_dim_plus_one * irho;
@@ -222,6 +219,31 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
   p->density.div_v *= h_inv_dim_plus_one * irho;
 }
 
+/**
+ * @brief Sets all particle fields to sensible values when the #part has 0 ngbs.
+ *
+ * @param p The particle to act upon
+ * @param xp The extended particle data to act upon
+ */
+__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours(
+    struct part *restrict p, struct xpart *restrict xp) {
+
+  /* Some smoothing length multiples. */
+  const float h = p->h;
+  const float h_inv = 1.0f / h;                 /* 1/h */
+  const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */
+
+  /* Re-set problematic values */
+  p->rho = p->mass * kernel_root * h_inv_dim;
+  p->density.wcount = kernel_root * kernel_norm * h_inv_dim;
+  p->rho_dh = 0.f;
+  p->density.wcount_dh = 0.f;
+  p->density.div_v = 0.f;
+  p->density.rot_v[0] = 0.f;
+  p->density.rot_v[1] = 0.f;
+  p->density.rot_v[2] = 0.f;
+}
+
 /**
  * @brief Prepare a particle for the force calculation.
  *
@@ -249,6 +271,9 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force(
   const float fc = p->force.soundspeed =
       sqrtf(hydro_gamma * hydro_gamma_minus_one * u);
 
+  /* Compute the derivative term */
+  p->rho_dh = 1.f / (1.f + hydro_dimension_inv * p->h * p->rho_dh / p->rho);
+
   /* Compute the P/Omega/rho2. */
   xp->omega = 1.0f + hydro_dimension_inv * h * p->rho_dh / p->rho;
   p->force.P_over_rho2 = u * hydro_gamma_minus_one / (p->rho * xp->omega);
diff --git a/src/hydro/Gadget2/hydro.h b/src/hydro/Gadget2/hydro.h
index 91626749a89ede387547b6351dce59fa3569307a..66a475f32ec06eb40ff2bc890bc156f76e3b7b9f 100644
--- a/src/hydro/Gadget2/hydro.h
+++ b/src/hydro/Gadget2/hydro.h
@@ -206,12 +206,13 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
   p->rho += p->mass * kernel_root;
   p->density.rho_dh -= hydro_dimension * p->mass * kernel_root;
   p->density.wcount += kernel_root;
+  p->density.wcount_dh -= hydro_dimension * kernel_root;
 
   /* Finish the calculation by inserting the missing h-factors */
   p->rho *= h_inv_dim;
   p->density.rho_dh *= h_inv_dim_plus_one;
-  p->density.wcount *= kernel_norm;
-  p->density.wcount_dh *= h_inv * kernel_gamma * kernel_norm;
+  p->density.wcount *= h_inv_dim;
+  p->density.wcount_dh *= h_inv_dim_plus_one;
 
   const float rho_inv = 1.f / p->rho;
 
@@ -224,6 +225,31 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
   p->density.div_v *= h_inv_dim_plus_one * rho_inv;
 }
 
+/**
+ * @brief Sets all particle fields to sensible values when the #part has 0 ngbs.
+ *
+ * @param p The particle to act upon
+ * @param xp The extended particle data to act upon
+ */
+__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours(
+    struct part *restrict p, struct xpart *restrict xp) {
+
+  /* Some smoothing length multiples. */
+  const float h = p->h;
+  const float h_inv = 1.0f / h;                 /* 1/h */
+  const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */
+
+  /* Re-set problematic values */
+  p->rho = p->mass * kernel_root * h_inv_dim;
+  p->density.wcount = kernel_root * kernel_norm * h_inv_dim;
+  p->density.rho_dh = 0.f;
+  p->density.wcount_dh = 0.f;
+  p->density.div_v = 0.f;
+  p->density.rot_v[0] = 0.f;
+  p->density.rot_v[1] = 0.f;
+  p->density.rot_v[2] = 0.f;
+}
+
 /**
  * @brief Prepare a particle for the force calculation.
  *
@@ -239,6 +265,9 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force(
 
   const float fac_mu = 1.f; /* Will change with cosmological integration */
 
+  /* Inverse of the physical density */
+  const float rho_inv = 1.f / p->rho;
+
   /* Compute the norm of the curl */
   const float curl_v = sqrtf(p->density.rot_v[0] * p->density.rot_v[0] +
                              p->density.rot_v[1] * p->density.rot_v[1] +
@@ -254,7 +283,6 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force(
   const float soundspeed = gas_soundspeed_from_pressure(p->rho, pressure);
 
   /* Divide the pressure by the density squared to get the SPH term */
-  const float rho_inv = 1.f / p->rho;
   const float P_over_rho2 = pressure * rho_inv * rho_inv;
 
   /* Compute the Balsara switch */
@@ -262,11 +290,11 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force(
       abs_div_v / (abs_div_v + curl_v + 0.0001f * soundspeed / fac_mu / p->h);
 
   /* Compute the "grad h" term */
-  const float grad_h_term =
+  const float omega_inv =
       1.f / (1.f + hydro_dimension_inv * p->h * p->density.rho_dh * rho_inv);
 
   /* Update variables. */
-  p->force.f = grad_h_term;
+  p->force.f = omega_inv;
   p->force.P_over_rho2 = P_over_rho2;
   p->force.soundspeed = soundspeed;
   p->force.balsara = balsara;
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index b117b5a08a82679d0a4311235b4ac32fd1379dd6..81b6381f277284468c22d64312866c2e39cd1f0d 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -64,7 +64,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 
   /* Compute contribution to the number of neighbours */
   pi->density.wcount += wi;
-  pi->density.wcount_dh -= ui * wi_dx;
+  pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
 
   /* Compute the kernel function for pj */
   const float hj_inv = 1.f / hj;
@@ -77,7 +77,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 
   /* Compute contribution to the number of neighbours */
   pj->density.wcount += wj;
-  pj->density.wcount_dh -= uj * wj_dx;
+  pj->density.wcount_dh -= (hydro_dimension * wj + uj * wj_dx);
 
   const float faci = mj * wi_dx * r_inv;
   const float facj = mi * wj_dx * r_inv;
@@ -112,9 +112,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
 
-#ifdef WITH_VECTORIZATION
+#ifdef WITH_OLD_VECTORIZATION
 
-  vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx;
+  vector r, ri, r2, ui, uj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx;
   vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
   vector mi, mj;
   vector dx[3], dv[3];
@@ -161,15 +161,15 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density(
 
   hi.v = vec_load(Hi);
   hi_inv = vec_reciprocal(hi);
-  xi.v = r.v * hi_inv.v;
+  ui.v = r.v * hi_inv.v;
 
   hj.v = vec_load(Hj);
   hj_inv = vec_reciprocal(hj);
-  xj.v = r.v * hj_inv.v;
+  uj.v = r.v * hj_inv.v;
 
   /* Compute the kernel function. */
-  kernel_deval_vec(&xi, &wi, &wi_dx);
-  kernel_deval_vec(&xj, &wj, &wj_dx);
+  kernel_deval_vec(&ui, &wi, &wi_dx);
+  kernel_deval_vec(&uj, &wj, &wj_dx);
 
   /* Compute dv. */
   dv[0].v = vi[0].v - vj[0].v;
@@ -188,17 +188,17 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density(
 
   /* Compute density of pi. */
   rhoi.v = mj.v * wi.v;
-  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + xi.v * wi_dx.v);
+  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v);
   wcounti.v = wi.v;
-  wcounti_dh.v = xi.v * wi_dx.v;
+  wcounti_dh.v = (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v);
   div_vi.v = mj.v * dvdr.v * wi_dx.v;
   for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
 
   /* Compute density of pj. */
   rhoj.v = mi.v * wj.v;
-  rhoj_dh.v = mi.v * (vec_set1(hydro_dimension) * wj.v + xj.v * wj_dx.v);
+  rhoj_dh.v = mi.v * (vec_set1(hydro_dimension) * wj.v + uj.v * wj_dx.v);
   wcountj.v = wj.v;
-  wcountj_dh.v = xj.v * wj_dx.v;
+  wcountj_dh.v = (vec_set1(hydro_dimension) * wj.v + uj.v * wj_dx.v);
   div_vj.v = mi.v * dvdr.v * wj_dx.v;
   for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v;
 
@@ -241,7 +241,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 
   /* Get r and r inverse. */
   const float r = sqrtf(r2);
-  const float ri = 1.0f / r;
+  const float r_inv = 1.0f / r;
 
   /* Compute the kernel function */
   const float hi_inv = 1.0f / hi;
@@ -254,9 +254,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 
   /* Compute contribution to the number of neighbours */
   pi->density.wcount += wi;
-  pi->density.wcount_dh -= ui * wi_dx;
+  pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
 
-  const float fac = mj * wi_dx * ri;
+  const float fac = mj * wi_dx * r_inv;
 
   /* Compute dv dot r */
   dv[0] = pi->v[0] - pj->v[0];
@@ -282,9 +282,9 @@ __attribute__((always_inline)) INLINE static void
 runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
                                struct part **pi, struct part **pj) {
 
-#ifdef WITH_VECTORIZATION
+#ifdef WITH_OLD_VECTORIZATION
 
-  vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx;
+  vector r, ri, r2, ui, hi, hi_inv, wi, wi_dx;
   vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
   vector mj;
   vector dx[3], dv[3];
@@ -328,9 +328,9 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
 
   hi.v = vec_load(Hi);
   hi_inv = vec_reciprocal(hi);
-  xi.v = r.v * hi_inv.v;
+  ui.v = r.v * hi_inv.v;
 
-  kernel_deval_vec(&xi, &wi, &wi_dx);
+  kernel_deval_vec(&ui, &wi, &wi_dx);
 
   /* Compute dv. */
   dv[0].v = vi[0].v - vj[0].v;
@@ -349,9 +349,9 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
 
   /* Compute density of pi. */
   rhoi.v = mj.v * wi.v;
-  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + xi.v * wi_dx.v);
+  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v);
   wcounti.v = wi.v;
-  wcounti_dh.v = xi.v * wi_dx.v;
+  wcounti_dh.v = (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v);
   div_vi.v = mj.v * dvdr.v * wi_dx.v;
   for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;
 
@@ -390,7 +390,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
                                  vector *curlvySum, vector *curlvzSum,
                                  mask_t mask) {
 
-  vector r, ri, xi, wi, wi_dx;
+  vector r, ri, ui, wi, wi_dx;
   vector mj;
   vector dvx, dvy, dvz;
   vector vjx, vjy, vjz;
@@ -407,10 +407,10 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
   ri = vec_reciprocal_sqrt(*r2);
   r.v = vec_mul(r2->v, ri.v);
 
-  xi.v = vec_mul(r.v, hi_inv.v);
+  ui.v = vec_mul(r.v, hi_inv.v);
 
   /* Calculate the kernel for two particles. */
-  kernel_deval_1_vec(&xi, &wi, &wi_dx);
+  kernel_deval_1_vec(&ui, &wi, &wi_dx);
 
   /* Compute dv. */
   dvx.v = vec_sub(vix.v, vjx.v);
@@ -432,14 +432,16 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
   curlvry.v = vec_mul(curlvry.v, ri.v);
   curlvrz.v = vec_mul(curlvrz.v, ri.v);
 
+  vector wcount_dh_update;
+  wcount_dh_update.v =
+      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v));
+
   /* Mask updates to intermediate vector sums for particle pi. */
   rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
-  rho_dhSum->v = vec_mask_sub(
-      rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                          vec_mul(xi.v, wi_dx.v))),
-      mask);
+  rho_dhSum->v =
+      vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask);
   wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
-  wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
+  wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask);
   div_vSum->v =
       vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
   curlvxSum->v = vec_mask_add(curlvxSum->v,
@@ -464,13 +466,14 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
                                  vector *curlvySum, vector *curlvzSum,
                                  mask_t mask, mask_t mask2, short mask_cond) {
 
-  vector r, ri, r2, xi, wi, wi_dx;
+
+  vector r, ri, r2, ui, wi, wi_dx;
   vector mj;
   vector dx, dy, dz, dvx, dvy, dvz;
   vector vjx, vjy, vjz;
   vector dvdr;
   vector curlvrx, curlvry, curlvrz;
-  vector r_2, ri2, r2_2, xi2, wi2, wi_dx2;
+  vector r_2, ri2, r2_2, ui2, wi2, wi_dx2;
   vector mj2;
   vector dx2, dy2, dz2, dvx2, dvy2, dvz2;
   vector vjx2, vjy2, vjz2;
@@ -501,11 +504,11 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
   r.v = vec_mul(r2.v, ri.v);
   r_2.v = vec_mul(r2_2.v, ri2.v);
 
-  xi.v = vec_mul(r.v, hi_inv.v);
-  xi2.v = vec_mul(r_2.v, hi_inv.v);
+  ui.v = vec_mul(r.v, hi_inv.v);
+  ui2.v = vec_mul(r_2.v, hi_inv.v);
 
   /* Calculate the kernel for two particles. */
-  kernel_deval_2_vec(&xi, &wi, &wi_dx, &xi2, &wi2, &wi_dx2);
+  kernel_deval_2_vec(&ui, &wi, &wi_dx, &ui2, &wi2, &wi_dx2);
 
   /* Compute dv. */
   dvx.v = vec_sub(vix.v, vjx.v);
@@ -542,25 +545,25 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
   curlvrz.v = vec_mul(curlvrz.v, ri.v);
   curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);
 
+  vector wcount_dh_update, wcount_dh_update2;
+  wcount_dh_update.v =
+      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v));
+  wcount_dh_update2.v =
+      vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v));
+
   /* Mask updates to intermediate vector sums for particle pi. */
   /* Mask only when needed. */
   if (mask_cond) {
     rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
     rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
-    rho_dhSum->v = vec_mask_sub(
-        rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                            vec_mul(xi.v, wi_dx.v))),
-        mask);
-    rho_dhSum->v = vec_mask_sub(
-        rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
-                                             vec_mul(xi2.v, wi_dx2.v))),
-        mask2);
+    rho_dhSum->v =
+        vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask);
+    rho_dhSum->v =
+        vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v), mask2);
     wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
     wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
-    wcount_dhSum->v =
-        vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
-    wcount_dhSum->v =
-        vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2);
+    wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask);
+    wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update2.v, mask2);
     div_vSum->v = vec_mask_sub(div_vSum->v,
                                vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
     div_vSum->v = vec_mask_sub(
@@ -580,22 +583,27 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
   } else {
     rhoSum->v = vec_add(rhoSum->v, vec_mul(mj.v, wi.v));
     rhoSum->v = vec_add(rhoSum->v, vec_mul(mj2.v, wi2.v));
-    rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(
-        mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(xi.v, wi_dx.v))));
-    rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
-                                           vec_mul(xi2.v, wi_dx2.v))));
+    rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v));
+    rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v));
     wcountSum->v = vec_add(wcountSum->v, wi.v);
     wcountSum->v = vec_add(wcountSum->v, wi2.v);
-    wcount_dhSum->v = vec_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));
-    wcount_dhSum->v = vec_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v));
+    wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update.v);
+    wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update2.v);
     div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
-    div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
-    curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)));
-    curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)));
-    curlvySum->v = vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)));
-    curlvySum->v = vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)));
-    curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)));
-    curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)));
+    div_vSum->v =
+        vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
+    curlvxSum->v =
+        vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)));
+    curlvxSum->v =
+        vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)));
+    curlvySum->v =
+        vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)));
+    curlvySum->v =
+        vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)));
+    curlvzSum->v =
+        vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)));
+    curlvzSum->v =
+        vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)));
   }
 }
 #endif
@@ -703,7 +711,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
 
-#ifdef WITH_VECTORIZATION
+#ifdef WITH_OLD_VECTORIZATION
 
   vector r, r2, ri;
   vector xi, xj;
@@ -985,7 +993,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
 
-#ifdef WITH_VECTORIZATION
+#ifdef WITH_OLD_VECTORIZATION
 
   vector r, r2, ri;
   vector xi, xj;
diff --git a/src/hydro/Gizmo/hydro.h b/src/hydro/Gizmo/hydro.h
index 6d39c54d2ddc3571ac34c54fc9eede6f7dee6ac5..2c2f54699bb380a491edf61a83ad8a031572c86c 100644
--- a/src/hydro/Gizmo/hydro.h
+++ b/src/hydro/Gizmo/hydro.h
@@ -49,17 +49,21 @@ __attribute__((always_inline)) INLINE static float hydro_compute_timestep(
   return CFL_condition;
 #endif
 
-  if (p->timestepvars.vmax == 0.) {
-    /* vmax can be zero in vacuum cells that only have vacuum neighbours */
-    /* in this case, the time step should be limited by the maximally
-       allowed time step. Since we do not know what that value is here, we set
-       the time step to a very large value */
-    return FLT_MAX;
-  } else {
-    const float psize = powf(p->geometry.volume / hydro_dimension_unit_sphere,
-                             hydro_dimension_inv);
-    return 2. * CFL_condition * psize / fabsf(p->timestepvars.vmax);
+  float vrel[3];
+  vrel[0] = p->primitives.v[0] - xp->v_full[0];
+  vrel[1] = p->primitives.v[1] - xp->v_full[1];
+  vrel[2] = p->primitives.v[2] - xp->v_full[2];
+  float vmax =
+      sqrtf(vrel[0] * vrel[0] + vrel[1] * vrel[1] + vrel[2] * vrel[2]) +
+      sqrtf(hydro_gamma * p->primitives.P / p->primitives.rho);
+  vmax = max(vmax, p->timestepvars.vmax);
+  const float psize = powf(p->geometry.volume / hydro_dimension_unit_sphere,
+                           hydro_dimension_inv);
+  float dt = FLT_MAX;
+  if (vmax > 0.) {
+    dt = psize / vmax;
   }
+  return CFL_condition * dt;
 }
 
 /**
@@ -225,14 +229,15 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
   /* Some smoothing length multiples. */
   const float h = p->h;
   const float ih = 1.0f / h;
+  const float ihdim = pow_dimension(ih);
+  const float ihdim_plus_one = ihdim * ih;
 
   /* Final operation on the density. */
   p->density.wcount += kernel_root;
-  p->density.wcount *= kernel_norm;
+  p->density.wcount *= ihdim;
 
-  p->density.wcount_dh *= ih * kernel_gamma * kernel_norm;
-
-  const float ihdim = pow_dimension(ih);
+  p->density.wcount_dh -= hydro_dimension * kernel_root;
+  p->density.wcount_dh *= ihdim_plus_one;
 
   /* Final operation on the geometry. */
   /* we multiply with the smoothing kernel normalization ih3 and calculate the
@@ -366,6 +371,42 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
   p->density.wcount_dh *= p->density.wcorr;
 }
 
+/**
+ * @brief Sets all particle fields to sensible values when the #part has 0 ngbs.
+ *
+ * @param p The particle to act upon
+ * @param xp The extended particle data to act upon
+ */
+__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours(
+    struct part* restrict p, struct xpart* restrict xp) {
+
+  /* Some smoothing length multiples. */
+  const float h = p->h;
+  const float h_inv = 1.0f / h;                 /* 1/h */
+  const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */
+
+  /* Re-set problematic values */
+  p->density.wcount = kernel_root * kernel_norm * h_inv_dim;
+  p->density.wcount_dh = 0.f;
+  p->geometry.volume = 1.0f;
+  p->geometry.matrix_E[0][0] = 1.0f;
+  p->geometry.matrix_E[0][1] = 0.0f;
+  p->geometry.matrix_E[0][2] = 0.0f;
+  p->geometry.matrix_E[1][0] = 0.0f;
+  p->geometry.matrix_E[1][1] = 1.0f;
+  p->geometry.matrix_E[1][2] = 0.0f;
+  p->geometry.matrix_E[2][0] = 0.0f;
+  p->geometry.matrix_E[2][1] = 0.0f;
+  p->geometry.matrix_E[2][2] = 1.0f;
+  /* centroid is relative w.r.t. particle position */
+  /* by setting the centroid to 0.0f, we make sure no velocity correction is
+     applied */
+  p->geometry.centroid[0] = 0.0f;
+  p->geometry.centroid[1] = 0.0f;
+  p->geometry.centroid[2] = 0.0f;
+  p->geometry.Atot = 1.0f;
+}
+
 /**
  * @brief Prepare a particle for the gradient calculation.
  *
@@ -384,7 +425,7 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force(
     struct part* restrict p, struct xpart* restrict xp) {
 
   /* Initialize time step criterion variables */
-  p->timestepvars.vmax = 0.0f;
+  p->timestepvars.vmax = 0.0f;
 
   /* Set the actual velocity of the particle */
   hydro_velocities_prepare_force(p, xp);
@@ -601,24 +642,12 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra(
     a_grav[1] = p->gpart->a_grav[1];
     a_grav[2] = p->gpart->a_grav[2];
 
-    /* Store the gravitational acceleration for later use. */
-    /* This is used for the prediction step. */
-    p->gravity.old_a[0] = a_grav[0];
-    p->gravity.old_a[1] = a_grav[1];
-    p->gravity.old_a[2] = a_grav[2];
-
     /* Make sure the gpart knows the mass has changed. */
     p->gpart->mass = p->conserved.mass;
 
-    /* Kick the momentum for half a time step */
-    /* Note that this also affects the particle movement, as the velocity for
-       the particles is set after this. */
-    p->conserved.momentum[0] += dt * p->conserved.mass * a_grav[0];
-    p->conserved.momentum[1] += dt * p->conserved.mass * a_grav[1];
-    p->conserved.momentum[2] += dt * p->conserved.mass * a_grav[2];
-
 #if !defined(EOS_ISOTHERMAL_GAS)
-    /* This part still needs to be tested! */
+    /* If the energy needs to be updated, we need to do it before the momentum
+       is updated, as the old value of the momentum enters the equations. */
     p->conserved.energy += dt * (p->conserved.momentum[0] * a_grav[0] +
                                  p->conserved.momentum[1] * a_grav[1] +
                                  p->conserved.momentum[2] * a_grav[2]);
@@ -627,6 +656,13 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra(
                                  a_grav[1] * p->gravity.mflux[1] +
                                  a_grav[2] * p->gravity.mflux[2]);
 #endif
+
+    /* Kick the momentum for half a time step */
+    /* Note that this also affects the particle movement, as the velocity for
+       the particles is set after this. */
+    p->conserved.momentum[0] += dt * p->conserved.mass * a_grav[0];
+    p->conserved.momentum[1] += dt * p->conserved.mass * a_grav[1];
+    p->conserved.momentum[2] += dt * p->conserved.mass * a_grav[2];
   }
 
   /* reset fluxes */
diff --git a/src/hydro/Gizmo/hydro_debug.h b/src/hydro/Gizmo/hydro_debug.h
index a05ff9a7d96f04ca3354235540adc31386a2d2e3..17e7f8a08570e355a701f8e165ee8af745fa34ab 100644
--- a/src/hydro/Gizmo/hydro_debug.h
+++ b/src/hydro/Gizmo/hydro_debug.h
@@ -46,7 +46,7 @@ __attribute__((always_inline)) INLINE static void hydro_debug_particle(
       "volume=%.3e, "
       "matrix_E=[[%.3e,%.3e,%.3e],[%.3e,%.3e,%.3e],[%.3e,%.3e,%.3e]]}, "
       "timestepvars={"
-      "vmax=%.3e}, "
+      "vmax=%.3e}, "
       "density={"
       "div_v=%.3e, "
       "wcount_dh=%.3e, "
diff --git a/src/hydro/Gizmo/hydro_flux_limiters.h b/src/hydro/Gizmo/hydro_flux_limiters.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc91cf2808e02d903ff97efddc20c164db9c954e
--- /dev/null
+++ b/src/hydro/Gizmo/hydro_flux_limiters.h
@@ -0,0 +1,80 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2017 Bert Vandenbroucke (bert.vandenbroucke@gmail.com)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+#ifndef SWIFT_HYDRO_FLUX_LIMITERS_H
+#define SWIFT_HYDRO_FLUX_LIMITERS_H
+
+#ifdef GIZMO_FLUX_LIMITER
+
+#define HYDRO_FLUX_LIMITER_IMPLEMENTATION "GIZMO flux limiter"
+
+/**
+ * @brief Limit the flux between two particles.
+ *
+ * @param flux Unlimited flux between the particles.
+ * @param pi Particle i.
+ * @param pj Particle j.
+ */
+__attribute__((always_inline)) INLINE static void hydro_flux_limiters_apply(
+    float* flux, struct part* pi, struct part* pj) {
+
+  float flux_limit_factor = 1.;
+  const float timefac = max(pi->force.dt, pj->force.dt);
+  const float areafac = max(pi->geometry.Atot, pj->geometry.Atot);
+  const float totfac = timefac * areafac;
+  if (flux[0] * totfac > pi->conserved.mass) {
+    flux_limit_factor = pi->conserved.mass / (flux[0] * totfac);
+  }
+  if (flux[0] * totfac > pj->conserved.mass) {
+    flux_limit_factor =
+        min(pj->conserved.mass / (flux[0] * totfac), flux_limit_factor);
+  }
+  if (flux[4] * totfac > pi->conserved.energy) {
+    flux_limit_factor =
+        min(pi->conserved.energy / (flux[4] * totfac), flux_limit_factor);
+  }
+  if (flux[4] * totfac > pj->conserved.energy) {
+    flux_limit_factor =
+        min(pj->conserved.energy / (flux[4] * totfac), flux_limit_factor);
+  }
+
+  flux[0] *= flux_limit_factor;
+  flux[1] *= flux_limit_factor;
+  flux[2] *= flux_limit_factor;
+  flux[3] *= flux_limit_factor;
+  flux[4] *= flux_limit_factor;
+}
+
+#else
+
+#define HYDRO_FLUX_LIMITER_IMPLEMENTATION "No flux limiter"
+
+/**
+ * @brief Limit the flux between two particles.
+ *
+ * @param flux Unlimited flux between the particles.
+ * @param pi Particle i.
+ * @param pj Particle j.
+ */
+__attribute__((always_inline)) INLINE static void hydro_flux_limiters_apply(
+    float* flux, struct part* pi, struct part* pj) {}
+
+#endif
+
+#endif  // SWIFT_HYDRO_FLUX_LIMITERS_H
diff --git a/src/hydro/Gizmo/hydro_gradients.h b/src/hydro/Gizmo/hydro_gradients.h
index 5ad6d87619a7629a703a8b9c03d089e69ffbdf7d..896128bd45d7964c1f4c8d63564f6fced38db770 100644
--- a/src/hydro/Gizmo/hydro_gradients.h
+++ b/src/hydro/Gizmo/hydro_gradients.h
@@ -99,7 +99,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_predict(
   float xij_j[3];
   int k;
   float xfac;
-  float a_grav_i[3], a_grav_j[3];
 
   /* perform gradient reconstruction in space and time */
   /* space */
@@ -141,34 +140,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_predict(
            pj->primitives.gradients.P[1] * xij_j[1] +
            pj->primitives.gradients.P[2] * xij_j[2];
 
-  a_grav_i[0] = pi->gravity.old_a[0];
-  a_grav_i[1] = pi->gravity.old_a[1];
-  a_grav_i[2] = pi->gravity.old_a[2];
-
-  a_grav_i[0] += pi->gravity.grad_a[0][0] * xij_i[0] +
-                 pi->gravity.grad_a[0][1] * xij_i[1] +
-                 pi->gravity.grad_a[0][2] * xij_i[2];
-  a_grav_i[1] += pi->gravity.grad_a[1][0] * xij_i[0] +
-                 pi->gravity.grad_a[1][1] * xij_i[1] +
-                 pi->gravity.grad_a[1][2] * xij_i[2];
-  a_grav_i[2] += pi->gravity.grad_a[2][0] * xij_i[0] +
-                 pi->gravity.grad_a[2][1] * xij_i[1] +
-                 pi->gravity.grad_a[2][2] * xij_i[2];
-
-  a_grav_j[0] = pj->gravity.old_a[0];
-  a_grav_j[1] = pj->gravity.old_a[1];
-  a_grav_j[2] = pj->gravity.old_a[2];
-
-  a_grav_j[0] += pj->gravity.grad_a[0][0] * xij_j[0] +
-                 pj->gravity.grad_a[0][1] * xij_j[1] +
-                 pj->gravity.grad_a[0][2] * xij_j[2];
-  a_grav_j[1] += pj->gravity.grad_a[1][0] * xij_j[0] +
-                 pj->gravity.grad_a[1][1] * xij_j[1] +
-                 pj->gravity.grad_a[1][2] * xij_j[2];
-  a_grav_j[2] += pj->gravity.grad_a[2][0] * xij_j[0] +
-                 pj->gravity.grad_a[2][1] * xij_j[1] +
-                 pj->gravity.grad_a[2][2] * xij_j[2];
-
   hydro_slope_limit_face(Wi, Wj, dWi, dWj, xij_i, xij_j, r);
 
   /* time */
@@ -198,10 +169,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_predict(
                hydro_gamma * Wi[4] * (pi->primitives.gradients.v[0][0] +
                                       pi->primitives.gradients.v[1][1] +
                                       pi->primitives.gradients.v[2][2]));
-
-    dWi[1] += 0.5 * mindt * a_grav_i[0];
-    dWi[2] += 0.5 * mindt * a_grav_i[1];
-    dWi[3] += 0.5 * mindt * a_grav_i[2];
   }
 
   if (Wj[0] > 0.0f) {
@@ -230,10 +197,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_predict(
                hydro_gamma * Wj[4] * (pj->primitives.gradients.v[0][0] +
                                       pj->primitives.gradients.v[1][1] +
                                       pj->primitives.gradients.v[2][2]));
-
-    dWj[1] += 0.5 * mindt * a_grav_j[0];
-    dWj[2] += 0.5 * mindt * a_grav_j[1];
-    dWj[3] += 0.5 * mindt * a_grav_j[2];
   }
 
   Wi[0] += dWi[0];
diff --git a/src/hydro/Gizmo/hydro_gradients_gizmo.h b/src/hydro/Gizmo/hydro_gradients_gizmo.h
index ee3ad6919f81f042ceacc5db8b4e818d63c90266..bc50c10d84cdd6b444887a8bb5fdf7b49a004eb8 100644
--- a/src/hydro/Gizmo/hydro_gradients_gizmo.h
+++ b/src/hydro/Gizmo/hydro_gradients_gizmo.h
@@ -45,18 +45,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_init(
   p->primitives.gradients.P[1] = 0.0f;
   p->primitives.gradients.P[2] = 0.0f;
 
-  p->gravity.grad_a[0][0] = 0.0f;
-  p->gravity.grad_a[0][1] = 0.0f;
-  p->gravity.grad_a[0][2] = 0.0f;
-
-  p->gravity.grad_a[1][0] = 0.0f;
-  p->gravity.grad_a[1][1] = 0.0f;
-  p->gravity.grad_a[1][2] = 0.0f;
-
-  p->gravity.grad_a[2][0] = 0.0f;
-  p->gravity.grad_a[2][1] = 0.0f;
-  p->gravity.grad_a[2][2] = 0.0f;
-
   hydro_slope_limit_cell_init(p);
 }
 
@@ -157,35 +145,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_collect(
         (Wi[4] - Wj[4]) * wi *
         (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]);
 
-    pi->gravity.grad_a[0][0] +=
-        (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wi *
-        (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]);
-    pi->gravity.grad_a[0][1] +=
-        (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wi *
-        (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + Bi[1][2] * dx[2]);
-    pi->gravity.grad_a[0][2] +=
-        (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wi *
-        (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]);
-
-    pi->gravity.grad_a[1][0] +=
-        (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi *
-        (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]);
-    pi->gravity.grad_a[1][1] +=
-        (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi *
-        (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + Bi[1][2] * dx[2]);
-    pi->gravity.grad_a[1][2] +=
-        (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi *
-        (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]);
-
-    pi->gravity.grad_a[2][0] +=
-        (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi *
-        (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]);
-    pi->gravity.grad_a[2][1] +=
-        (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi *
-        (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + Bi[1][2] * dx[2]);
-    pi->gravity.grad_a[2][2] +=
-        (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi *
-        (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]);
   } else {
     /* The gradient matrix was not well-behaved, switch to SPH gradients */
 
@@ -223,27 +182,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_collect(
         wi_dx * dx[1] * (pi->primitives.P - pj->primitives.P) / r;
     pi->primitives.gradients.P[2] -=
         wi_dx * dx[2] * (pi->primitives.P - pj->primitives.P) / r;
-
-    pi->gravity.grad_a[0][0] -=
-        wi_dx * dx[0] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r;
-    pi->gravity.grad_a[0][1] -=
-        wi_dx * dx[1] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r;
-    pi->gravity.grad_a[0][2] -=
-        wi_dx * dx[2] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r;
-
-    pi->gravity.grad_a[1][0] -=
-        wi_dx * dx[0] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r;
-    pi->gravity.grad_a[1][1] -=
-        wi_dx * dx[1] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r;
-    pi->gravity.grad_a[1][2] -=
-        wi_dx * dx[2] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r;
-
-    pi->gravity.grad_a[2][0] -=
-        wi_dx * dx[0] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r;
-    pi->gravity.grad_a[2][1] -=
-        wi_dx * dx[1] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r;
-    pi->gravity.grad_a[2][2] -=
-        wi_dx * dx[2] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r;
   }
 
   hydro_slope_limit_cell_collect(pi, pj, r);
@@ -306,35 +244,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_collect(
         (Wi[4] - Wj[4]) * wj *
         (Bj[2][0] * dx[0] + Bj[2][1] * dx[1] + Bj[2][2] * dx[2]);
 
-    pj->gravity.grad_a[0][0] +=
-        (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wj *
-        (Bj[0][0] * dx[0] + Bj[0][1] * dx[1] + Bj[0][2] * dx[2]);
-    pj->gravity.grad_a[0][1] +=
-        (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wj *
-        (Bj[1][0] * dx[0] + Bj[1][1] * dx[1] + Bj[1][2] * dx[2]);
-    pj->gravity.grad_a[0][2] +=
-        (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wj *
-        (Bj[2][0] * dx[0] + Bj[2][1] * dx[1] + Bj[2][2] * dx[2]);
-
-    pj->gravity.grad_a[1][0] +=
-        (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wj *
-        (Bj[0][0] * dx[0] + Bj[0][1] * dx[1] + Bj[0][2] * dx[2]);
-    pj->gravity.grad_a[1][1] +=
-        (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wj *
-        (Bj[1][0] * dx[0] + Bj[1][1] * dx[1] + Bj[1][2] * dx[2]);
-    pj->gravity.grad_a[1][2] +=
-        (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wj *
-        (Bj[2][0] * dx[0] + Bj[2][1] * dx[1] + Bj[2][2] * dx[2]);
-
-    pj->gravity.grad_a[2][0] +=
-        (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wj *
-        (Bj[0][0] * dx[0] + Bj[0][1] * dx[1] + Bj[0][2] * dx[2]);
-    pj->gravity.grad_a[2][1] +=
-        (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wj *
-        (Bj[1][0] * dx[0] + Bj[1][1] * dx[1] + Bj[1][2] * dx[2]);
-    pj->gravity.grad_a[2][2] +=
-        (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wj *
-        (Bj[2][0] * dx[0] + Bj[2][1] * dx[1] + Bj[2][2] * dx[2]);
   } else {
     /* SPH gradients */
 
@@ -371,27 +280,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_collect(
         wj_dx * dx[1] * (pi->primitives.P - pj->primitives.P) / r;
     pj->primitives.gradients.P[2] -=
         wj_dx * dx[2] * (pi->primitives.P - pj->primitives.P) / r;
-
-    pj->gravity.grad_a[0][0] -=
-        wj_dx * dx[0] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r;
-    pj->gravity.grad_a[0][1] -=
-        wj_dx * dx[1] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r;
-    pj->gravity.grad_a[0][2] -=
-        wj_dx * dx[2] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r;
-
-    pj->gravity.grad_a[1][0] -=
-        wj_dx * dx[0] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r;
-    pj->gravity.grad_a[1][1] -=
-        wj_dx * dx[1] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r;
-    pj->gravity.grad_a[1][2] -=
-        wj_dx * dx[2] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r;
-
-    pj->gravity.grad_a[2][0] -=
-        wj_dx * dx[0] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r;
-    pj->gravity.grad_a[2][1] -=
-        wj_dx * dx[1] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r;
-    pj->gravity.grad_a[2][2] -=
-        wj_dx * dx[2] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r;
   }
 
   hydro_slope_limit_cell_collect(pj, pi, r);
@@ -493,35 +381,6 @@ hydro_gradients_nonsym_collect(float r2, float *dx, float hi, float hj,
         (Wi[4] - Wj[4]) * wi *
         (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]);
 
-    pi->gravity.grad_a[0][0] +=
-        (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wi *
-        (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]);
-    pi->gravity.grad_a[0][1] +=
-        (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wi *
-        (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + Bi[1][2] * dx[2]);
-    pi->gravity.grad_a[0][2] +=
-        (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wi *
-        (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]);
-
-    pi->gravity.grad_a[1][0] +=
-        (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi *
-        (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]);
-    pi->gravity.grad_a[1][1] +=
-        (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi *
-        (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + Bi[1][2] * dx[2]);
-    pi->gravity.grad_a[1][2] +=
-        (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi *
-        (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]);
-
-    pi->gravity.grad_a[2][0] +=
-        (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi *
-        (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]);
-    pi->gravity.grad_a[2][1] +=
-        (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi *
-        (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + Bi[1][2] * dx[2]);
-    pi->gravity.grad_a[2][2] +=
-        (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi *
-        (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]);
   } else {
     /* Gradient matrix is not well-behaved, switch to SPH gradients */
 
@@ -558,27 +417,6 @@ hydro_gradients_nonsym_collect(float r2, float *dx, float hi, float hj,
         wi_dx * dx[1] * (pi->primitives.P - pj->primitives.P) / r;
     pi->primitives.gradients.P[2] -=
         wi_dx * dx[2] * (pi->primitives.P - pj->primitives.P) / r;
-
-    pi->gravity.grad_a[0][0] -=
-        wi_dx * dx[0] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r;
-    pi->gravity.grad_a[0][1] -=
-        wi_dx * dx[1] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r;
-    pi->gravity.grad_a[0][2] -=
-        wi_dx * dx[2] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r;
-
-    pi->gravity.grad_a[1][0] -=
-        wi_dx * dx[0] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r;
-    pi->gravity.grad_a[1][1] -=
-        wi_dx * dx[1] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r;
-    pi->gravity.grad_a[1][2] -=
-        wi_dx * dx[2] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r;
-
-    pi->gravity.grad_a[2][0] -=
-        wi_dx * dx[0] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r;
-    pi->gravity.grad_a[2][1] -=
-        wi_dx * dx[1] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r;
-    pi->gravity.grad_a[2][2] -=
-        wi_dx * dx[2] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r;
   }
 
   hydro_slope_limit_cell_collect(pi, pj, r);
@@ -618,17 +456,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_finalize(
     p->primitives.gradients.P[1] *= ihdim;
     p->primitives.gradients.P[2] *= ihdim;
 
-    p->gravity.grad_a[0][0] *= ihdim;
-    p->gravity.grad_a[0][1] *= ihdim;
-    p->gravity.grad_a[0][2] *= ihdim;
-
-    p->gravity.grad_a[1][0] *= ihdim;
-    p->gravity.grad_a[1][1] *= ihdim;
-    p->gravity.grad_a[1][2] *= ihdim;
-
-    p->gravity.grad_a[2][0] *= ihdim;
-    p->gravity.grad_a[2][1] *= ihdim;
-    p->gravity.grad_a[2][2] *= ihdim;
   } else {
     const float ihdimp1 = pow_dimension_plus_one(ih);
 
@@ -653,18 +480,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_finalize(
     p->primitives.gradients.P[0] *= ihdimp1 * volume;
     p->primitives.gradients.P[1] *= ihdimp1 * volume;
     p->primitives.gradients.P[2] *= ihdimp1 * volume;
-
-    p->gravity.grad_a[0][0] *= ihdimp1 * volume;
-    p->gravity.grad_a[0][1] *= ihdimp1 * volume;
-    p->gravity.grad_a[0][2] *= ihdimp1 * volume;
-
-    p->gravity.grad_a[1][0] *= ihdimp1 * volume;
-    p->gravity.grad_a[1][1] *= ihdimp1 * volume;
-    p->gravity.grad_a[1][2] *= ihdimp1 * volume;
-
-    p->gravity.grad_a[2][0] *= ihdimp1 * volume;
-    p->gravity.grad_a[2][1] *= ihdimp1 * volume;
-    p->gravity.grad_a[2][2] *= ihdimp1 * volume;
   }
 
   hydro_slope_limit_cell(p);
diff --git a/src/hydro/Gizmo/hydro_iact.h b/src/hydro/Gizmo/hydro_iact.h
index 8798dc859a790a83ab7a3b6f1709b1302f574581..0c7c8251b7d1c105dfc0c4b1637724accadaa4ae 100644
--- a/src/hydro/Gizmo/hydro_iact.h
+++ b/src/hydro/Gizmo/hydro_iact.h
@@ -20,6 +20,7 @@
  ******************************************************************************/
 
 #include "adiabatic_index.h"
+#include "hydro_flux_limiters.h"
 #include "hydro_gradients.h"
 #include "riemann.h"
 
@@ -57,7 +58,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
   kernel_deval(xi, &wi, &wi_dx);
 
   pi->density.wcount += wi;
-  pi->density.wcount_dh -= xi * wi_dx;
+  pi->density.wcount_dh -= (hydro_dimension * wi + xi * wi_dx);
 
   /* these are eqns. (1) and (2) in the summary */
   pi->geometry.volume += wi;
@@ -74,7 +75,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
   kernel_deval(xj, &wj, &wj_dx);
 
   pj->density.wcount += wj;
-  pj->density.wcount_dh -= xj * wj_dx;
+  pj->density.wcount_dh -= (hydro_dimension * wj + xj * wj_dx);
 
   /* these are eqns. (1) and (2) in the summary */
   pj->geometry.volume += wj;
@@ -121,7 +122,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
   kernel_deval(xi, &wi, &wi_dx);
 
   pi->density.wcount += wi;
-  pi->density.wcount_dh -= xi * wi_dx;
+  pi->density.wcount_dh -= (hydro_dimension * wi + xi * wi_dx);
 
   /* these are eqns. (1) and (2) in the summary */
   pi->geometry.volume += wi;
@@ -346,8 +347,11 @@ __attribute__((always_inline)) INLINE static void runner_iact_fluxes_common(
   }
   dvdotdx = (Wi[1] - Wj[1]) * dx[0] + (Wi[2] - Wj[2]) * dx[1] +
             (Wi[3] - Wj[3]) * dx[2];
-  if (dvdotdx > 0.) {
-    vmax -= dvdotdx / r;
+  dvdotdx = min(dvdotdx, (vi[0] - vj[0]) * dx[0] + (vi[1] - vj[1]) * dx[1] +
+                             (vi[2] - vj[2]) * dx[2]);
+  if (dvdotdx < 0.) {
+    /* the magical factor 3 also appears in Gadget2 */
+    vmax -= 3. * dvdotdx / r;
   }
   pi->timestepvars.vmax = max(pi->timestepvars.vmax, vmax);
   if (mode == 1) {
@@ -487,36 +491,10 @@ __attribute__((always_inline)) INLINE static void runner_iact_fluxes_common(
   float totflux[5];
   riemann_solve_for_flux(Wi, Wj, n_unit, vij, totflux);
 
-  /* Flux limiter */
-  float flux_limit_factor = 1.;
-  float timefac = max(dti, dtj);
-  float areafac = max(pi->geometry.Atot, pj->geometry.Atot);
-  if (totflux[0] * areafac * timefac > pi->conserved.mass) {
-    flux_limit_factor = pi->conserved.mass / (totflux[0] * areafac * timefac);
-  }
-  if (totflux[0] * areafac * timefac > pj->conserved.mass) {
-    flux_limit_factor =
-        min(pj->conserved.mass / (totflux[0] * areafac * timefac),
-            flux_limit_factor);
-  }
-  if (totflux[4] * areafac * timefac > pi->conserved.energy) {
-    flux_limit_factor =
-        min(pi->conserved.energy / (totflux[4] * areafac * timefac),
-            flux_limit_factor);
-  }
-  if (totflux[4] * areafac * timefac > pj->conserved.energy) {
-    flux_limit_factor =
-        min(pj->conserved.energy / (totflux[4] * areafac * timefac),
-            flux_limit_factor);
-  }
-  totflux[0] *= flux_limit_factor;
-  totflux[1] *= flux_limit_factor;
-  totflux[2] *= flux_limit_factor;
-  totflux[3] *= flux_limit_factor;
-  totflux[4] *= flux_limit_factor;
+  hydro_flux_limiters_apply(totflux, pi, pj);
 
   /* Store mass flux */
-  float mflux = mindt * Anorm * totflux[0];
+  float mflux = Anorm * totflux[0];
   pi->gravity.mflux[0] += mflux * dx[0];
   pi->gravity.mflux[1] += mflux * dx[1];
   pi->gravity.mflux[2] += mflux * dx[2];
@@ -554,7 +532,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_fluxes_common(
 
   if (mode == 1 || pj->force.active == 0) {
     /* Store mass flux */
-    mflux = mindt * Anorm * totflux[0];
+    mflux = Anorm * totflux[0];
     pj->gravity.mflux[0] -= mflux * dx[0];
     pj->gravity.mflux[1] -= mflux * dx[1];
     pj->gravity.mflux[2] -= mflux * dx[2];
diff --git a/src/hydro/Gizmo/hydro_io.h b/src/hydro/Gizmo/hydro_io.h
index 3d58be2f47c4e1904aaac5f69d1862f1d453e488..d20f7e2eb1cf50be7690e15a9569d8e9c4605af5 100644
--- a/src/hydro/Gizmo/hydro_io.h
+++ b/src/hydro/Gizmo/hydro_io.h
@@ -18,6 +18,7 @@
  ******************************************************************************/
 
 #include "adiabatic_index.h"
+#include "hydro_flux_limiters.h"
 #include "hydro_gradients.h"
 #include "hydro_slope_limiters.h"
 #include "io_properties.h"
@@ -127,7 +128,7 @@ float convert_Etot(struct engine* e, struct part* p) {
 void hydro_write_particles(struct part* parts, struct io_props* list,
                            int* num_fields) {
 
-  *num_fields = 11;
+  *num_fields = 10;
 
   /* List what we want to write */
   list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH,
@@ -152,8 +153,6 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
   list[9] =
       io_make_output_field_convert_part("TotEnergy", FLOAT, 1, UNIT_CONV_ENERGY,
                                         parts, conserved.energy, convert_Etot);
-  list[10] = io_make_output_field("GravAcceleration", FLOAT, 3,
-                                  UNIT_CONV_ACCELERATION, parts, gravity.old_a);
 }
 
 /**
@@ -171,6 +170,10 @@ void writeSPHflavour(hid_t h_grpsph) {
   io_write_attribute_s(h_grpsph, "Piecewise slope limiter model",
                        HYDRO_SLOPE_LIMITER_FACE_IMPLEMENTATION);
 
+  /* Flux limiter information */
+  io_write_attribute_s(h_grpsph, "Flux limiter model",
+                       HYDRO_FLUX_LIMITER_IMPLEMENTATION);
+
   /* Riemann solver information */
   io_write_attribute_s(h_grpsph, "Riemann solver type",
                        RIEMANN_SOLVER_IMPLEMENTATION);
diff --git a/src/hydro/Gizmo/hydro_part.h b/src/hydro/Gizmo/hydro_part.h
index 6c96004847ae23b46ec3f5182f742e0e84f1118d..47f722c5a2dcce2f3ce603ade3029821d6686067 100644
--- a/src/hydro/Gizmo/hydro_part.h
+++ b/src/hydro/Gizmo/hydro_part.h
@@ -153,10 +153,13 @@ struct part {
 
   } geometry;
 
-  /* Variables used for timestep calculation (currently not used). */
+  /* Variables used for timestep calculation. */
   struct {
 
-    /* Maximum fluid velocity among all neighbours. */
+    /* Maximum signal velocity among all the neighbours of the particle. The
+     * signal velocity encodes information about the relative fluid velocities
+     * AND particle velocities of the neighbour and this particle, as well as
+     * the sound speed of both particles. */
     float vmax;
 
   } timestepvars;
@@ -201,14 +204,6 @@ struct part {
   /* Specific stuff for the gravity-hydro coupling. */
   struct {
 
-    /* Previous value of the gravitational acceleration. */
-    float old_a[3];
-
-    float grad_a[3][3];
-
-    /* Previous value of the mass flux vector. */
-    float old_mflux[3];
-
     /* Current value of the mass flux vector. */
     float mflux[3];
 
diff --git a/src/hydro/Minimal/hydro.h b/src/hydro/Minimal/hydro.h
index 8f216a550ae061d01a594ff23d57575e754f85dc..4d8ca5b05547467c973e17983774b64736060471 100644
--- a/src/hydro/Minimal/hydro.h
+++ b/src/hydro/Minimal/hydro.h
@@ -219,12 +219,34 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
   p->rho += p->mass * kernel_root;
   p->density.rho_dh -= hydro_dimension * p->mass * kernel_root;
   p->density.wcount += kernel_root;
+  p->density.wcount_dh -= hydro_dimension * kernel_root;
 
   /* Finish the calculation by inserting the missing h-factors */
   p->rho *= h_inv_dim;
   p->density.rho_dh *= h_inv_dim_plus_one;
   p->density.wcount *= kernel_norm;
-  p->density.wcount_dh *= h_inv * kernel_gamma * kernel_norm;
+  p->density.wcount_dh *= h_inv_dim_plus_one;
+}
+
+/**
+ * @brief Sets all particle fields to sensible values when the #part has 0 ngbs.
+ *
+ * @param p The particle to act upon
+ * @param xp The extended particle data to act upon
+ */
+__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours(
+    struct part *restrict p, struct xpart *restrict xp) {
+
+  /* Some smoothing length multiples. */
+  const float h = p->h;
+  const float h_inv = 1.0f / h;                 /* 1/h */
+  const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */
+
+  /* Re-set problematic values */
+  p->rho = p->mass * kernel_root * h_inv_dim;
+  p->density.wcount = kernel_root * kernel_norm * h_inv_dim;
+  p->density.rho_dh = 0.f;
+  p->density.wcount_dh = 0.f;
 }
 
 /**
diff --git a/src/hydro/Minimal/hydro_iact.h b/src/hydro/Minimal/hydro_iact.h
index 169947b99e92d9bd1b0870d502a49e311820ff81..621177a3363e651e12dd728ad96ddadce3812f0e 100644
--- a/src/hydro/Minimal/hydro_iact.h
+++ b/src/hydro/Minimal/hydro_iact.h
@@ -51,23 +51,23 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 
   /* Compute density of pi. */
   const float hi_inv = 1.f / hi;
-  const float xi = r * hi_inv;
-  kernel_deval(xi, &wi, &wi_dx);
+  const float ui = r * hi_inv;
+  kernel_deval(ui, &wi, &wi_dx);
 
   pi->rho += mj * wi;
-  pi->density.rho_dh -= mj * (hydro_dimension * wi + xi * wi_dx);
+  pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
   pi->density.wcount += wi;
-  pi->density.wcount_dh -= xi * wi_dx;
+  pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
 
   /* Compute density of pj. */
   const float hj_inv = 1.f / hj;
-  const float xj = r * hj_inv;
-  kernel_deval(xj, &wj, &wj_dx);
+  const float uj = r * hj_inv;
+  kernel_deval(uj, &wj, &wj_dx);
 
   pj->rho += mi * wj;
-  pj->density.rho_dh -= mi * (hydro_dimension * wj + xj * wj_dx);
+  pj->density.rho_dh -= mi * (hydro_dimension * wj + uj * wj_dx);
   pj->density.wcount += wj;
-  pj->density.wcount_dh -= xj * wj_dx;
+  pj->density.wcount_dh -= (hydro_dimension * wj + uj * wj_dx);
 }
 
 /**
@@ -96,13 +96,13 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
   const float r = sqrtf(r2);
 
   const float h_inv = 1.f / hi;
-  const float xi = r * h_inv;
-  kernel_deval(xi, &wi, &wi_dx);
+  const float ui = r * h_inv;
+  kernel_deval(ui, &wi, &wi_dx);
 
   pi->rho += mj * wi;
-  pi->density.rho_dh -= mj * (hydro_dimension * wi + xi * wi_dx);
+  pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
   pi->density.wcount += wi;
-  pi->density.wcount_dh -= xi * wi_dx;
+  pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
 }
 
 /**
diff --git a/src/hydro/PressureEntropy/hydro.h b/src/hydro/PressureEntropy/hydro.h
index 4c4868cd3703e5ec5466d4878749a61284b19344..080b796b21d7f3b48191cd375574ae1de6d11d1a 100644
--- a/src/hydro/PressureEntropy/hydro.h
+++ b/src/hydro/PressureEntropy/hydro.h
@@ -212,14 +212,15 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
   p->density.pressure_dh -=
       hydro_dimension * p->mass * p->entropy_one_over_gamma * kernel_root;
   p->density.wcount += kernel_root;
+  p->density.wcount_dh -= hydro_dimension * kernel_root;
 
   /* Finish the calculation by inserting the missing h-factors */
   p->rho *= h_inv_dim;
   p->rho_bar *= h_inv_dim;
   p->density.rho_dh *= h_inv_dim_plus_one;
   p->density.pressure_dh *= h_inv_dim_plus_one;
-  p->density.wcount *= kernel_norm;
-  p->density.wcount_dh *= h_inv * kernel_gamma * kernel_norm;
+  p->density.wcount *= h_inv_dim;
+  p->density.wcount_dh *= h_inv_dim_plus_one;
 
   const float rho_inv = 1.f / p->rho;
   const float entropy_minus_one_over_gamma = 1.f / p->entropy_one_over_gamma;
@@ -236,6 +237,33 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
   p->density.div_v *= h_inv_dim_plus_one * rho_inv;
 }
 
+/**
+ * @brief Sets all particle fields to sensible values when the #part has 0 ngbs.
+ *
+ * @param p The particle to act upon
+ * @param xp The extended particle data to act upon
+ */
+__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours(
+    struct part *restrict p, struct xpart *restrict xp) {
+
+  /* Some smoothing length multiples. */
+  const float h = p->h;
+  const float h_inv = 1.0f / h;                 /* 1/h */
+  const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */
+
+  /* Re-set problematic values */
+  p->rho = p->mass * kernel_root * h_inv_dim;
+  p->rho_bar = p->mass * kernel_root * h_inv_dim;
+  p->density.wcount = kernel_root * kernel_norm * h_inv_dim;
+  p->density.rho_dh = 0.f;
+  p->density.wcount_dh = 0.f;
+  p->density.pressure_dh = 0.f;
+  p->density.div_v = 0.f;
+  p->density.rot_v[0] = 0.f;
+  p->density.rot_v[1] = 0.f;
+  p->density.rot_v[2] = 0.f;
+}
+
 /**
  * @brief Prepare a particle for the force calculation.
  *
diff --git a/src/hydro/PressureEntropy/hydro_iact.h b/src/hydro/PressureEntropy/hydro_iact.h
index ce1c38ca69954252dc804af9181b9060a14afcb9..37a9f2b01af16fe598b414a9f67123849bee1442 100644
--- a/src/hydro/PressureEntropy/hydro_iact.h
+++ b/src/hydro/PressureEntropy/hydro_iact.h
@@ -59,7 +59,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 
   /* Compute contribution to the number of neighbours */
   pi->density.wcount += wi;
-  pi->density.wcount_dh -= ui * wi_dx;
+  pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
 
   /* Compute contribution to the weighted density */
   pi->rho_bar += mj * pj->entropy_one_over_gamma * wi;
@@ -77,7 +77,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 
   /* Compute contribution to the number of neighbours */
   pj->density.wcount += wj;
-  pj->density.wcount_dh -= uj * wj_dx;
+  pj->density.wcount_dh -= (hydro_dimension * wj + uj * wj_dx);
 
   /* Compute contribution to the weighted density */
   pj->rho_bar += mi * pi->entropy_one_over_gamma * wj;
@@ -147,7 +147,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 
   /* Compute contribution to the number of neighbours */
   pi->density.wcount += wi;
-  pi->density.wcount_dh -= ui * wi_dx;
+  pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
 
   /* Compute contribution to the weighted density */
   pi->rho_bar += mj * pj->entropy_one_over_gamma * wi;
diff --git a/src/hydro/Shadowswift/hydro.h b/src/hydro/Shadowswift/hydro.h
index 0568d47ee7ed33c59790cbca943cccbf1ceda58f..abbcdcd2f7879d8063a906e44ab2fe6a3e675828 100644
--- a/src/hydro/Shadowswift/hydro.h
+++ b/src/hydro/Shadowswift/hydro.h
@@ -238,6 +238,25 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
 #endif
 }
 
+/**
+ * @brief Sets all particle fields to sensible values when the #part has 0 ngbs.
+ *
+ * @param p The particle to act upon
+ * @param xp The extended particle data to act upon
+ */
+__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours(
+    struct part* restrict p, struct xpart* restrict xp) {
+
+  /* Some smoothing length multiples. */
+  const float h = p->h;
+  const float h_inv = 1.0f / h;                 /* 1/h */
+  const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */
+
+  /* Re-set problematic values */
+  p->density.wcount = kernel_root * kernel_norm * h_inv_dim;
+  p->density.wcount_dh = 0.f;
+}
+
 /**
  * @brief Prepare a particle for the gradient calculation.
  *
diff --git a/src/hydro_properties.c b/src/hydro_properties.c
index 818c1b6349192ed73b28cd4c3ae771f89a3754cd..1e7554f7d84220b8c962d60cc4538c685b5bad52 100644
--- a/src/hydro_properties.c
+++ b/src/hydro_properties.c
@@ -33,16 +33,26 @@
 #include "kernel_hydro.h"
 
 #define hydro_props_default_max_iterations 30
-#define hydro_props_default_volume_change 2.0f
+#define hydro_props_default_volume_change 1.4f
 #define hydro_props_default_h_max FLT_MAX
+#define hydro_props_default_h_tolerance 1e-4
 
 void hydro_props_init(struct hydro_props *p,
                       const struct swift_params *params) {
 
   /* Kernel properties */
   p->eta_neighbours = parser_get_param_float(params, "SPH:resolution_eta");
+
+  /* Tolerance for the smoothing length Newton-Raphson scheme */
+  p->h_tolerance = parser_get_opt_param_float(params, "SPH:h_tolerance",
+                                              hydro_props_default_h_tolerance);
+
+  /* Get derived properties */
   p->target_neighbours = pow_dimension(p->eta_neighbours) * kernel_norm;
-  p->delta_neighbours = parser_get_param_float(params, "SPH:delta_neighbours");
+  const float delta_eta = p->eta_neighbours * (1.f + p->h_tolerance);
+  p->delta_neighbours =
+      (pow_dimension(delta_eta) - pow_dimension(p->eta_neighbours)) *
+      kernel_norm;
 
 #ifdef SHADOWFAX_SPH
   /* change the meaning of target_neighbours and delta_neighbours */
@@ -81,9 +91,11 @@ void hydro_props_print(const struct hydro_props *p) {
   message("Hydrodynamic scheme: %s in %dD.", SPH_IMPLEMENTATION,
           (int)hydro_dimension);
 
-  message("Hydrodynamic kernel: %s with %.2f +/- %.2f neighbours (eta=%f).",
-          kernel_name, p->target_neighbours, p->delta_neighbours,
-          p->eta_neighbours);
+  message("Hydrodynamic kernel: %s with eta=%f (%.2f neighbours).", kernel_name,
+          p->eta_neighbours, p->target_neighbours);
+
+  message("Hydrodynamic relative tolerance in h: %.5f (+/- %.4f neighbours).",
+          p->h_tolerance, p->delta_neighbours);
 
   message("Hydrodynamic integration: CFL parameter: %.4f.", p->CFL_condition);
 
@@ -110,6 +122,7 @@ void hydro_props_print_snapshot(hid_t h_grpsph, const struct hydro_props *p) {
   io_write_attribute_f(h_grpsph, "Kernel target N_ngb", p->target_neighbours);
   io_write_attribute_f(h_grpsph, "Kernel delta N_ngb", p->delta_neighbours);
   io_write_attribute_f(h_grpsph, "Kernel eta", p->eta_neighbours);
+  io_write_attribute_f(h_grpsph, "Smoothing length tolerance", p->h_tolerance);
   io_write_attribute_f(h_grpsph, "Maximal smoothing length", p->h_max);
   io_write_attribute_f(h_grpsph, "CFL parameter", p->CFL_condition);
   io_write_attribute_f(h_grpsph, "Volume log(max(delta h))",
diff --git a/src/hydro_properties.h b/src/hydro_properties.h
index 716c4c060c21eb95d05f9d50e13d4681a958a6fd..a887ccb6df13b649cd1ef1009059c6f08908669c 100644
--- a/src/hydro_properties.h
+++ b/src/hydro_properties.h
@@ -16,10 +16,14 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  *
  ******************************************************************************/
-
 #ifndef SWIFT_HYDRO_PROPERTIES
 #define SWIFT_HYDRO_PROPERTIES
 
+/**
+ * @file hydro_properties.h
+ * @brief Contains all the constants and parameters of the hydro scheme
+ */
+
 /* Config parameters. */
 #include "../config.h"
 
@@ -35,19 +39,28 @@
  */
 struct hydro_props {
 
-  /* Kernel properties */
+  /*! Resolution parameter */
   float eta_neighbours;
+
+  /*! Target weighted number of neighbours (for info only)*/
   float target_neighbours;
+
+  /*! Smoothing length tolerance */
+  float h_tolerance;
+
+  /*! Tolerance on neighbour number (for info only)*/
   float delta_neighbours;
 
-  /* Maximal smoothing length */
+  /*! Maximal smoothing length */
   float h_max;
 
-  /* Number of iterations to converge h */
+  /*! Maximal number of iterations to converge h */
   int max_smoothing_iterations;
 
-  /* Time integration properties */
+  /*! Time integration properties */
   float CFL_condition;
+
+  /*! Maximal change of h over one time-step */
   float log_max_h_change;
 };
 
diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h
index 45384e1aabb0189fd69a6a3cff122df95706af85..2e0f457d05c926fc1efa4fd334e7c8cc69189133 100644
--- a/src/kernel_hydro.h
+++ b/src/kernel_hydro.h
@@ -341,20 +341,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
 
 /* ------------------------------------------------------------------------- */
 
-#ifdef WITH_VECTORIZATION
-
-static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv);
-
-static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
-
-static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
-
-static const vector kernel_gamma_inv_dim_vec =
-    FILL_VEC((float)kernel_gamma_inv_dim);
-
-static const vector kernel_gamma_inv_dim_plus_one_vec =
-    FILL_VEC((float)kernel_gamma_inv_dim_plus_one);
-
+#ifdef WITH_OLD_VECTORIZATION
 /**
  * @brief Computes the kernel function and its derivative (Vectorised version).
  *
@@ -373,7 +360,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
 
   /* Load x and get the interval id. */
   vector ind;
-  ind.m = vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v));
+  ind.m =
+      vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v));
 
   /* load the coefficients. */
   vector c[kernel_degree + 1];
@@ -392,9 +380,26 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
   }
 
   /* Return everything */
-  w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
-  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
+  w->v =
+      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
 }
+#endif
+
+#ifdef WITH_VECTORIZATION
+
+static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv);
+
+static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
+
+static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
+
+static const vector kernel_gamma_inv_dim_vec =
+    FILL_VEC((float)kernel_gamma_inv_dim);
+
+static const vector kernel_gamma_inv_dim_plus_one_vec =
+    FILL_VEC((float)kernel_gamma_inv_dim_plus_one);
 
 /* Define constant vectors for the Wendland C2 and Cubic Spline kernel
  * coefficients. */
@@ -468,14 +473,15 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
   w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector w2, dw_dx2;
-  mask_t mask_reg1, mask_reg2;
+  mask_t mask_reg;
 
-  /* Form a mask for each part of the kernel. */
-  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));  /* 0 < x < 0.5 */
-  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
+  /* Form a mask for one part of the kernel. */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
 
   /* Work out w for both regions of the kernel and combine the results together
-   * using masks. */
+   * using a mask. */
 
   /* Init the iteration for Horner's scheme. */
   w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
@@ -494,20 +500,17 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
   w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
   w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
 
-  /* Mask out unneeded values. */
-  w->v = vec_and_mask(w->v, mask_reg1);
-  w2.v = vec_and_mask(w2.v, mask_reg2);
-  dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
-  dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
+  /* Blend both kernel regions into one vector (mask out unneeded values). */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  w->v = vec_blend(mask_reg, w->v, w2.v);
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
 
-  /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v, w2.v);
-  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
 #else
 #error "Vectorisation not supported for this kernel!!!"
 #endif
 
-  /* Return everything */
+  /* Return everything */
   w->v =
       vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
   dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
@@ -579,13 +582,13 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector w_2, dw_dx_2;
   vector w2_2, dw_dx2_2;
-  mask_t mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
+  mask_t mask_reg, mask_reg_v2;
 
-  /* Form a mask for each part of the kernel. */
-  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));      /* 0 < x < 0.5 */
-  vec_create_mask(mask_reg1_v2, vec_cmp_lt(x2.v, cond.v));  /* 0 < x < 0.5 */
-  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v));     /* 0.5 < x < 1 */
-  vec_create_mask(mask_reg2_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */
+  /* Form a mask for one part of the kernel for each vector. */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v));     /* 0.5 < x < 1 */
+  vec_create_mask(mask_reg_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */
 
   /* Work out w for both regions of the kernel and combine the results together
    * using masks. */
@@ -619,29 +622,23 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
   w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c3.v);
   w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v);
 
-  /* Mask out unneeded values. */
-  w->v = vec_and_mask(w->v, mask_reg1);
-  w2->v = vec_and_mask(w2->v, mask_reg1_v2);
-  w_2.v = vec_and_mask(w_2.v, mask_reg2);
-  w2_2.v = vec_and_mask(w2_2.v, mask_reg2_v2);
-  dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
-  dw_dx2->v = vec_and_mask(dw_dx2->v, mask_reg1_v2);
-  dw_dx_2.v = vec_and_mask(dw_dx_2.v, mask_reg2);
-  dw_dx2_2.v = vec_and_mask(dw_dx2_2.v, mask_reg2_v2);
-
-  /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v, w_2.v);
-  w2->v = vec_add(w2->v, w2_2.v);
-  dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v);
-  dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v);
+  /* Blend both kernel regions into one vector (mask out unneeded values). */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  w->v = vec_blend(mask_reg, w->v, w_2.v);
+  w2->v = vec_blend(mask_reg_v2, w2->v, w2_2.v);
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx_2.v);
+  dw_dx2->v = vec_blend(mask_reg_v2, dw_dx2->v, dw_dx2_2.v);
 
   /* Return everything */
-  w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
-  w2->v = vec_mul(w2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
-  dw_dx->v =
-      vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
-  dw_dx2->v =
-      vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
+  w->v =
+      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  w2->v = vec_mul(w2->v,
+                  vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
+  dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v,
+                                         kernel_gamma_inv_dim_plus_one_vec.v));
 
 #endif
 }
@@ -672,12 +669,13 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
   w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector w2;
-  mask_t mask_reg1, mask_reg2;
+  mask_t mask_reg;
 
   /* Form a mask for each part of the kernel. */
-  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));  /* 0 < x < 0.5 */
-  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
-  
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
+
   /* Work out w for both regions of the kernel and combine the results together
    * using masks. */
 
@@ -693,11 +691,10 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
   w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
 
   /* Mask out unneeded values. */
-  w->v = vec_and_mask(w->v, mask_reg1);
-  w2.v = vec_and_mask(w2.v, mask_reg2);
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  w->v = vec_blend(mask_reg, w->v, w2.v);
 
-  /* Added both w and w2 together to form complete result. */
-  w->v = vec_add(w->v, w2.v);
 #else
 #error "Vectorisation not supported for this kernel!!!"
 #endif
@@ -796,11 +793,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec(
 
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector dw_dx2;
-  mask_t mask_reg1, mask_reg2;
+  mask_t mask_reg;
 
   /* Form a mask for each part of the kernel. */
-  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));  /* 0 < x < 0.5 */
-  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
 
   /* Work out w for both regions of the kernel and combine the results together
    * using masks. */
@@ -814,18 +812,17 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec(
   dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
 
   /* Mask out unneeded values. */
-  dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
-  dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
 
-  /* Added both dwdx and dwdx2 together to form complete result. */
-  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
 #else
 #error "Vectorisation not supported for this kernel!!!"
 #endif
 
   /* Mask out result for particles that lie outside of the kernel function. */
   mask_t mask;
-  vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f)));  /* x < 1 */
+  vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */
 
   dw_dx->v = vec_and_mask(dw_dx->v, mask);
 
@@ -842,6 +839,10 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec(
  *
  * @param u The ratio of the distance to the smoothing length $u = x/h$.
  * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
+ * @param u_2 The ratio of the distance to the smoothing length $u = x/h$ for
+ * second particle.
+ * @param dw_dx_2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for
+ * second particle.
  */
 __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
     vector *u, vector *dw_dx, vector *u_2, vector *dw_dx_2) {
@@ -869,15 +870,15 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
 
 #elif defined(CUBIC_SPLINE_KERNEL)
   vector dw_dx2, dw_dx2_2;
-  mask_t mask_reg1, mask_reg2;
-  mask_t mask_reg1_2, mask_reg2_2;
+  mask_t mask_reg;
+  mask_t mask_reg_v2;
+
+  /* Form a mask for one part of the kernel. */
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v));      /* 0.5 < x < 1 */
+  vec_create_mask(mask_reg_v2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */
 
-  /* Form a mask for each part of the kernel. */
-  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));      /* 0 < x < 0.5 */
-  vec_create_mask(mask_reg1_2, vec_cmp_lt(x_2.v, cond.v));  /* 0 < x < 0.5 */
-  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v));     /* 0.5 < x < 1 */
-  vec_create_mask(mask_reg2_2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */
-  
   /* Work out w for both regions of the kernel and combine the results together
    * using masks. */
 
@@ -894,22 +895,19 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
   dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v);
 
   /* Mask out unneeded values. */
-  dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
-  dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_reg1_2);
-  dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
-  dw_dx2_2.v = vec_and_mask(dw_dx2_2.v, mask_reg2_2);
+  /* Only need the mask for one region as the vec_blend defaults to the vector
+   * when the mask is 0.*/
+  dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
+  dw_dx_2->v = vec_blend(mask_reg_v2, dw_dx_2->v, dw_dx2_2.v);
 
-  /* Added both dwdx and dwdx2 together to form complete result. */
-  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
-  dw_dx_2->v = vec_add(dw_dx_2->v, dw_dx2_2.v);
 #else
 #error "Vectorisation not supported for this kernel!!!"
 #endif
 
   /* Mask out result for particles that lie outside of the kernel function. */
   mask_t mask, mask_2;
-  vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f)));  /* x < 1 */
-  vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f)));  /* x < 1 */
+  vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f)));     /* x < 1 */
+  vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f))); /* x < 1 */
 
   dw_dx->v = vec_and_mask(dw_dx->v, mask);
   dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_2);
diff --git a/src/kernel_long_gravity.h b/src/kernel_long_gravity.h
index 7b1c5984647c3be232770dc32fc1b112ad8bee94..ec31c2743079da22d1f3dd0c8683adf674aca1e3 100644
--- a/src/kernel_long_gravity.h
+++ b/src/kernel_long_gravity.h
@@ -19,33 +19,67 @@
 #ifndef SWIFT_KERNEL_LONG_GRAVITY_H
 #define SWIFT_KERNEL_LONG_GRAVITY_H
 
-#include <math.h>
+/* Config parameters. */
+#include "../config.h"
 
-/* Includes. */
+/* Local headers. */
+#include "approx_math.h"
 #include "const.h"
 #include "inline.h"
-#include "vector.h"
 
-#define one_over_sqrt_pi ((float)(M_2_SQRTPI * 0.5))
+/* Standard headers */
+#include <math.h>
 
 /**
  * @brief Computes the long-range correction term for the FFT calculation.
  *
- * @param u The ratio of the distance to the FFT cell scale $u = x/A$.
+ * @param u The ratio of the distance to the FFT cell scale \f$u = r/r_s\f$.
  * @param W (return) The value of the kernel function.
  */
 __attribute__((always_inline)) INLINE static void kernel_long_grav_eval(
     float u, float *const W) {
 
-  /* const float arg1 = u * 0.5f; */
-  /* const float arg2 = u * one_over_sqrt_pi; */
-  /* const float arg3 = -arg1 * arg1; */
+#ifdef GADGET2_LONG_RANGE_CORRECTION
+
+  const float one_over_sqrt_pi = ((float)(M_2_SQRTPI * 0.5));
+
+  const float arg1 = u * 0.5f;
+  const float arg2 = u * one_over_sqrt_pi;
+  const float arg3 = -arg1 * arg1;
+
+  const float term1 = erfcf(arg1);
+  const float term2 = arg2 * expf(arg3);
+
+  *W = term1 + term2;
+#else
+
+  const float arg = 2.f * u;
+  const float exp_arg = good_approx_expf(arg);
+  const float term = 1.f / (1.f + exp_arg);
 
-  /* const float term1 = erfcf(arg1); */
-  /* const float term2 = arg2 * expf(arg3); */
+  *W = arg * exp_arg * term * term - exp_arg * term + 1.f;
+  *W *= 2.f;
+#endif
+}
+
+/**
+ * @brief Returns the long-range truncation of the Poisson potential in Fourier
+ * space.
+ *
+ * @param u2 The square of the Fourier mode times the cell scale
+ * \f$u^2 = k^2r_s^2\f$.
+ * @param W (return) The value of the kernel function.
+ */
+__attribute__((always_inline)) INLINE static void fourier_kernel_long_grav_eval(
+    double u2, double *const W) {
 
-  /* *W = term1 + term2; */
-  *W = 1.f;
+#ifdef GADGET2_LONG_RANGE_CORRECTION
+  *W = exp(-u2);
+#else
+  const double u = sqrt(u2);
+  const double arg = M_PI_2 * u;
+  *W = arg / sinh(arg);
+#endif
 }
 
 #endif  // SWIFT_KERNEL_LONG_GRAVITY_H
diff --git a/src/multipole.h b/src/multipole.h
index 23f5194a30b7316aac15073cba36dc404efa21c1..004757924cccb6bc2f450c19f1ccd600f50e1990 100644
--- a/src/multipole.h
+++ b/src/multipole.h
@@ -1498,23 +1498,28 @@ INLINE static void gravity_M2M(struct multipole *m_a,
  * @param pos_a The position of the multipole.
  * @param props The #gravity_props of this calculation.
  * @param periodic Is the calculation periodic ?
+ * @param dim The size of the simulation box.
  */
 INLINE static void gravity_M2L(struct grav_tensor *l_b,
                                const struct multipole *m_a,
                                const double pos_b[3], const double pos_a[3],
-                               const struct gravity_props *props,
-                               int periodic) {
+                               const struct gravity_props *props, int periodic,
+                               const double dim[3]) {
 
   /* Recover some constants */
   const double eps2 = props->epsilon2;
 
   /* Compute distance vector */
-  const double dx =
-      periodic ? box_wrap(pos_b[0] - pos_a[0], 0., 1.) : pos_b[0] - pos_a[0];
-  const double dy =
-      periodic ? box_wrap(pos_b[1] - pos_a[1], 0., 1.) : pos_b[1] - pos_a[1];
-  const double dz =
-      periodic ? box_wrap(pos_b[2] - pos_a[2], 0., 1.) : pos_b[2] - pos_a[2];
+  double dx = pos_b[0] - pos_a[0];
+  double dy = pos_b[1] - pos_a[1];
+  double dz = pos_b[2] - pos_a[2];
+
+  /* Apply BC */
+  if (periodic) {
+    dx = nearest(dx, dim[0]);
+    dy = nearest(dy, dim[1]);
+    dz = nearest(dz, dim[2]);
+  }
 
   /* Compute distance */
   const double r2 = dx * dx + dy * dy + dz * dz;
@@ -2174,12 +2179,10 @@ INLINE static void gravity_M2L(struct grav_tensor *l_b,
  * @param lb The #grav_tensor to shift.
  * @param pos_a The position to which m_b will be shifted.
  * @param pos_b The current postion of the multipole to shift.
- * @param periodic Is the calculation periodic ?
  */
 INLINE static void gravity_L2L(struct grav_tensor *la,
                                const struct grav_tensor *lb,
-                               const double pos_a[3], const double pos_b[3],
-                               int periodic) {
+                               const double pos_a[3], const double pos_b[3]) {
 
   /* Initialise everything to zero */
   gravity_field_tensors_init(la);
@@ -2636,31 +2639,50 @@ INLINE static void gravity_L2P(const struct grav_tensor *lb,
 
 /**
  * @brief Checks whether a cell-cell interaction can be appromixated by a M-M
- * interaction.
+ * interaction using the CoM and cell radius at rebuild.
+ *
+ * We use the multipole acceptance criterion of Dehnen, 2002, JCoPh, Volume 179,
+ * Issue 1, pp.27-42, equation 10.
  *
  * @param ma The #multipole of the first #cell.
  * @param mb The #multipole of the second #cell.
  * @param theta_crit_inv The inverse of the critical opening angle.
- * @param rebuild Are we using the current value of CoM or the ones from
- * the last rebuild ?
+ * @param r2 Square of the distance (periodically wrapped) between the
+ * multipoles.
  */
-__attribute__((always_inline)) INLINE static int gravity_multipole_accept(
-    const struct gravity_tensors *ma, const struct gravity_tensors *mb,
-    double theta_crit_inv, int rebuild) {
+__attribute__((always_inline)) INLINE static int
+gravity_multipole_accept_rebuild(const struct gravity_tensors *const ma,
+                                 const struct gravity_tensors *const mb,
+                                 double theta_crit_inv, double r2) {
 
-  const double r_crit_a =
-      (rebuild ? ma->r_max_rebuild : ma->r_max) * theta_crit_inv;
-  const double r_crit_b =
-      (rebuild ? mb->r_max_rebuild : mb->r_max) * theta_crit_inv;
+  const double r_crit_a = ma->r_max_rebuild * theta_crit_inv;
+  const double r_crit_b = mb->r_max_rebuild * theta_crit_inv;
 
-  const double dx = rebuild ? ma->CoM_rebuild[0] - mb->CoM_rebuild[0]
-                            : ma->CoM[0] - mb->CoM[0];
-  const double dy = rebuild ? ma->CoM_rebuild[1] - mb->CoM_rebuild[1]
-                            : ma->CoM[1] - mb->CoM[1];
-  const double dz = rebuild ? ma->CoM_rebuild[2] - mb->CoM_rebuild[2]
-                            : ma->CoM[2] - mb->CoM[2];
+  // MATTHIEU: Make this mass-dependent ?
 
-  const double r2 = dx * dx + dy * dy + dz * dz;
+  /* Multipole acceptance criterion (Dehnen 2002, eq.10) */
+  return (r2 > (r_crit_a + r_crit_b) * (r_crit_a + r_crit_b));
+}
+
+/**
+ * @brief Checks whether a cell-cell interaction can be approximated by a M-M
+ * interaction using the CoM and cell radius at the current time.
+ *
+ * We use the multipole acceptance criterion of Dehnen, 2002, JCoPh, Volume 179,
+ * Issue 1, pp.27-42, equation 10.
+ *
+ * @param ma The #multipole of the first #cell.
+ * @param mb The #multipole of the second #cell.
+ * @param theta_crit_inv The inverse of the critical opening angle.
+ * @param r2 Square of the distance (periodically wrapped) between the
+ * multipoles.
+ */
+__attribute__((always_inline)) INLINE static int gravity_multipole_accept(
+    const struct gravity_tensors *const ma,
+    const struct gravity_tensors *const mb, double theta_crit_inv, double r2) {
+
+  const double r_crit_a = ma->r_max * theta_crit_inv;
+  const double r_crit_b = mb->r_max * theta_crit_inv;
 
   // MATTHIEU: Make this mass-dependent ?
 
diff --git a/src/parallel_io.c b/src/parallel_io.c
index b857fd76a53738b19e5b26b8717881e71c424b6e..65f8fc9c20b1856a9c2f72625fb3bba0c8f7be8e 100644
--- a/src/parallel_io.c
+++ b/src/parallel_io.c
@@ -667,7 +667,7 @@ void write_output_parallel(struct engine* e, const char* baseName,
 
   /* File name */
   char fileName[FILENAME_BUFFER_SIZE];
-  snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%03i.hdf5", baseName,
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%04i.hdf5", baseName,
            outputCount);
 
   /* First time, we need to create the XMF file */
diff --git a/src/parser.c b/src/parser.c
index 41a3e8637630eceb3beb9383acb3344028d38659..0b608b29263342240af68fd99d2fdd3241e2a1e6 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -1,6 +1,7 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk)
+ *               2017 Peter W. Draper (p.w.draper@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -89,6 +90,64 @@ void parser_read_file(const char *file_name, struct swift_params *params) {
   fclose(file);
 }
 
+/**
+ * @brief Set or update a parameter using a compressed format.
+ *
+ * The compressed format allows a value to be given as a single
+ * string and has the format "section:parameter:value", with all
+ * names as would be given in the parameter file.
+ *
+ * @param params Structure that holds the parameters.
+ * @param namevalue the parameter name and value as described.
+ */
+void parser_set_param(struct swift_params *params, const char *namevalue) {
+
+  /* Get the various parts. */
+  char name[PARSER_MAX_LINE_SIZE];
+  char value[PARSER_MAX_LINE_SIZE];
+  name[0] = '\0';
+  value[0] = '\0';
+
+  /* Name is part until second colon. */
+  char *p1 = strchr(namevalue, ':');
+  if (p1 != NULL) {
+    char *p2 = strchr(p1 + 1, ':');
+    if (p2 != NULL) {
+      memcpy(name, namevalue, p2 - namevalue);
+      name[p2 - namevalue] = '\0';
+
+      /* Value is rest after second colon. */
+      p2++;
+      strcpy(value, p2);
+    }
+  }
+
+  /* Sanity check. */
+  if (strlen(name) == 0 || strlen(value) == 0 || strchr(value, ':') != NULL)
+    error(
+        "Cannot parse compressed parameter string: '%s', check syntax "
+        "should be section:parameter:value",
+        namevalue);
+
+  /* And update or set. */
+  int updated = 0;
+  for (int i = 0; i < params->paramCount; i++) {
+    if (strcmp(name, params->data[i].name) == 0) {
+      message("Value of '%s' changed from '%s' to '%s'", params->data[i].name,
+              params->data[i].value, value);
+      strcpy(params->data[i].value, value);
+      updated = 1;
+    }
+  }
+  if (!updated) {
+    strcpy(params->data[params->paramCount].name, name);
+    strcpy(params->data[params->paramCount].value, value);
+    params->paramCount++;
+    if (params->paramCount == PARSER_MAX_NO_OF_PARAMS)
+      error("Too many parameters, current maximum is %d.", params->paramCount);
+  }
+}
+
 /**
  * @brief Counts the number of times a specific character appears in a string.
  *
@@ -238,7 +297,7 @@ static void parse_value(char *line, struct swift_params *params) {
 
   /* Check for more than one value on the same line. */
   if (count_char(line, PARSER_VALUE_CHAR) > 1) {
-    error("Inavlid line:%d '%s', only one value allowed per line.", lineNumber,
+    error("Invalid line:%d '%s', only one value allowed per line.", lineNumber,
           line);
   }
 
diff --git a/src/parser.h b/src/parser.h
index b78e21194d256ed7b50b8a09718c9725d52a1e0b..bab6d8b25f5334546ac2aaf39a3f25ef7fb6ff57 100644
--- a/src/parser.h
+++ b/src/parser.h
@@ -1,6 +1,7 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk)
+ *               2017 Peter W. Draper (p.w.draper@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -55,6 +56,7 @@ void parser_read_file(const char *file_name, struct swift_params *params);
 void parser_print_params(const struct swift_params *params);
 void parser_write_params_to_file(const struct swift_params *params,
                                  const char *file_name);
+void parser_set_param(struct swift_params *params, const char *desc);
 
 char parser_get_param_char(const struct swift_params *params, const char *name);
 int parser_get_param_int(const struct swift_params *params, const char *name);
diff --git a/src/partition.c b/src/partition.c
index c57918745c11d2858b40eefc218e2551e635d6fb..f30e5d0ad3c9ce8750a39891b2527729d9ad3b5d 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -897,27 +897,7 @@ void partition_initial_partition(struct partition *initial_partition,
       bzero(weights, sizeof(int) * s->nr_cells);
 
       /* Check each particle and accumilate the counts per cell. */
-      struct part *parts = s->parts;
-      int *cdim = s->cdim;
-      double iwidth[3], dim[3];
-      iwidth[0] = s->iwidth[0];
-      iwidth[1] = s->iwidth[1];
-      iwidth[2] = s->iwidth[2];
-      dim[0] = s->dim[0];
-      dim[1] = s->dim[1];
-      dim[2] = s->dim[2];
-      for (size_t k = 0; k < s->nr_parts; k++) {
-        for (int j = 0; j < 3; j++) {
-          if (parts[k].x[j] < 0.0)
-            parts[k].x[j] += dim[j];
-          else if (parts[k].x[j] >= dim[j])
-            parts[k].x[j] -= dim[j];
-        }
-        const int cid =
-            cell_getid(cdim, parts[k].x[0] * iwidth[0],
-                       parts[k].x[1] * iwidth[1], parts[k].x[2] * iwidth[2]);
-        weights[cid]++;
-      }
+      accumulate_counts(s, weights);
 
       /* Get all the counts from all the nodes. */
       if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, MPI_SUM,
@@ -1090,6 +1070,10 @@ void partition_init(struct partition *partition,
       parser_get_opt_param_float(params, "DomainDecomposition:trigger", 0.05f);
   if (repartition->trigger <= 0)
     error("Invalid DomainDecomposition:trigger, must be greater than zero");
+  if (repartition->trigger < 2 && repartition->trigger >= 1)
+    error(
+        "Invalid DomainDecomposition:trigger, must be 2 or greater or less"
+        " than 1");
 
   /* Fraction of particles that should be updated before a repartition
    * based on CPU time is considered. */
diff --git a/src/potential/disc_patch/potential.h b/src/potential/disc_patch/potential.h
index 8fa40ecd4e6503cde8be00db8c6fb8a70c84ebdf..ab229d009c692db727e8f2341c3c49813f74f2b8 100644
--- a/src/potential/disc_patch/potential.h
+++ b/src/potential/disc_patch/potential.h
@@ -30,6 +30,7 @@
 /* Local includes. */
 #include "const.h"
 #include "error.h"
+#include "minmax.h"
 #include "parser.h"
 #include "part.h"
 #include "physical_constants.h"
@@ -39,34 +40,63 @@
 /**
  * @brief External Potential Properties - Disc patch case
  *
- * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948
+ * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948.
+ *
+ * We truncate the accelerations beyond z_trunc using a 1-cos(z) function
+ * that smoothly brings the accelerations to 0 at z_max.
  */
 struct external_potential {
 
-  /*! Surface density of the disc */
-  double surface_density;
+  /*! Surface density of the disc (sigma) */
+  float surface_density;
+
+  /*! Disc scale-height (b) */
+  float scale_height;
+
+  /*! Inverse of disc scale-height (1/b) */
+  float scale_height_inv;
+
+  /*! Position of the disc along the x-axis */
+  float x_disc;
 
-  /*! Disc scale-height */
-  double scale_height;
+  /*! Position above which the accelerations get truncated */
+  float x_trunc;
 
-  /*! Position of the disc along the z-axis */
-  double z_disc;
+  /*! Position above which the accelerations are zero */
+  float x_max;
+
+  /*! The truncated transition regime */
+  float x_trans;
+
+  /*! Inverse of the truncated transition regime */
+  float x_trans_inv;
 
   /*! Dynamical time of the system */
-  double dynamical_time;
+  float dynamical_time;
 
-  /*! Time over which to grow the disk in units of the dynamical time */
-  double growth_time;
+  /*! Time over which to grow the disk */
+  float growth_time;
+
+  /*! Inverse of the growth time */
+  float growth_time_inv;
 
   /*! Time-step condition pre-factor */
-  double timestep_mult;
+  float timestep_mult;
+
+  /*! Constant pre-factor (2 pi G sigma) */
+  float norm;
+
+  /*! Constant pre-factor (2 pi sigma)*/
+  float norm_over_G;
 };
 
 /**
  * @brief Computes the time-step from the acceleration due to a hydrostatic
  * disc.
  *
- * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948
+ * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948,
+ * equations 17 and 20.
+ * We do not use the truncated potential here.
  *
  * @param time The current time.
  * @param potential The properties of the potential.
@@ -80,39 +110,41 @@ __attribute__((always_inline)) INLINE static float external_gravity_timestep(
 
   /* initilize time step to disc dynamical time */
   const float dt_dyn = potential->dynamical_time;
-  float dt = dt_dyn;
+  const float b = potential->scale_height;
+  const float b_inv = potential->scale_height_inv;
+  const float norm = potential->norm;
 
   /* absolute value of height above disc */
-  const float dz = fabsf(g->x[2] - potential->z_disc);
+  const float dx = fabsf(g->x[0] - potential->x_disc);
 
   /* vertical acceleration */
-  const float z_accel = 2.f * M_PI * phys_const->const_newton_G *
-                        potential->surface_density *
-                        tanhf(dz / potential->scale_height);
+  const float x_accel = norm * tanhf(dx * b_inv);
+
+  float dt = dt_dyn;
 
   /* demand that dt * velocity <  fraction of scale height of disc */
-  float dt1 = FLT_MAX;
-  if (g->v_full[2] != 0.f) {
-    dt1 = potential->scale_height / fabsf(g->v_full[2]);
-    if (dt1 < dt) dt = dt1;
+  if (g->v_full[0] != 0.f) {
+
+    const float dt1 = b / fabsf(g->v_full[0]);
+    dt = min(dt1, dt);
   }
 
   /* demand that dt^2 * acceleration < fraction of scale height of disc */
-  float dt2 = FLT_MAX;
-  if (z_accel != 0.f) {
-    dt2 = potential->scale_height / fabsf(z_accel);
+  if (x_accel != 0.f) {
+
+    const float dt2 = b / fabsf(x_accel);
     if (dt2 < dt * dt) dt = sqrtf(dt2);
   }
 
   /* demand that dt^3 * jerk < fraction of scale height of disc */
-  float dt3 = FLT_MAX;
-  if (g->v_full[2] != 0.f) {
-    const float dz_accel_over_dt =
-        2.f * M_PI * phys_const->const_newton_G * potential->surface_density /
-        potential->scale_height / coshf(dz / potential->scale_height) /
-        coshf(dz / potential->scale_height) * fabsf(g->v_full[2]);
-
-    dt3 = potential->scale_height / fabsf(dz_accel_over_dt);
+  if (g->v_full[0] != 0.f) {
+
+    const float cosh_dx_inv = 1.f / coshf(dx * b_inv);
+    const float cosh_dx_inv2 = cosh_dx_inv * cosh_dx_inv;
+    const float dx_accel_over_dt =
+        norm * cosh_dx_inv2 * b_inv * fabsf(g->v_full[0]);
+
+    const float dt3 = b / fabsf(dx_accel_over_dt);
     if (dt3 < dt * dt * dt) dt = cbrtf(dt3);
   }
 
@@ -120,11 +152,13 @@ __attribute__((always_inline)) INLINE static float external_gravity_timestep(
 }
 
 /**
- * @brief Computes the gravitational acceleration along z due to a hydrostatic
+ * @brief Computes the gravitational acceleration along x due to a hydrostatic
  * disc
  *
  * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948,
  * equation 17.
+ * We truncate the accelerations beyond x_trunc using a 1-cos(x) function
+ * that smoothly brings the accelerations to 0 at x_max.
  *
  * @param time The current time in internal units.
  * @param potential The properties of the potential.
@@ -135,20 +169,40 @@ __attribute__((always_inline)) INLINE static void external_gravity_acceleration(
     double time, const struct external_potential* restrict potential,
     const struct phys_const* restrict phys_const, struct gpart* restrict g) {
 
-  const float dz = g->x[2] - potential->z_disc;
-  const float t_dyn = potential->dynamical_time;
-
-  float reduction_factor = 1.f;
-  if (time < potential->growth_time * t_dyn)
-    reduction_factor = time / (potential->growth_time * t_dyn);
-
-  /* Accelerations. Note that they are multiplied by G later on */
-  const float z_accel = reduction_factor * 2.f * M_PI *
-                        potential->surface_density *
-                        tanhf(fabsf(dz) / potential->scale_height);
+  const float dx = g->x[0] - potential->x_disc;
+  const float abs_dx = fabsf(dx);
+  const float t_growth = potential->growth_time;
+  const float t_growth_inv = potential->growth_time_inv;
+  const float b_inv = potential->scale_height_inv;
+  const float x_trunc = potential->x_trunc;
+  const float x_max = potential->x_max;
+  const float x_trans_inv = potential->x_trans_inv;
+  const float norm_over_G = potential->norm_over_G;
+
+  /* Are we still growing the disc ? */
+  const float reduction_factor = time < t_growth ? time * t_growth_inv : 1.f;
+
+  /* Truncated or not ? */
+  float a_x;
+  if (abs_dx < x_trunc) {
+
+    /* Acc. 2 pi sigma tanh(x/b) */
+    a_x = reduction_factor * norm_over_G * tanhf(abs_dx * b_inv);
+  } else if (abs_dx < x_max) {
+
+    /* Acc. 2 pi sigma tanh(x/b) [1/2 + 1/2 cos(pi (x - x_trunc) / x_trans)] */
+    a_x =
+        reduction_factor * norm_over_G * tanhf(abs_dx * b_inv) *
+        (0.5f + 0.5f * cosf((float)(M_PI) * (abs_dx - x_trunc) * x_trans_inv));
+  } else {
+
+    /* Acc. 0 */
+    a_x = 0.f;
+  }
 
-  if (dz > 0) g->a_grav[2] -= z_accel;
-  if (dz < 0) g->a_grav[2] += z_accel;
+  /* Get the correct sign. Recall G is multiplied in later on */
+  if (dx > 0) g->a_grav[0] -= a_x;
+  if (dx < 0) g->a_grav[0] += a_x;
 }
 
 /**
@@ -156,7 +210,9 @@ __attribute__((always_inline)) INLINE static void external_gravity_acceleration(
  * disc patch potential.
  *
  * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948,
- * equation 24.
+ * equation 22.
+ * We truncate the accelerations beyond x_trunc using a 1-cos(x) function
+ * that smoothly brings the accelerations to 0 at x_max.
  *
  * @param time The current time.
  * @param potential The #external_potential used in the run.
@@ -168,17 +224,36 @@ external_gravity_get_potential_energy(
     double time, const struct external_potential* potential,
     const struct phys_const* const phys_const, const struct gpart* gp) {
 
-  const float dz = gp->x[2] - potential->z_disc;
-  const float t_dyn = potential->dynamical_time;
+  const float dx = gp->x[0] - potential->x_disc;
+  const float abs_dx = fabsf(dx);
+  const float t_growth = potential->growth_time;
+  const float t_growth_inv = potential->growth_time_inv;
+  const float b = potential->scale_height;
+  const float b_inv = potential->scale_height_inv;
+  const float norm = potential->norm;
+  const float x_trunc = potential->x_trunc;
+  const float x_max = potential->x_max;
+
+  /* Are we still growing the disc ? */
+  const float reduction_factor = time < t_growth ? time * t_growth_inv : 1.f;
+
+  /* Truncated or not ? */
+  float pot;
+  if (abs_dx < x_trunc) {
 
-  float reduction_factor = 1.f;
-  if (time < potential->growth_time * t_dyn)
-    reduction_factor = time / (potential->growth_time * t_dyn);
+    /* Potential (2 pi G sigma b ln(cosh(x/b)) */
+    pot = b * logf(coshf(dx * b_inv));
+  } else if (abs_dx < x_max) {
 
-  /* Accelerations. Note that they are multiplied by G later on */
-  return reduction_factor * 2.f * M_PI * phys_const->const_newton_G *
-         potential->surface_density * potential->scale_height *
-         logf(coshf(dz / potential->scale_height));
+    /* Potential. At x>>b, phi(x) = norm * x / b */
+    pot = 0.f;
+
+  } else {
+
+    pot = 0.f;
+  }
+
+  return pot * reduction_factor * norm;
 }
 
 /**
@@ -202,15 +277,49 @@ static INLINE void potential_init_backend(
       parameter_file, "DiscPatchPotential:surface_density");
   potential->scale_height = parser_get_param_double(
       parameter_file, "DiscPatchPotential:scale_height");
-  potential->z_disc =
-      parser_get_param_double(parameter_file, "DiscPatchPotential:z_disc");
+  potential->x_disc =
+      parser_get_param_double(parameter_file, "DiscPatchPotential:x_disc");
+  potential->x_trunc = parser_get_opt_param_double(
+      parameter_file, "DiscPatchPotential:x_trunc", FLT_MAX);
+  potential->x_max = parser_get_opt_param_double(
+      parameter_file, "DiscPatchPotential:x_max", FLT_MAX);
+  potential->x_disc =
+      parser_get_param_double(parameter_file, "DiscPatchPotential:x_disc");
   potential->timestep_mult = parser_get_param_double(
       parameter_file, "DiscPatchPotential:timestep_mult");
   potential->growth_time = parser_get_opt_param_double(
       parameter_file, "DiscPatchPotential:growth_time", 0.);
+
+  /* Compute the dynamical time */
   potential->dynamical_time =
       sqrt(potential->scale_height /
            (phys_const->const_newton_G * potential->surface_density));
+
+  /* Convert the growth time multiplier to physical time */
+  potential->growth_time *= potential->dynamical_time;
+
+  /* Some cross-checks */
+  if (potential->x_trunc > potential->x_max)
+    error("Potential truncation x larger than maximal z");
+  if (potential->x_trunc < potential->scale_height)
+    error("Potential truncation x smaller than scale height");
+
+  /* Compute derived quantities */
+  potential->scale_height_inv = 1. / potential->scale_height;
+  potential->norm =
+      2. * M_PI * phys_const->const_newton_G * potential->surface_density;
+  potential->norm_over_G = 2 * M_PI * potential->surface_density;
+  potential->x_trans = potential->x_max - potential->x_trunc;
+
+  if (potential->x_trans != 0.f)
+    potential->x_trans_inv = 1. / potential->x_trans;
+  else
+    potential->x_trans_inv = FLT_MAX;
+
+  if (potential->growth_time != 0.)
+    potential->growth_time_inv = 1. / potential->growth_time;
+  else
+    potential->growth_time_inv = FLT_MAX;
 }
 
 /**
@@ -222,13 +331,19 @@ static INLINE void potential_print_backend(
     const struct external_potential* potential) {
 
   message(
-      "External potential is 'Disk-patch' with properties surface_density = %e "
-      "disc height= %e scale height = %e timestep multiplier = %e.",
-      potential->surface_density, potential->z_disc, potential->scale_height,
+      "External potential is 'Disk-patch' with Sigma=%f, x_disc=%f, b=%f and "
+      "dt_mult=%f.",
+      potential->surface_density, potential->x_disc, potential->scale_height,
       potential->timestep_mult);
 
+  if (potential->x_max < FLT_MAX)
+    message("Potential will be truncated at x_trunc=%f and zeroed at x_max=%f",
+            potential->x_trunc, potential->x_max);
+
   if (potential->growth_time > 0.)
-    message("Disc will grow for %f dynamical times.", potential->growth_time);
+    message("Disc will grow for %f [time_units]. (%f dynamical time)",
+            potential->growth_time,
+            potential->growth_time / potential->dynamical_time);
 }
 
 #endif /* SWIFT_DISC_PATCH_H */
diff --git a/src/queue.h b/src/queue.h
index 951a3e5a056d7ad0c3935f98341a0d93c805e3ad..c85cf0cabe30a03d163e2564fdc216c19495761a 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -29,7 +29,7 @@
 #define queue_sizeinit 100
 #define queue_sizegrow 2
 #define queue_search_window 8
-#define queue_incoming_size 1024
+#define queue_incoming_size 10240
 #define queue_struct_align 64
 
 /* Counters. */
diff --git a/src/runner.c b/src/runner.c
index 54039609621945f7c529ef945c05e2ac2fe3f17c..ec08b743452508364a7f1900963aae73061a944d 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -316,23 +316,35 @@ void runner_check_sorts(struct cell *c, int flags) {
  * @param r The #runner.
  * @param c The #cell.
  * @param flags Cell flag.
+ * @param cleanup If true, re-build the sorts for the selected flags instead
+ *        of just adding them.
  * @param clock Flag indicating whether to record the timing or not, needed
  *      for recursive calls.
  */
-void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
+void runner_do_sort(struct runner *r, struct cell *c, int flags, int cleanup,
+                    int clock) {
 
   struct entry *finger;
   struct entry *fingers[8];
   struct part *parts = c->parts;
   struct xpart *xparts = c->xparts;
-  struct entry *sort;
   const int count = c->count;
   float buff[8];
 
   TIMER_TIC;
 
+  /* We need to do the local sorts plus whatever was requested further up. */
+  flags |= c->do_sort;
+  if (cleanup) {
+    c->sorted = 0;
+  } else {
+    flags &= ~c->sorted;
+  }
+  if (flags == 0 && !c->do_sub_sort) return;
+
   /* Check that the particles have been moved to the current time */
-  if (!cell_are_part_drifted(c, r->e)) error("Sorting un-drifted cell");
+  if (flags && !cell_are_part_drifted(c, r->e))
+    error("Sorting un-drifted cell");
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Make sure the sort flags are consistent (downward). */
@@ -343,44 +355,40 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
        finger = finger->parent) {
     if (finger->sorted & ~c->sorted) error("Inconsistent sort flags (upward).");
   }
+
+  /* Update the sort timer which represents the last time the sorts
+     were re-set. */
+  if (c->sorted == 0) c->ti_sort = r->e->ti_current;
 #endif
 
-  /* Clean-up the flags, i.e. filter out what's already been sorted, but
-     only if the sorts are recent. */
-  if (c->ti_sort == r->e->ti_current) {
-    /* Ignore dimensions that have been sorted in this timestep. */
-    // flags &= ~c->sorted;
-  } else {
-    /* Clean old (stale) sorts. */
-    flags |= c->sorted;
-    c->sorted = 0;
-  }
-  if (flags == 0) return;
-
-  /* start by allocating the entry arrays. */
-  if (c->sort == NULL || c->sortsize < count) {
-    if (c->sort != NULL) free(c->sort);
-    c->sortsize = count * 1.1;
-    if ((c->sort = (struct entry *)malloc(sizeof(struct entry) *
-                                          (c->sortsize + 1) * 13)) == NULL)
-      error("Failed to allocate sort memory.");
+  /* start by allocating the entry arrays in the requested dimensions. */
+  for (int j = 0; j < 13; j++) {
+    if ((flags & (1 << j)) && c->sort[j] == NULL) {
+      if ((c->sort[j] = (struct entry *)malloc(sizeof(struct entry) *
+                                               (count + 1))) == NULL)
+        error("Failed to allocate sort memory.");
+    }
   }
-  sort = c->sort;
 
   /* Does this cell have any progeny? */
   if (c->split) {
 
     /* Fill in the gaps within the progeny. */
     float dx_max_sort = 0.0f;
+    float dx_max_sort_old = 0.0f;
     for (int k = 0; k < 8; k++) {
       if (c->progeny[k] != NULL) {
-        if (flags & ~c->progeny[k]->sorted ||
-            c->progeny[k]->dx_max_sort > c->dmin * space_maxreldx)
-          runner_do_sort(r, c->progeny[k], flags, 0);
+        /* Only propagate cleanup if the progeny is stale. */
+        runner_do_sort(r, c->progeny[k], flags,
+                       cleanup && (c->progeny[k]->dx_max_sort >
+                                   space_maxreldx * c->progeny[k]->dmin),
+                       0);
         dx_max_sort = max(dx_max_sort, c->progeny[k]->dx_max_sort);
+        dx_max_sort_old = max(dx_max_sort_old, c->progeny[k]->dx_max_sort_old);
       }
     }
     c->dx_max_sort = dx_max_sort;
+    c->dx_max_sort_old = dx_max_sort_old;
 
     /* Loop over the 13 different sort arrays. */
     for (int j = 0; j < 13; j++) {
@@ -402,7 +410,7 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
       for (int k = 0; k < 8; k++) {
         inds[k] = k;
         if (c->progeny[k] != NULL && c->progeny[k]->count > 0) {
-          fingers[k] = &c->progeny[k]->sort[j * (c->progeny[k]->count + 1)];
+          fingers[k] = c->progeny[k]->sort[j];
           buff[k] = fingers[k]->d;
           off[k] = off[k];
         } else
@@ -419,7 +427,7 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
           }
 
       /* For each entry in the new sort list. */
-      finger = &sort[j * (count + 1)];
+      finger = c->sort[j];
       for (int ind = 0; ind < count; ind++) {
 
         /* Copy the minimum into the new sort array. */
@@ -440,11 +448,11 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
       } /* Merge. */
 
       /* Add a sentinel. */
-      sort[j * (count + 1) + count].d = FLT_MAX;
-      sort[j * (count + 1) + count].i = 0;
+      c->sort[j][count].d = FLT_MAX;
+      c->sort[j][count].i = 0;
 
       /* Mark as sorted. */
-      c->sorted |= (1 << j);
+      atomic_or(&c->sorted, 1 << j);
 
     } /* loop over sort arrays. */
 
@@ -453,13 +461,23 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
   /* Otherwise, just sort. */
   else {
 
-    /* Reset the sort distance if we are in a local cell */
-    if (xparts != NULL) {
-      for (int k = 0; k < count; k++) {
-        xparts[k].x_diff_sort[0] = 0.0f;
-        xparts[k].x_diff_sort[1] = 0.0f;
-        xparts[k].x_diff_sort[2] = 0.0f;
+    /* Reset the sort distance */
+    if (c->sorted == 0) {
+#ifdef SWIFT_DEBUG_CHECKS
+      if (xparts != NULL && c->nodeID != engine_rank)
+        error("Have non-NULL xparts in foreign cell");
+#endif
+
+      /* And the individual sort distances if we are a local cell */
+      if (xparts != NULL) {
+        for (int k = 0; k < count; k++) {
+          xparts[k].x_diff_sort[0] = 0.0f;
+          xparts[k].x_diff_sort[1] = 0.0f;
+          xparts[k].x_diff_sort[2] = 0.0f;
+        }
       }
+      c->dx_max_sort_old = 0.f;
+      c->dx_max_sort = 0.f;
     }
 
     /* Fill the sort array. */
@@ -467,40 +485,28 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
       const double px[3] = {parts[k].x[0], parts[k].x[1], parts[k].x[2]};
       for (int j = 0; j < 13; j++)
         if (flags & (1 << j)) {
-          sort[j * (count + 1) + k].i = k;
-          sort[j * (count + 1) + k].d = px[0] * runner_shift[j][0] +
-                                        px[1] * runner_shift[j][1] +
-                                        px[2] * runner_shift[j][2];
+          c->sort[j][k].i = k;
+          c->sort[j][k].d = px[0] * runner_shift[j][0] +
+                            px[1] * runner_shift[j][1] +
+                            px[2] * runner_shift[j][2];
         }
     }
 
     /* Add the sentinel and sort. */
     for (int j = 0; j < 13; j++)
       if (flags & (1 << j)) {
-        sort[j * (count + 1) + count].d = FLT_MAX;
-        sort[j * (count + 1) + count].i = 0;
-        runner_do_sort_ascending(&sort[j * (count + 1)], count);
-        c->sorted |= (1 << j);
+        c->sort[j][count].d = FLT_MAX;
+        c->sort[j][count].i = 0;
+        runner_do_sort_ascending(c->sort[j], count);
+        atomic_or(&c->sorted, 1 << j);
       }
-
-    /* Finally, clear the dx_max_sort field of this cell. */
-    c->dx_max_sort = 0.f;
-
-    /* If this was not just an update, invalidate the sorts above this one. */
-    if (c->ti_sort < r->e->ti_current)
-      for (struct cell *finger = c->parent; finger != NULL;
-           finger = finger->parent)
-        finger->sorted = 0;
   }
 
-  /* Update the sort timer. */
-  c->ti_sort = r->e->ti_current;
-
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify the sorting. */
   for (int j = 0; j < 13; j++) {
     if (!(flags & (1 << j))) continue;
-    finger = &sort[j * (count + 1)];
+    finger = c->sort[j];
     for (int k = 1; k < count; k++) {
       if (finger[k].d < finger[k - 1].d)
         error("Sorting failed, ascending array.");
@@ -518,6 +524,11 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
   }
 #endif
 
+  /* Clear the cell's sort flags. */
+  c->do_sort = 0;
+  c->do_sub_sort = 0;
+  c->requires_sorts = 0;
+
   if (clock) TIMER_TOC(timer_dosort);
 }
 
@@ -621,11 +632,9 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) {
   const struct engine *e = r->e;
   const struct space *s = e->s;
   const float hydro_h_max = e->hydro_properties->h_max;
-  const float target_wcount = e->hydro_properties->target_neighbours;
-  const float max_wcount =
-      target_wcount + e->hydro_properties->delta_neighbours;
-  const float min_wcount =
-      target_wcount - e->hydro_properties->delta_neighbours;
+  const float eps = e->hydro_properties->h_tolerance;
+  const float hydro_eta_dim =
+      pow_dimension(e->hydro_properties->eta_neighbours);
   const int max_smoothing_iter = e->hydro_properties->max_smoothing_iterations;
   int redo = 0, count = 0;
 
@@ -669,28 +678,47 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) {
         if (!part_is_active(p, e)) error("Ghost applied to inactive particle");
 #endif
 
-        /* Finish the density calculation */
-        hydro_end_density(p);
+        /* Get some useful values */
+        const float h_old = p->h;
+        const float h_old_dim = pow_dimension(h_old);
+        const float h_old_dim_minus_one = pow_dimension_minus_one(h_old);
+        float h_new;
 
-        /* Did we get the right number of neighbours? */
-        if (p->density.wcount > max_wcount || p->density.wcount < min_wcount) {
+        if (p->density.wcount == 0.f) { /* No neighbours case */
 
-          float h_corr = 0.f;
+          /* Double h and try again */
+          h_new = 2.f * h_old;
+        } else {
 
-          /* If no derivative, double the smoothing length. */
-          if (p->density.wcount_dh == 0.0f) h_corr = p->h;
+          /* Finish the density calculation */
+          hydro_end_density(p);
 
-          /* Otherwise, compute the smoothing length update (Newton step). */
-          else {
-            h_corr = (target_wcount - p->density.wcount) / p->density.wcount_dh;
+          /* Compute one step of the Newton-Raphson scheme */
+          const float n_sum = p->density.wcount * h_old_dim;
+          const float n_target = hydro_eta_dim;
+          const float f = n_sum - n_target;
+          const float f_prime =
+              p->density.wcount_dh * h_old_dim +
+              hydro_dimension * p->density.wcount * h_old_dim_minus_one;
 
-            /* Truncate to the range [ -p->h/2 , p->h ]. */
-            h_corr = (h_corr < p->h) ? h_corr : p->h;
-            h_corr = (h_corr > -0.5f * p->h) ? h_corr : -0.5f * p->h;
-          }
+          h_new = h_old - f / f_prime;
+
+#ifdef SWIFT_DEBUG_CHECKS
+          if ((f > 0.f && h_new > h_old) || (f < 0.f && h_new < h_old))
+            error(
+                "Smoothing length correction not going in the right direction");
+#endif
+
+          /* Safety check: truncate to the range [ h_old/2 , 2h_old ]. */
+          h_new = min(h_new, 2.f * h_old);
+          h_new = max(h_new, 0.5f * h_old);
+        }
+
+        /* Check whether the particle has an inappropriate smoothing length */
+        if (fabsf(h_new - h_old) > eps * h_old) {
 
           /* Ok, correct then */
-          p->h += h_corr;
+          p->h = h_new;
 
           /* If below the absolute maximum, try again */
           if (p->h < hydro_h_max) {
@@ -708,6 +736,10 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) {
 
             /* Ok, this particle is a lost cause... */
             p->h = hydro_h_max;
+
+            /* Do some damage control if no neighbours at all were found */
+            if (p->density.wcount == kernel_root * kernel_norm)
+              hydro_part_has_no_neighbours(p, xp);
           }
         }
 
@@ -738,6 +770,11 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) {
           /* Run through this cell's density interactions. */
           for (struct link *l = finger->density; l != NULL; l = l->next) {
 
+#ifdef SWIFT_DEBUG_CHECKS
+            if (l->t->ti_run < r->e->ti_current)
+              error("Density task should have been run.");
+#endif
+
             /* Self-interaction? */
             if (l->t->type == task_type_self)
               runner_doself_subset_density(r, finger, parts, pid, count);
@@ -782,7 +819,7 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) {
     }
 #else
     if (count)
-      message("Smoothing length failed to converge on %i particles.", count);
+      error("Smoothing length failed to converge on %i particles.", count);
 #endif
 
     /* Be clean */
@@ -850,7 +887,7 @@ void runner_do_drift_part(struct runner *r, struct cell *c, int timer) {
 
   TIMER_TIC;
 
-  cell_drift_part(c, r->e);
+  cell_drift_part(c, r->e, 0);
 
   if (timer) TIMER_TOC(timer_drift_part);
 }
@@ -1492,6 +1529,10 @@ void runner_do_recv_part(struct runner *r, struct cell *c, int clear_sorts,
   timebin_t time_bin_max = 0;
   float h_max = 0.f;
 
+#ifdef SWIFT_DEBUG_CHECKS
+  if (c->nodeID == engine_rank) error("Updating a local cell!");
+#endif
+
   /* Clear this cell's sorted mask. */
   if (clear_sorts) c->sorted = 0;
 
@@ -1504,11 +1545,6 @@ void runner_do_recv_part(struct runner *r, struct cell *c, int clear_sorts,
       time_bin_min = min(time_bin_min, parts[k].time_bin);
       time_bin_max = max(time_bin_max, parts[k].time_bin);
       h_max = max(h_max, parts[k].h);
-
-#ifdef SWIFT_DEBUG_CHECKS
-      if (parts[k].ti_drift != ti_current)
-        error("Received un-drifted particle !");
-#endif
     }
 
     /* Convert into a time */
@@ -1571,6 +1607,10 @@ void runner_do_recv_gpart(struct runner *r, struct cell *c, int timer) {
   timebin_t time_bin_min = num_time_bins;
   timebin_t time_bin_max = 0;
 
+#ifdef SWIFT_DEBUG_CHECKS
+  if (c->nodeID == engine_rank) error("Updating a local cell!");
+#endif
+
   /* If this cell is a leaf, collect the particle data. */
   if (!c->split) {
 
@@ -1644,6 +1684,10 @@ void runner_do_recv_spart(struct runner *r, struct cell *c, int timer) {
   timebin_t time_bin_min = num_time_bins;
   timebin_t time_bin_max = 0;
 
+#ifdef SWIFT_DEBUG_CHECKS
+  if (c->nodeID == engine_rank) error("Updating a local cell!");
+#endif
+
   /* If this cell is a leaf, collect the particle data. */
   if (!c->split) {
 
@@ -1710,7 +1754,7 @@ void *runner_main(void *data) {
   while (1) {
 
     /* Wait at the barrier. */
-    engine_barrier(e, r->id);
+    engine_barrier(e);
 
     /* Re-set the pointer to the previous task, as there is none. */
     struct task *t = NULL;
@@ -1735,9 +1779,19 @@ void *runner_main(void *data) {
       struct cell *ci = t->ci;
       struct cell *cj = t->cj;
 
-/* Mark the thread we run on */
 #ifdef SWIFT_DEBUG_TASKS
+      /* Mark the thread we run on */
       t->rid = r->cpuid;
+
+      /* And recover the pair direction */
+      if (t->type == task_type_pair || t->type == task_type_sub_pair) {
+        struct cell *ci_temp = ci;
+        struct cell *cj_temp = cj;
+        double shift[3];
+        t->sid = space_getsid(e->s, &ci_temp, &cj_temp, shift);
+      } else {
+        t->sid = -1;
+      }
 #endif
 
 /* Check that we haven't scheduled an inactive task */
@@ -1764,7 +1818,7 @@ void *runner_main(void *data) {
 
         /* Special case for sorts */
         if (!cell_is_active(ci, e) && t->type == task_type_sort &&
-            t->flags == 0)
+            !(ci->do_sort || ci->do_sub_sort))
           error(
               "Task (type='%s/%s') should have been skipped ti_current=%lld "
               "c->ti_end_min=%lld t->flags=%d",
@@ -1822,16 +1876,11 @@ void *runner_main(void *data) {
           break;
 
         case task_type_pair:
-          if (t->subtype == task_subtype_density) {
-#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
-            runner_dopair1_density_vec(r, ci, cj);
-#else
-            runner_dopair1_density(r, ci, cj);
-#endif
-          }
+          if (t->subtype == task_subtype_density)
+            runner_dopair1_branch_density(r, ci, cj);
 #ifdef EXTRA_HYDRO_LOOP
           else if (t->subtype == task_subtype_gradient)
-            runner_dopair1_gradient(r, ci, cj);
+            runner_dopair1_branch_gradient(r, ci, cj);
 #endif
           else if (t->subtype == task_subtype_force)
             runner_dopair2_force(r, ci, cj);
@@ -1874,7 +1923,11 @@ void *runner_main(void *data) {
           break;
 
         case task_type_sort:
-          runner_do_sort(r, ci, t->flags, 1);
+          /* Cleanup only if any of the indices went stale. */
+          runner_do_sort(r, ci, t->flags,
+                         ci->dx_max_sort_old > space_maxreldx * ci->dmin, 1);
+          /* Reset the sort flags as our work here is done. */
+          t->flags = 0;
           break;
         case task_type_init_grav:
           runner_do_init_grav(r, ci, 1);
@@ -1917,9 +1970,9 @@ void *runner_main(void *data) {
           } else if (t->subtype == task_subtype_xv) {
             runner_do_recv_part(r, ci, 1, 1);
           } else if (t->subtype == task_subtype_rho) {
-            runner_do_recv_part(r, ci, 1, 1);
+            runner_do_recv_part(r, ci, 0, 1);
           } else if (t->subtype == task_subtype_gradient) {
-            runner_do_recv_part(r, ci, 1, 1);
+            runner_do_recv_part(r, ci, 0, 1);
           } else if (t->subtype == task_subtype_gpart) {
             runner_do_recv_gpart(r, ci, 1);
           } else if (t->subtype == task_subtype_spart) {
diff --git a/src/runner.h b/src/runner.h
index 0c6edc3c0c1406855ac79c96617bbdaa310bb46d..e33a3e380e6097a67258d116d617483caca35086 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -28,6 +28,7 @@
 
 /* Includes. */
 #include "cache.h"
+#include "gravity_cache.h"
 
 struct cell;
 struct engine;
@@ -49,7 +50,14 @@ struct runner {
   /*! The engine owing this runner. */
   struct engine *e;
 
+  /*! The particle gravity_cache of cell ci. */
+  struct gravity_cache ci_gravity_cache;
+
+  /*! The particle gravity_cache of cell cj. */
+  struct gravity_cache cj_gravity_cache;
+
 #ifdef WITH_VECTORIZATION
+
   /*! The particle cache of cell ci. */
   struct cache ci_cache;
 
@@ -61,7 +69,8 @@ struct runner {
 /* Function prototypes. */
 void runner_do_ghost(struct runner *r, struct cell *c, int timer);
 void runner_do_extra_ghost(struct runner *r, struct cell *c, int timer);
-void runner_do_sort(struct runner *r, struct cell *c, int flag, int clock);
+void runner_do_sort(struct runner *r, struct cell *c, int flag, int cleanup,
+                    int clock);
 void runner_do_drift_part(struct runner *r, struct cell *c, int timer);
 void runner_do_drift_gpart(struct runner *r, struct cell *c, int timer);
 void runner_do_kick1(struct runner *r, struct cell *c, int timer);
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 9e6adb9e267f0ee48d28cde937f280b51ca372dc..c07d70f3e48bb6f1c9e7e343a50cdbba71da0785 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -634,15 +634,13 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   const int flipped = runner_flip[sid];
   sid = sortlistID[sid];
 
-  /* Have the cells been sorted? */
+  /* Has the cell cj been sorted? */
   if (!(cj->sorted & (1 << sid)) ||
-      cj->dx_max_sort > space_maxreldx * cj->dmin) {
-    DOPAIR_SUBSET_NAIVE(r, ci, parts_i, ind, count, cj);
-    return;
-  }
+      cj->dx_max_sort_old > space_maxreldx * cj->dmin)
+    error("Interacting unsorted cells.");
 
   /* Pick-out the sorted lists. */
-  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
+  const struct entry *restrict sort_j = cj->sort[sid];
   const float dxj = cj->dx_max_sort;
 
   /* Parts are on the left? */
@@ -884,8 +882,11 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
  * @param r The #runner.
  * @param ci The first #cell.
  * @param cj The second #cell.
+ * @param sid The direction of the pair.
+ * @param shift The shift vector to apply to the particles in ci.
  */
-void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
+void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
+             const double *shift) {
 
   const struct engine *restrict e = r->e;
 
@@ -900,29 +901,13 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
 
   TIMER_TIC;
 
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
-
-  if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e))
-    error("Interacting undrifted cells.");
-
-  /* Get the sort ID. */
-  double shift[3] = {0.0, 0.0, 0.0};
-  const int sid = space_getsid(e->s, &ci, &cj, shift);
-
-  /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin)
-    runner_do_sort(r, ci, (1 << sid), 1);
-  if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin)
-    runner_do_sort(r, cj, (1 << sid), 1);
-
   /* Get the cutoff shift. */
   double rshift = 0.0;
   for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
 
   /* Pick-out the sorted lists. */
-  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
-  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
+  const struct entry *restrict sort_i = ci->sort[sid];
+  const struct entry *restrict sort_j = cj->sort[sid];
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Check that the dx_max_sort values in the cell are indeed an upper
@@ -933,8 +918,13 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
                     p->x[1] * runner_shift[sid][1] +
                     p->x[2] * runner_shift[sid][2];
     if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
-        1.0e-6 * max(fabsf(d), ci->dx_max_sort))
-      error("particle shift diff exceeds dx_max_sort.");
+        1.0e-4 * max(fabsf(d), ci->dx_max_sort_old))
+      error(
+          "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d "
+          "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e "
+          "ci->dx_max_sort_old=%e",
+          ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort,
+          ci->dx_max_sort_old);
   }
   for (int pjd = 0; pjd < cj->count; pjd++) {
     const struct part *p = &cj->parts[sort_j[pjd].i];
@@ -942,8 +932,13 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
                     p->x[1] * runner_shift[sid][1] +
                     p->x[2] * runner_shift[sid][2];
     if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
-        1.0e-6 * max(fabsf(d), cj->dx_max_sort))
-      error("particle shift diff exceeds dx_max_sort.");
+        1.0e-4 * max(fabsf(d), cj->dx_max_sort_old))
+      error(
+          "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d "
+          "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e "
+          "cj->dx_max_sort_old=%e",
+          cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort,
+          cj->dx_max_sort_old);
   }
 #endif /* SWIFT_DEBUG_CHECKS */
 
@@ -1042,9 +1037,9 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
       struct part *restrict pj = &parts_j[sort_j[pjd].i];
       if (!part_is_active(pj, e)) continue;
       const float hj = pj->h;
-      const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
-      if (dj > di_max) continue;
-
+      const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
+      if (dj - rshift > di_max) continue;
+
       double pjx[3];
       for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
       const float hjg2 = hj * hj * kernel_gamma2;
@@ -1116,6 +1111,49 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
   TIMER_TOC(TIMER_DOPAIR);
 }
 
+/**
+ * @brief Determine which version of DOPAIR1 needs to be called depending on the
+ * orientation of the cells or whether DOPAIR1 needs to be called at all.
+ *
+ * @param r #runner
+ * @param ci #cell ci
+ * @param cj #cell cj
+ *
+ */
+void DOPAIR1_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) {
+
+  const struct engine *restrict e = r->e;
+
+  /* Anything to do here? */
+  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
+
+  /* Check that cells are drifted. */
+  if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e))
+    error("Interacting undrifted cells.");
+
+  /* Get the sort ID. */
+  double shift[3] = {0.0, 0.0, 0.0};
+  const int sid = space_getsid(e->s, &ci, &cj, shift);
+
+  /* Have the cells been sorted? */
+  if (!(ci->sorted & (1 << sid)) ||
+      ci->dx_max_sort_old > space_maxreldx * ci->dmin)
+    error("Interacting unsorted cells.");
+  if (!(cj->sorted & (1 << sid)) ||
+      cj->dx_max_sort_old > space_maxreldx * cj->dmin)
+    error("Interacting unsorted cells.");
+
+#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) && \
+    (DOPAIR1_BRANCH == runner_dopair1_branch_density)
+  if (!sort_is_corner(sid))
+    runner_dopair1_density_vec(r, ci, cj, sid, shift);
+  else
+    DOPAIR1(r, ci, cj, sid, shift);
+#else
+  DOPAIR1(r, ci, cj, sid, shift);
+#endif
+}
+
 /**
  * @brief Compute the interactions between a cell pair (symmetric)
  *
@@ -1155,18 +1193,20 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   const int sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin)
-    runner_do_sort(r, ci, (1 << sid), 1);
-  if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin)
-    runner_do_sort(r, cj, (1 << sid), 1);
+  if (!(ci->sorted & (1 << sid)) ||
+      ci->dx_max_sort_old > space_maxreldx * ci->dmin)
+    error("Interacting unsorted cells.");
+  if (!(cj->sorted & (1 << sid)) ||
+      cj->dx_max_sort_old > space_maxreldx * cj->dmin)
+    error("Interacting unsorted cells.");
 
   /* Get the cutoff shift. */
   double rshift = 0.0;
   for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
 
   /* Pick-out the sorted lists. */
-  struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
-  struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
+  struct entry *restrict sort_i = ci->sort[sid];
+  struct entry *restrict sort_j = cj->sort[sid];
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Check that the dx_max_sort values in the cell are indeed an upper
@@ -1177,8 +1217,13 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
                     p->x[1] * runner_shift[sid][1] +
                     p->x[2] * runner_shift[sid][2];
     if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
-        1.0e-6 * max(fabsf(d), ci->dx_max_sort))
-      error("particle shift diff exceeds dx_max_sort.");
+        1.0e-4 * max(fabsf(d), ci->dx_max_sort_old))
+      error(
+          "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d "
+          "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e "
+          "ci->dx_max_sort_old=%e",
+          ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort,
+          ci->dx_max_sort_old);
   }
   for (int pjd = 0; pjd < cj->count; pjd++) {
     const struct part *p = &cj->parts[sort_j[pjd].i];
@@ -1186,8 +1231,13 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
                     p->x[1] * runner_shift[sid][1] +
                     p->x[2] * runner_shift[sid][2];
     if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
-        1.0e-6 * max(fabsf(d), cj->dx_max_sort))
-      error("particle shift diff exceeds dx_max_sort.");
+        1.0e-4 * max(fabsf(d), cj->dx_max_sort_old))
+      error(
+          "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d "
+          "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e "
+          "cj->dx_max_sort_old=%e",
+          cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort,
+          cj->dx_max_sort_old);
   }
 #endif /* SWIFT_DEBUG_CHECKS */
 
@@ -1399,9 +1449,9 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     /* Get a hold of the jth part in cj. */
     struct part *restrict pj = &parts_j[sort_j[pjd].i];
     const float hj = pj->h;
-    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
-    if (dj > di_max) continue;
-
+    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
+    if (dj - rshift > di_max) continue;
+
     double pjx[3];
     for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
     const float hjg2 = hj * hj * kernel_gamma2;
@@ -2063,19 +2113,12 @@ void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
   if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
   if (ci->count == 0 || cj->count == 0) return;
 
-  /* Get the cell dimensions. */
-  const float h = min(ci->width[0], min(ci->width[1], ci->width[2]));
-
   /* Get the type of pair if not specified explicitly. */
-  // if ( sid < 0 )
   double shift[3];
   sid = space_getsid(s, &ci, &cj, shift);
 
   /* Recurse? */
-  if (ci->split && cj->split &&
-      max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max_sort +
-              cj->dx_max_sort <
-          h / 2) {
+  if (cell_can_recurse_in_pair_task(ci) && cell_can_recurse_in_pair_task(cj)) {
 
     /* Different types of flags. */
     switch (sid) {
@@ -2279,24 +2322,19 @@ void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
   else if (cell_is_active(ci, e) || cell_is_active(cj, e)) {
 
     /* Make sure both cells are drifted to the current timestep. */
-    if (!cell_are_part_drifted(ci, e)) cell_drift_part(ci, e);
-    if (!cell_are_part_drifted(cj, e)) cell_drift_part(cj, e);
+    if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e))
+      error("Interacting undrifted cells.");
 
     /* Do any of the cells need to be sorted first? */
     if (!(ci->sorted & (1 << sid)) ||
-        ci->dx_max_sort > ci->dmin * space_maxreldx)
-      runner_do_sort(r, ci, (1 << sid), 1);
+        ci->dx_max_sort_old > ci->dmin * space_maxreldx)
+      error("Interacting unsorted cell.");
     if (!(cj->sorted & (1 << sid)) ||
-        cj->dx_max_sort > cj->dmin * space_maxreldx)
-      runner_do_sort(r, cj, (1 << sid), 1);
+        cj->dx_max_sort_old > cj->dmin * space_maxreldx)
+      error("Interacting unsorted cell.");
 
-/* Compute the interactions. */
-#if (DOPAIR1 == runner_dopair1_density) && defined(WITH_VECTORIZATION) && \
-    defined(GADGET2_SPH)
-    runner_dopair1_density_vec(r, ci, cj);
-#else
-    DOPAIR1(r, ci, cj);
-#endif
+    /* Compute the interactions. */
+    DOPAIR1_BRANCH(r, ci, cj);
   }
 
   if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
@@ -2317,7 +2355,7 @@ void DOSUB_SELF1(struct runner *r, struct cell *ci, int gettimer) {
   if (ci->count == 0 || !cell_is_active(ci, r->e)) return;
 
   /* Recurse? */
-  if (ci->split) {
+  if (cell_can_recurse_in_self_task(ci)) {
 
     /* Loop over all progeny. */
     for (int k = 0; k < 8; k++)
@@ -2333,7 +2371,7 @@ void DOSUB_SELF1(struct runner *r, struct cell *ci, int gettimer) {
   else {
 
     /* Drift the cell to the current timestep if needed. */
-    if (!cell_are_part_drifted(ci, r->e)) cell_drift_part(ci, r->e);
+    if (!cell_are_part_drifted(ci, r->e)) error("Interacting undrifted cell.");
 
 #if (DOSELF1 == runner_doself1_density) && defined(WITH_VECTORIZATION) && \
     defined(GADGET2_SPH)
@@ -2370,19 +2408,12 @@ void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
   if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
   if (ci->count == 0 || cj->count == 0) return;
 
-  /* Get the cell dimensions. */
-  const float h = min(ci->width[0], min(ci->width[1], ci->width[2]));
-
   /* Get the type of pair if not specified explicitly. */
-  // if ( sid < 0 )
   double shift[3];
   sid = space_getsid(s, &ci, &cj, shift);
 
   /* Recurse? */
-  if (ci->split && cj->split &&
-      max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max_sort +
-              cj->dx_max_sort <
-          h / 2) {
+  if (cell_can_recurse_in_pair_task(ci) && cell_can_recurse_in_pair_task(cj)) {
 
     /* Different types of flags. */
     switch (sid) {
@@ -2586,16 +2617,16 @@ void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
   else if (cell_is_active(ci, e) || cell_is_active(cj, e)) {
 
     /* Make sure both cells are drifted to the current timestep. */
-    if (!cell_are_part_drifted(ci, e)) cell_drift_part(ci, e);
-    if (!cell_are_part_drifted(cj, e)) cell_drift_part(cj, e);
+    if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e))
+      error("Interacting undrifted cells.");
 
     /* Do any of the cells need to be sorted first? */
     if (!(ci->sorted & (1 << sid)) ||
-        ci->dx_max_sort > ci->dmin * space_maxreldx)
-      runner_do_sort(r, ci, (1 << sid), 1);
+        ci->dx_max_sort_old > ci->dmin * space_maxreldx)
+      error("Interacting unsorted cells.");
     if (!(cj->sorted & (1 << sid)) ||
-        cj->dx_max_sort > cj->dmin * space_maxreldx)
-      runner_do_sort(r, cj, (1 << sid), 1);
+        cj->dx_max_sort_old > cj->dmin * space_maxreldx)
+      error("Interacting unsorted cells.");
 
     /* Compute the interactions. */
     DOPAIR2(r, ci, cj);
@@ -2619,7 +2650,7 @@ void DOSUB_SELF2(struct runner *r, struct cell *ci, int gettimer) {
   if (ci->count == 0 || !cell_is_active(ci, r->e)) return;
 
   /* Recurse? */
-  if (ci->split) {
+  if (cell_can_recurse_in_self_task(ci)) {
 
     /* Loop over all progeny. */
     for (int k = 0; k < 8; k++)
@@ -2652,22 +2683,29 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
 
   TIMER_TIC;
 
+  /* Should we even bother? */
+  if (!cell_is_active(ci, e) && (cj == NULL || !cell_is_active(cj, e))) return;
+  if (ci->count == 0 || (cj != NULL && cj->count == 0)) return;
+
   /* Find out in which sub-cell of ci the parts are. */
   struct cell *sub = NULL;
-  for (int k = 0; k < 8; k++)
-    if (ci->progeny[k] != NULL) {
-      if (&parts[ind[0]] >= &ci->progeny[k]->parts[0] &&
-          &parts[ind[0]] < &ci->progeny[k]->parts[ci->progeny[k]->count]) {
-        sub = ci->progeny[k];
-        break;
+  if (ci->split) {
+    for (int k = 0; k < 8; k++) {
+      if (ci->progeny[k] != NULL) {
+        if (&parts[ind[0]] >= &ci->progeny[k]->parts[0] &&
+            &parts[ind[0]] < &ci->progeny[k]->parts[ci->progeny[k]->count]) {
+          sub = ci->progeny[k];
+          break;
+        }
       }
     }
+  }
 
   /* Is this a single cell? */
   if (cj == NULL) {
 
     /* Recurse? */
-    if (ci->split) {
+    if (cell_can_recurse_in_self_task(ci)) {
 
       /* Loop over all progeny. */
       DOSUB_SUBSET(r, sub, parts, ind, count, NULL, -1, 0);
@@ -2686,14 +2724,9 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
   /* Otherwise, it's a pair interaction. */
   else {
 
-    /* Get the cell dimensions. */
-    const float h = min(ci->width[0], min(ci->width[1], ci->width[2]));
-
     /* Recurse? */
-    if (ci->split && cj->split &&
-        max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max_sort +
-                cj->dx_max_sort <
-            h / 2) {
+    if (cell_can_recurse_in_pair_task(ci) &&
+        cell_can_recurse_in_pair_task(cj)) {
 
       /* Get the type of pair if not specified explicitly. */
       double shift[3] = {0.0, 0.0, 0.0};
@@ -3204,26 +3237,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
     /* Otherwise, compute the pair directly. */
     else if (cell_is_active(ci, e) || cell_is_active(cj, e)) {
 
-      /* Get the relative distance between the pairs, wrapping. */
-      double shift[3] = {0.0, 0.0, 0.0};
-      for (int k = 0; k < 3; k++) {
-        if (cj->loc[k] - ci->loc[k] < -s->dim[k] / 2)
-          shift[k] = s->dim[k];
-        else if (cj->loc[k] - ci->loc[k] > s->dim[k] / 2)
-          shift[k] = -s->dim[k];
-      }
-
-      /* Get the sorting index. */
-      int new_sid = 0;
-      for (int k = 0; k < 3; k++)
-        new_sid = 3 * new_sid +
-                  ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
-                       ? 0
-                       : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);
-      new_sid = sortlistID[new_sid];
-
       /* Do any of the cells need to be drifted first? */
-      if (!cell_are_part_drifted(cj, e)) cell_drift_part(cj, e);
+      if (!cell_are_part_drifted(cj, e)) error("Cell should be drifted!");
 
       DOPAIR_SUBSET(r, ci, parts, ind, count, cj);
     }
diff --git a/src/runner_doiact_fft.c b/src/runner_doiact_fft.c
index a3e3f38fba920c0c58d600bb25feda88d4a3cf84..26b59f9f6b864445df9190c6041ee684c456ba22 100644
--- a/src/runner_doiact_fft.c
+++ b/src/runner_doiact_fft.c
@@ -20,9 +20,6 @@
 /* Config parameters. */
 #include "../config.h"
 
-/* Some standard headers. */
-#include <pthread.h>
-
 #ifdef HAVE_FFTW
 #include <fftw3.h>
 #endif
@@ -33,6 +30,7 @@
 /* Local includes. */
 #include "engine.h"
 #include "error.h"
+#include "kernel_long_gravity.h"
 #include "runner.h"
 #include "space.h"
 #include "timers.h"
@@ -179,11 +177,12 @@ void runner_do_grav_fft(struct runner* r, int timer) {
   // error("Top-level multipole %d not drifted", i);
 
   /* Allocates some memory for the density mesh */
-  double* restrict rho = fftw_alloc_real(N * N * N);
+  double* restrict rho = fftw_malloc(sizeof(double) * N * N * N);
   if (rho == NULL) error("Error allocating memory for density mesh");
 
   /* Allocates some memory for the mesh in Fourier space */
-  fftw_complex* restrict frho = fftw_alloc_complex(N * N * (N_half + 1));
+  fftw_complex* restrict frho =
+      fftw_malloc(sizeof(fftw_complex) * N * N * (N_half + 1));
   if (frho == NULL)
     error("Error allocating memory for transform of density mesh");
 
@@ -241,7 +240,9 @@ void runner_do_grav_fft(struct runner* r, int timer) {
         if (k2 == 0.) continue;
 
         /* Green function */
-        const double green_cor = green_fac * exp(-k2 * a_smooth2) / k2;
+        double W;
+        fourier_kernel_long_grav_eval(k2 * a_smooth2, &W);
+        const double green_cor = green_fac * W / k2;
 
         /* Deconvolution of CIC */
         const double CIC_cor = sinc_kx_inv * sinc_ky_inv * sinc_kz_inv;
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index a66cc5e0c9ed241aba3bb1b4329016b8e505e280..01ea6a073211a08430e77721f4c2e60ef7adfd04 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -36,8 +36,10 @@
  */
 void runner_do_grav_down(struct runner *r, struct cell *c, int timer) {
 
+  /* Some constants */
   const struct engine *e = r->e;
-  const int periodic = e->s->periodic;
+
+  /* Cell properties */
   struct gpart *gparts = c->gparts;
   const int gcount = c->gcount;
 
@@ -52,7 +54,6 @@ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) {
     /* Add the field-tensor to all the 8 progenitors */
     for (int k = 0; k < 8; ++k) {
       struct cell *cp = c->progeny[k];
-      struct grav_tensor temp;
 
       /* Do we have a progenitor with any active g-particles ? */
       if (cp != NULL && cell_is_active(cp, e)) {
@@ -61,13 +62,14 @@ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) {
         if (cp->ti_old_multipole != e->ti_current)
           error("cp->multipole not drifted.");
 #endif
+        struct grav_tensor shifted_tensor;
 
         /* Shift the field tensor */
-        gravity_L2L(&temp, &c->multipole->pot, cp->multipole->CoM,
-                    c->multipole->CoM, 0 * periodic);
+        gravity_L2L(&shifted_tensor, &c->multipole->pot, cp->multipole->CoM,
+                    c->multipole->CoM);
 
         /* Add it to this level's tensor */
-        gravity_field_tensors_add(&cp->multipole->pot, &temp);
+        gravity_field_tensors_add(&cp->multipole->pot, &shifted_tensor);
 
         /* Recurse */
         runner_do_grav_down(r, cp, 0);
@@ -91,6 +93,7 @@ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) {
           error("gpart not drifted to current time");
 #endif
 
+        /* Apply the kernel */
         gravity_L2P(&c->multipole->pot, c->multipole->CoM, gp);
       }
     }
@@ -110,10 +113,12 @@ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) {
 void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci,
                            struct cell *restrict cj) {
 
+  /* Some constants */
   const struct engine *e = r->e;
+  const struct space *s = e->s;
+  const int periodic = s->periodic;
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
   const struct gravity_props *props = e->gravity_properties;
-  const int periodic = e->s->periodic;
-  const struct multipole *multi_j = &cj->multipole->m_pole;
   // const float a_smooth = e->gravity_properties->a_smooth;
   // const float rlr_inv = 1. / (a_smooth * ci->super->width[0]);
 
@@ -122,6 +127,9 @@ void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci,
   /* Anything to do here? */
   if (!cell_is_active(ci, e)) return;
 
+  /* Short-cut to the multipole */
+  const struct multipole *multi_j = &cj->multipole->m_pole;
+
 #ifdef SWIFT_DEBUG_CHECKS
   if (ci == cj) error("Interacting a cell with itself using M2L");
 
@@ -136,202 +144,1133 @@ void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci,
 
   /* Let's interact at this level */
   gravity_M2L(&ci->multipole->pot, multi_j, ci->multipole->CoM,
-              cj->multipole->CoM, props, periodic * 0);
+              cj->multipole->CoM, props, periodic, dim);
 
   TIMER_TOC(timer_dopair_grav_mm);
 }
 
 /**
- * @brief Computes the interaction of all the particles in a cell with the
- * multipole of another cell.
+ * @brief Computes the interaction of all the particles in a cell with all the
+ * particles of another cell using the full Newtonian potential
  *
  * @param r The #runner.
- * @param ci The #cell with particles to interct.
- * @param cj The #cell with the multipole.
+ * @param ci The first #cell.
+ * @param cj The other #cell.
+ * @param shift The distance vector (periodically wrapped) between the cell
+ * centres.
  */
-void runner_dopair_grav_pm(const struct runner *r,
-                           const struct cell *restrict ci,
-                           const struct cell *restrict cj) {
+void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci,
+                                struct cell *cj, double shift[3]) {
+
+  /* Some constants */
+  const struct engine *const e = r->e;
+  struct gravity_cache *const ci_cache = &r->ci_gravity_cache;
+  struct gravity_cache *const cj_cache = &r->cj_gravity_cache;
+
+  /* Cell properties */
+  const int gcount_i = ci->gcount;
+  const int gcount_j = cj->gcount;
+  struct gpart *restrict gparts_i = ci->gparts;
+  struct gpart *restrict gparts_j = cj->gparts;
+  const int ci_active = cell_is_active(ci, e);
+  const int cj_active = cell_is_active(cj, e);
+  const double loc_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
+  const double loc_j[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
+  const double loc_mean[3] = {0.5 * (loc_i[0] + loc_j[0]),
+                              0.5 * (loc_i[1] + loc_j[1]),
+                              0.5 * (loc_i[2] + loc_j[2])};
+
+  /* Anything to do here? */
+  if (!ci_active && !cj_active) return;
+
+  /* Check that we fit in cache */
+  if (gcount_i > ci_cache->count || gcount_j > cj_cache->count)
+    error("Not enough space in the caches! gcount_i=%d gcount_j=%d", gcount_i,
+          gcount_j);
+
+  /* Compute the padded counts */
+  const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE;
+  const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE;
+
+  /* Fill the caches */
+  gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
+                         loc_mean);
+  gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
+                         loc_mean);
+
+  /* Ok... Here we go ! */
+
+  if (ci_active) {
+
+    /* Loop over all particles in ci... */
+    for (int pid = 0; pid < gcount_i; pid++) {
+
+      /* Skip inactive particles */
+      if (!gpart_is_active(&gparts_i[pid], e)) continue;
+
+      const float x_i = ci_cache->x[pid];
+      const float y_i = ci_cache->y[pid];
+      const float z_i = ci_cache->z[pid];
+
+      /* Some powers of the softening length */
+      const float h_i = ci_cache->epsilon[pid];
+      const float h2_i = h_i * h_i;
+      const float h_inv_i = 1.f / h_i;
+      const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;
+
+      /* Local accumulators for the acceleration */
+      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+
+      /* Make the compiler understand we are in happy vectorization land */
+      swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT);
+      swift_assume_size(gcount_padded_j, VEC_SIZE);
+
+      /* Loop over every particle in the other cell. */
+      for (int pjd = 0; pjd < gcount_padded_j; pjd++) {
+
+        /* Get info about j */
+        const float x_j = cj_cache->x[pjd];
+        const float y_j = cj_cache->y[pjd];
+        const float z_j = cj_cache->z[pjd];
+        const float mass_j = cj_cache->m[pjd];
+
+        /* Compute the pairwise (square) distance. */
+        const float dx = x_i - x_j;
+        const float dy = y_i - y_j;
+        const float dz = z_i - z_j;
+        const float r2 = dx * dx + dy * dy + dz * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+        /* Check that particles have been drifted to the current time */
+        if (gparts_i[pid].ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+        if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+#endif
+
+        /* Get the inverse distance */
+        const float r_inv = 1.f / sqrtf(r2);
+
+        float f_ij, W_ij;
+
+        if (r2 >= h2_i) {
+
+          /* Get Newtonian gravity */
+          f_ij = mass_j * r_inv * r_inv * r_inv;
+
+        } else {
+
+          const float r = r2 * r_inv;
+          const float ui = r * h_inv_i;
+
+          kernel_grav_eval(ui, &W_ij);
+
+          /* Get softened gravity */
+          f_ij = mass_j * h_inv3_i * W_ij;
+        }
+
+        /* Store it back */
+        a_x -= f_ij * dx;
+        a_y -= f_ij * dy;
+        a_z -= f_ij * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Update the interaction counter if it's not a padded gpart */
+        if (pjd < gcount_j) gparts_i[pid].num_interacted++;
+#endif
+      }
+
+      /* Store everything back in cache */
+      ci_cache->a_x[pid] = a_x;
+      ci_cache->a_y[pid] = a_y;
+      ci_cache->a_z[pid] = a_z;
+    }
+  }
+
+  /* Now do the opposite loop */
+  if (cj_active) {
+
+    /* Loop over all particles in cj... */
+    for (int pjd = 0; pjd < gcount_j; pjd++) {
+
+      /* Skip inactive particles */
+      if (!gpart_is_active(&gparts_j[pjd], e)) continue;
+
+      const float x_j = cj_cache->x[pjd];
+      const float y_j = cj_cache->y[pjd];
+      const float z_j = cj_cache->z[pjd];
+
+      /* Some powers of the softening length */
+      const float h_j = cj_cache->epsilon[pjd];
+      const float h2_j = h_j * h_j;
+      const float h_inv_j = 1.f / h_j;
+      const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j;
+
+      /* Local accumulators for the acceleration */
+      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+
+      /* Make the compiler understand we are in happy vectorization land */
+      swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+      swift_assume_size(gcount_padded_i, VEC_SIZE);
+
+      /* Loop over every particle in the other cell. */
+      for (int pid = 0; pid < gcount_padded_i; pid++) {
+
+        /* Get info about i */
+        const float x_i = ci_cache->x[pid];
+        const float y_i = ci_cache->y[pid];
+        const float z_i = ci_cache->z[pid];
+        const float mass_i = ci_cache->m[pid];
+
+        /* Compute the pairwise (square) distance. */
+        const float dx = x_j - x_i;
+        const float dy = y_j - y_i;
+        const float dz = z_j - z_i;
+        const float r2 = dx * dx + dy * dy + dz * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+        /* Check that particles have been drifted to the current time */
+        if (gparts_j[pjd].ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+        if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+#endif
+
+        /* Get the inverse distance */
+        const float r_inv = 1.f / sqrtf(r2);
+
+        float f_ji, W_ji;
+
+        if (r2 >= h2_j) {
+
+          /* Get Newtonian gravity */
+          f_ji = mass_i * r_inv * r_inv * r_inv;
+
+        } else {
+
+          const float r = r2 * r_inv;
+          const float uj = r * h_inv_j;
+
+          kernel_grav_eval(uj, &W_ji);
+
+          /* Get softened gravity */
+          f_ji = mass_i * h_inv3_j * W_ji;
+        }
+
+        /* Store it back */
+        a_x -= f_ji * dx;
+        a_y -= f_ji * dy;
+        a_z -= f_ji * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Update the interaction counter if it's not a padded gpart */
+        if (pid < gcount_i) gparts_j[pjd].num_interacted++;
+#endif
+      }
+
+      /* Store everything back in cache */
+      cj_cache->a_x[pjd] = a_x;
+      cj_cache->a_y[pjd] = a_y;
+      cj_cache->a_z[pjd] = a_z;
+    }
+  }
+
+  /* Write back to the particles */
+  if (ci_active) gravity_cache_write_back(ci_cache, gparts_i, gcount_i);
+  if (cj_active) gravity_cache_write_back(cj_cache, gparts_j, gcount_j);
+
+#ifdef MATTHIEU_OLD_STUFF
+
+  /* Some constants */
+  const struct engine *const e = r->e;
 
-  error("Function should not be called");
+  /* Cell properties */
+  const int gcount_i = ci->gcount;
+  const int gcount_j = cj->gcount;
+  struct gpart *restrict gparts_i = ci->gparts;
+  struct gpart *restrict gparts_j = cj->gparts;
+
+  /* MATTHIEU: Should we use local DP accumulators ? */
+
+  /* Loop over all particles in ci... */
+  if (cell_is_active(ci, e)) {
+    for (int pid = 0; pid < gcount_i; pid++) {
+
+      /* Get a hold of the ith part in ci. */
+      struct gpart *restrict gpi = &gparts_i[pid];
+
+      if (!gpart_is_active(gpi, e)) continue;
+
+      /* Apply boundary condition */
+      const double pix[3] = {gpi->x[0] - shift[0], gpi->x[1] - shift[1],
+                             gpi->x[2] - shift[2]};
+
+      /* Loop over every particle in the other cell. */
+      for (int pjd = 0; pjd < gcount_j; pjd++) {
+
+        /* Get a hold of the jth part in cj. */
+        const struct gpart *restrict gpj = &gparts_j[pjd];
+
+        /* Compute the pairwise distance. */
+        const float dx[3] = {pix[0] - gpj->x[0],   // x
+                             pix[1] - gpj->x[1],   // y
+                             pix[2] - gpj->x[2]};  // z
+        const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Check that particles have been drifted to the current time */
+        if (gpi->ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+        if (gpj->ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+#endif
+
+        /* Interact ! */
+        runner_iact_grav_pp_nonsym(r2, dx, gpi, gpj);
+
+#ifdef SWIFT_DEBUG_CHECKS
+        gpi->num_interacted++;
+#endif
+      }
+    }
+  }
+
+  /* Loop over all particles in cj... */
+  if (cell_is_active(cj, e)) {
+    for (int pjd = 0; pjd < gcount_j; pjd++) {
+
+      /* Get a hold of the jth part in cj. */
+      struct gpart *restrict gpj = &gparts_j[pjd];
+
+      if (!gpart_is_active(gpj, e)) continue;
+
+      /* Apply boundary condition */
+      const double pjx[3] = {gpj->x[0] + shift[0], gpj->x[1] + shift[1],
+                             gpj->x[2] + shift[2]};
+
+      /* Loop over every particle in the other cell. */
+      for (int pid = 0; pid < gcount_i; pid++) {
+
+        /* Get a hold of the ith part in ci. */
+        const struct gpart *restrict gpi = &gparts_i[pid];
+
+        /* Compute the pairwise distance. */
+        const float dx[3] = {pjx[0] - gpi->x[0],   // x
+                             pjx[1] - gpi->x[1],   // y
+                             pjx[2] - gpi->x[2]};  // z
+        const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Check that particles have been drifted to the current time */
+        if (gpi->ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+        if (gpj->ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+#endif
+
+        /* Interact ! */
+        runner_iact_grav_pp_nonsym(r2, dx, gpj, gpi);
+
+#ifdef SWIFT_DEBUG_CHECKS
+        gpj->num_interacted++;
+#endif
+      }
+    }
+  }
+#endif
 }
 
 /**
  * @brief Computes the interaction of all the particles in a cell with all the
- * particles of another cell.
+ * particles of another cell using the truncated Newtonian potential
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The other #cell.
+ * @param shift The distance vector (periodically wrapped) between the cell
+ * centres.
+ */
+void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci,
+                                     struct cell *cj, double shift[3]) {
+
+  /* Some constants */
+  const struct engine *const e = r->e;
+  const struct space *s = e->s;
+  const double cell_width = s->width[0];
+  const double a_smooth = e->gravity_properties->a_smooth;
+  const double rlr = cell_width * a_smooth;
+  const float rlr_inv = 1. / rlr;
+
+  /* Caches to play with */
+  struct gravity_cache *const ci_cache = &r->ci_gravity_cache;
+  struct gravity_cache *const cj_cache = &r->cj_gravity_cache;
+
+  /* Cell properties */
+  const int gcount_i = ci->gcount;
+  const int gcount_j = cj->gcount;
+  struct gpart *restrict gparts_i = ci->gparts;
+  struct gpart *restrict gparts_j = cj->gparts;
+  const int ci_active = cell_is_active(ci, e);
+  const int cj_active = cell_is_active(cj, e);
+  const double loc_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
+  const double loc_j[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
+  const double loc_mean[3] = {0.5 * (loc_i[0] + loc_j[0]),
+                              0.5 * (loc_i[1] + loc_j[1]),
+                              0.5 * (loc_i[2] + loc_j[2])};
+
+  /* Anything to do here? */
+  if (!ci_active && !cj_active) return;
+
+  /* Check that we fit in cache */
+  if (gcount_i > ci_cache->count || gcount_j > cj_cache->count)
+    error("Not enough space in the caches! gcount_i=%d gcount_j=%d", gcount_i,
+          gcount_j);
+
+  /* Compute the padded counts */
+  const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE;
+  const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE;
+
+  /* Fill the caches */
+  gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
+                         loc_mean);
+  gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
+                         loc_mean);
+
+  /* Ok... Here we go ! */
+
+  if (ci_active) {
+
+    /* Loop over all particles in ci... */
+    for (int pid = 0; pid < gcount_i; pid++) {
+
+      /* Skip inactive particles */
+      if (!gpart_is_active(&gparts_i[pid], e)) continue;
+
+      const float x_i = ci_cache->x[pid];
+      const float y_i = ci_cache->y[pid];
+      const float z_i = ci_cache->z[pid];
+
+      /* Some powers of the softening length */
+      const float h_i = ci_cache->epsilon[pid];
+      const float h2_i = h_i * h_i;
+      const float h_inv_i = 1.f / h_i;
+      const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;
+
+      /* Local accumulators for the acceleration */
+      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+
+      /* Make the compiler understand we are in happy vectorization land */
+      swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT);
+      swift_assume_size(gcount_padded_j, VEC_SIZE);
+
+      /* Loop over every particle in the other cell. */
+      for (int pjd = 0; pjd < gcount_padded_j; pjd++) {
+
+        /* Get info about j */
+        const float x_j = cj_cache->x[pjd];
+        const float y_j = cj_cache->y[pjd];
+        const float z_j = cj_cache->z[pjd];
+        const float mass_j = cj_cache->m[pjd];
+
+        /* Compute the pairwise (square) distance. */
+        const float dx = x_i - x_j;
+        const float dy = y_i - y_j;
+        const float dz = z_i - z_j;
+        const float r2 = dx * dx + dy * dy + dz * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+        /* Check that particles have been drifted to the current time */
+        if (gparts_i[pid].ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+        if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+#endif
+
+        /* Get the inverse distance */
+        const float r_inv = 1.f / sqrtf(r2);
+        const float r = r2 * r_inv;
+
+        float f_ij, W_ij, corr_lr;
+
+        if (r2 >= h2_i) {
+
+          /* Get Newtonian gravity */
+          f_ij = mass_j * r_inv * r_inv * r_inv;
+
+        } else {
+
+          const float ui = r * h_inv_i;
+
+          kernel_grav_eval(ui, &W_ij);
+
+          /* Get softened gravity */
+          f_ij = mass_j * h_inv3_i * W_ij;
+        }
+
+        /* Get long-range correction */
+        const float u_lr = r * rlr_inv;
+        kernel_long_grav_eval(u_lr, &corr_lr);
+        f_ij *= corr_lr;
+
+        /* Store it back */
+        a_x -= f_ij * dx;
+        a_y -= f_ij * dy;
+        a_z -= f_ij * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Update the interaction counter if it's not a padded gpart */
+        if (pjd < gcount_j) gparts_i[pid].num_interacted++;
+#endif
+      }
+
+      /* Store everything back in cache */
+      ci_cache->a_x[pid] = a_x;
+      ci_cache->a_y[pid] = a_y;
+      ci_cache->a_z[pid] = a_z;
+    }
+  }
+
+  /* Now do the opposite loop */
+  if (cj_active) {
+
+    /* Loop over all particles in cj... */
+    for (int pjd = 0; pjd < gcount_j; pjd++) {
+
+      /* Skip inactive particles */
+      if (!gpart_is_active(&gparts_j[pjd], e)) continue;
+
+      const float x_j = cj_cache->x[pjd];
+      const float y_j = cj_cache->y[pjd];
+      const float z_j = cj_cache->z[pjd];
+
+      /* Some powers of the softening length */
+      const float h_j = cj_cache->epsilon[pjd];
+      const float h2_j = h_j * h_j;
+      const float h_inv_j = 1.f / h_j;
+      const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j;
+
+      /* Local accumulators for the acceleration */
+      float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+
+      /* Make the compiler understand we are in happy vectorization land */
+      swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+      swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+      swift_assume_size(gcount_padded_i, VEC_SIZE);
+
+      /* Loop over every particle in the other cell. */
+      for (int pid = 0; pid < gcount_padded_i; pid++) {
+
+        /* Get info about i */
+        const float x_i = ci_cache->x[pid];
+        const float y_i = ci_cache->y[pid];
+        const float z_i = ci_cache->z[pid];
+        const float mass_i = ci_cache->m[pid];
+
+        /* Compute the pairwise (square) distance. */
+        const float dx = x_j - x_i;
+        const float dy = y_j - y_i;
+        const float dz = z_j - z_i;
+        const float r2 = dx * dx + dy * dy + dz * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+        /* Check that particles have been drifted to the current time */
+        if (gparts_j[pjd].ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+        if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+#endif
+
+        /* Get the inverse distance */
+        const float r_inv = 1.f / sqrtf(r2);
+        const float r = r2 * r_inv;
+
+        float f_ji, W_ji, corr_lr;
+
+        if (r2 >= h2_j) {
+
+          /* Get Newtonian gravity */
+          f_ji = mass_i * r_inv * r_inv * r_inv;
+
+        } else {
+
+          const float uj = r * h_inv_j;
+
+          kernel_grav_eval(uj, &W_ji);
+
+          /* Get softened gravity */
+          f_ji = mass_i * h_inv3_j * W_ji;
+        }
+
+        /* Get long-range correction */
+        const float u_lr = r * rlr_inv;
+        kernel_long_grav_eval(u_lr, &corr_lr);
+        f_ji *= corr_lr;
+
+        /* Store it back */
+        a_x -= f_ji * dx;
+        a_y -= f_ji * dy;
+        a_z -= f_ji * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Update the interaction counter if it's not a padded gpart */
+        if (pid < gcount_i) gparts_j[pjd].num_interacted++;
+#endif
+      }
+
+      /* Store everything back in cache */
+      cj_cache->a_x[pjd] = a_x;
+      cj_cache->a_y[pjd] = a_y;
+      cj_cache->a_z[pjd] = a_z;
+    }
+  }
+
+  /* Write back to the particles */
+  if (ci_active) gravity_cache_write_back(ci_cache, gparts_i, gcount_i);
+  if (cj_active) gravity_cache_write_back(cj_cache, gparts_j, gcount_j);
+
+#ifdef MATTHIEU_OLD_STUFF
+  /* Some constants */
+  const struct engine *const e = r->e;
+  const struct space *s = e->s;
+  const double cell_width = s->width[0];
+  const double a_smooth = e->gravity_properties->a_smooth;
+  const double rlr = cell_width * a_smooth;
+  const float rlr_inv = 1. / rlr;
+
+  /* Cell properties */
+  const int gcount_i = ci->gcount;
+  const int gcount_j = cj->gcount;
+  struct gpart *restrict gparts_i = ci->gparts;
+  struct gpart *restrict gparts_j = cj->gparts;
+
+  /* MATTHIEU: Should we use local DP accumulators ? */
+
+  /* Loop over all particles in ci... */
+  if (cell_is_active(ci, e)) {
+    for (int pid = 0; pid < gcount_i; pid++) {
+
+      /* Get a hold of the ith part in ci. */
+      struct gpart *restrict gpi = &gparts_i[pid];
+
+      if (!gpart_is_active(gpi, e)) continue;
+
+      /* Apply boundary condition */
+      const double pix[3] = {gpi->x[0] - shift[0], gpi->x[1] - shift[1],
+                             gpi->x[2] - shift[2]};
+
+      /* Loop over every particle in the other cell. */
+      for (int pjd = 0; pjd < gcount_j; pjd++) {
+
+        /* Get a hold of the jth part in cj. */
+        const struct gpart *restrict gpj = &gparts_j[pjd];
+
+        /* Compute the pairwise distance. */
+        const float dx[3] = {pix[0] - gpj->x[0],   // x
+                             pix[1] - gpj->x[1],   // y
+                             pix[2] - gpj->x[2]};  // z
+        const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Check that particles have been drifted to the current time */
+        if (gpi->ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+        if (gpj->ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+#endif
+
+        /* Interact ! */
+        runner_iact_grav_pp_truncated_nonsym(r2, dx, gpi, gpj, rlr_inv);
+
+#ifdef SWIFT_DEBUG_CHECKS
+        gpi->num_interacted++;
+#endif
+      }
+    }
+  }
+
+  /* Loop over all particles in cj... */
+  if (cell_is_active(cj, e)) {
+    for (int pjd = 0; pjd < gcount_j; pjd++) {
+
+      /* Get a hold of the jth part in cj. */
+      struct gpart *restrict gpj = &gparts_j[pjd];
+
+      if (!gpart_is_active(gpj, e)) continue;
+
+      /* Apply boundary condition */
+      const double pjx[3] = {gpj->x[0] + shift[0], gpj->x[1] + shift[1],
+                             gpj->x[2] + shift[2]};
+
+      /* Loop over every particle in the other cell. */
+      for (int pid = 0; pid < gcount_i; pid++) {
+
+        /* Get a hold of the ith part in ci. */
+        const struct gpart *restrict gpi = &gparts_i[pid];
+
+        /* Compute the pairwise distance. */
+        const float dx[3] = {pjx[0] - gpi->x[0],   // x
+                             pjx[1] - gpi->x[1],   // y
+                             pjx[2] - gpi->x[2]};  // z
+        const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
+
+#ifdef SWIFT_DEBUG_CHECKS
+        /* Check that particles have been drifted to the current time */
+        if (gpi->ti_drift != e->ti_current)
+          error("gpi not drifted to current time");
+        if (gpj->ti_drift != e->ti_current)
+          error("gpj not drifted to current time");
+#endif
+
+        /* Interact ! */
+        runner_iact_grav_pp_truncated_nonsym(r2, dx, gpj, gpi, rlr_inv);
+
+#ifdef SWIFT_DEBUG_CHECKS
+        gpj->num_interacted++;
+#endif
+      }
+    }
+  }
+
+#endif
+}
+
+/**
+ * @brief Computes the interaction of all the particles in a cell with all the
+ * particles of another cell (switching function between full and truncated).
  *
  * @param r The #runner.
  * @param ci The first #cell.
  * @param cj The other #cell.
+ */
+void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) {
+
+  /* Some properties of the space */
+  const struct engine *e = r->e;
+  const struct space *s = e->s;
+  const int periodic = s->periodic;
+  const double cell_width = s->width[0];
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
+  const double a_smooth = e->gravity_properties->a_smooth;
+  const double r_cut_min = e->gravity_properties->r_cut_min;
+  const double min_trunc = cell_width * r_cut_min * a_smooth;
+  double shift[3] = {0.0, 0.0, 0.0};
+
+  TIMER_TIC;
+
+  /* Anything to do here? */
+  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
+
+  /* Let's start by drifting things */
+  if (!cell_are_gpart_drifted(ci, e)) cell_drift_gpart(ci, e);
+  if (!cell_are_gpart_drifted(cj, e)) cell_drift_gpart(cj, e);
+
+  /* Can we use the Newtonian version or do we need the truncated one ? */
+  if (!periodic) {
+    runner_dopair_grav_pp_full(r, ci, cj, shift);
+  } else {
+
+    /* Get the relative distance between the pairs, wrapping. */
+    shift[0] = nearest(cj->loc[0] - ci->loc[0], dim[0]);
+    shift[1] = nearest(cj->loc[1] - ci->loc[1], dim[1]);
+    shift[2] = nearest(cj->loc[2] - ci->loc[2], dim[2]);
+    const double r2 =
+        shift[0] * shift[0] + shift[1] * shift[1] + shift[2] * shift[2];
+
+    /* Get the maximal distance between any two particles */
+    const double max_r = sqrt(r2) + ci->multipole->r_max + cj->multipole->r_max;
+
+    /* Do we need to use the truncated interactions ? */
+    if (max_r > min_trunc)
+      runner_dopair_grav_pp_truncated(r, ci, cj, shift);
+    else
+      runner_dopair_grav_pp_full(r, ci, cj, shift);
+  }
+
+  TIMER_TOC(timer_dopair_grav_pp);
+}
+
+/**
+ * @brief Computes the interaction of all the particles in a cell using the
+ * full Newtonian potential.
+ *
+ * @param r The #runner.
+ * @param c The #cell.
  *
  * @todo Use a local cache for the particles.
  */
-void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) {
+void runner_doself_grav_pp_full(struct runner *r, struct cell *c) {
+
+  /* Some constants */
+  const struct engine *const e = r->e;
+  struct gravity_cache *const ci_cache = &r->ci_gravity_cache;
+
+  /* Cell properties */
+  const int gcount = c->gcount;
+  struct gpart *restrict gparts = c->gparts;
+  const int c_active = cell_is_active(c, e);
+  const double loc[3] = {c->loc[0] + 0.5 * c->width[0],
+                         c->loc[1] + 0.5 * c->width[1],
+                         c->loc[2] + 0.5 * c->width[2]};
+
+  /* Anything to do here? */
+  if (!c_active) return;
+
+  /* Check that we fit in cache */
+  if (gcount > ci_cache->count)
+    error("Not enough space in the cache! gcount=%d", gcount);
+
+  /* Compute the padded counts */
+  const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;
+
+  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
+
+  /* Ok... Here we go ! */
+
+  /* Loop over all particles in ci... */
+  for (int pid = 0; pid < gcount; pid++) {
 
-  const struct engine *e = r->e;
-  const int gcount_i = ci->gcount;
-  const int gcount_j = cj->gcount;
-  struct gpart *restrict gparts_i = ci->gparts;
-  struct gpart *restrict gparts_j = cj->gparts;
-  const float a_smooth = e->gravity_properties->a_smooth;
-  const float rlr_inv = 1. / (a_smooth * ci->super->width[0]);
+    /* Skip inactive particles */
+    if (!gpart_is_active(&gparts[pid], e)) continue;
 
-  TIMER_TIC;
+    const float x_i = ci_cache->x[pid];
+    const float y_i = ci_cache->y[pid];
+    const float z_i = ci_cache->z[pid];
 
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
+    /* Some powers of the softening length */
+    const float h_i = ci_cache->epsilon[pid];
+    const float h2_i = h_i * h_i;
+    const float h_inv_i = 1.f / h_i;
+    const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;
 
-  /* Let's start by drifting things */
-  if (!cell_are_gpart_drifted(ci, e)) cell_drift_gpart(ci, e);
-  if (!cell_are_gpart_drifted(cj, e)) cell_drift_gpart(cj, e);
+    /* Local accumulators for the acceleration */
+    float a_x = 0.f, a_y = 0.f, a_z = 0.f;
 
-#if ICHECK > 0
-  for (int pid = 0; pid < gcount_i; pid++) {
+    /* Make the compiler understand we are in happy vectorization land */
+    swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+    swift_assume_size(gcount_padded, VEC_SIZE);
 
-    /* Get a hold of the ith part in ci. */
-    struct gpart *restrict gp = &gparts_i[pid];
+    /* Loop over every other particle in the cell. */
+    for (int pjd = 0; pjd < gcount_padded; pjd++) {
 
-    if (gp->id_or_neg_offset == ICHECK)
-      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
-              gp->id_or_neg_offset, cj->loc[0], cj->loc[1], cj->loc[2],
-              cj->width[0], cj->gcount);
-  }
+      /* No self interaction */
+      if (pid == pjd) continue;
 
-  for (int pid = 0; pid < gcount_j; pid++) {
+      /* Get info about j */
+      const float x_j = ci_cache->x[pjd];
+      const float y_j = ci_cache->y[pjd];
+      const float z_j = ci_cache->z[pjd];
+      const float mass_j = ci_cache->m[pjd];
 
-    /* Get a hold of the ith part in ci. */
-    struct gpart *restrict gp = &gparts_j[pid];
+      /* Compute the pairwise (square) distance. */
+      const float dx = x_i - x_j;
+      const float dy = y_i - y_j;
+      const float dz = z_i - z_j;
+      const float r2 = dx * dx + dy * dy + dz * dz;
 
-    if (gp->id_or_neg_offset == ICHECK)
-      message("id=%lld loc=[ %f %f %f ] size= %f count=%d",
-              gp->id_or_neg_offset, ci->loc[0], ci->loc[1], ci->loc[2],
-              ci->width[0], ci->gcount);
-  }
+#ifdef SWIFT_DEBUG_CHECKS
+      if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+      /* Check that particles have been drifted to the current time */
+      if (gparts[pid].ti_drift != e->ti_current)
+        error("gpi not drifted to current time");
+      if (pjd < gcount && gparts[pjd].ti_drift != e->ti_current)
+        error("gpj not drifted to current time");
 #endif
 
-  /* MATTHIEU: Should we use local DP accumulators ? */
+      /* Get the inverse distance */
+      const float r_inv = 1.f / sqrtf(r2);
 
-  /* Loop over all particles in ci... */
-  if (cell_is_active(ci, e)) {
-    for (int pid = 0; pid < gcount_i; pid++) {
+      float f_ij, W_ij;
 
-      /* Get a hold of the ith part in ci. */
-      struct gpart *restrict gpi = &gparts_i[pid];
+      if (r2 >= h2_i) {
 
-      if (!gpart_is_active(gpi, e)) continue;
+        /* Get Newtonian gravity */
+        f_ij = mass_j * r_inv * r_inv * r_inv;
 
-      /* Loop over every particle in the other cell. */
-      for (int pjd = 0; pjd < gcount_j; pjd++) {
+      } else {
 
-        /* Get a hold of the jth part in cj. */
-        const struct gpart *restrict gpj = &gparts_j[pjd];
+        const float r = r2 * r_inv;
+        const float ui = r * h_inv_i;
 
-        /* Compute the pairwise distance. */
-        const float dx[3] = {gpi->x[0] - gpj->x[0],   // x
-                             gpi->x[1] - gpj->x[1],   // y
-                             gpi->x[2] - gpj->x[2]};  // z
-        const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
+        kernel_grav_eval(ui, &W_ij);
 
-#ifdef SWIFT_DEBUG_CHECKS
-        /* Check that particles have been drifted to the current time */
-        if (gpi->ti_drift != e->ti_current)
-          error("gpi not drifted to current time");
-        if (gpj->ti_drift != e->ti_current)
-          error("gpj not drifted to current time");
-#endif
+        /* Get softened gravity */
+        f_ij = mass_j * h_inv3_i * W_ij;
+      }
 
-        /* Interact ! */
-        runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpi, gpj);
+      /* Store it back */
+      a_x -= f_ij * dx;
+      a_y -= f_ij * dy;
+      a_z -= f_ij * dz;
 
 #ifdef SWIFT_DEBUG_CHECKS
-        gpi->num_interacted++;
+      /* Update the interaction counter if it's not a padded gpart */
+      if (pjd < gcount) gparts[pid].num_interacted++;
 #endif
-      }
     }
+
+    /* Store everything back in cache */
+    ci_cache->a_x[pid] = a_x;
+    ci_cache->a_y[pid] = a_y;
+    ci_cache->a_z[pid] = a_z;
   }
 
-  /* Loop over all particles in cj... */
-  if (cell_is_active(cj, e)) {
-    for (int pjd = 0; pjd < gcount_j; pjd++) {
+  /* Write back to the particles */
+  gravity_cache_write_back(ci_cache, gparts, gcount);
 
-      /* Get a hold of the ith part in ci. */
-      struct gpart *restrict gpj = &gparts_j[pjd];
+#ifdef MATTHIEU_OLD_STUFF
 
-      if (!gpart_is_active(gpj, e)) continue;
+  /* Some constants */
+  const struct engine *const e = r->e;
 
-      /* Loop over every particle in the other cell. */
-      for (int pid = 0; pid < gcount_i; pid++) {
+  /* Cell properties */
+  const int gcount = c->gcount;
+  struct gpart *restrict gparts = c->gparts;
 
-        /* Get a hold of the ith part in ci. */
-        const struct gpart *restrict gpi = &gparts_i[pid];
+  /* MATTHIEU: Should we use local DP accumulators ? */
 
-        /* Compute the pairwise distance. */
-        const float dx[3] = {gpj->x[0] - gpi->x[0],   // x
-                             gpj->x[1] - gpi->x[1],   // y
-                             gpj->x[2] - gpi->x[2]};  // z
-        const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
+  /* Loop over all particles in ci... */
+  for (int pid = 0; pid < gcount; pid++) {
+
+    /* Get a hold of the ith part in ci. */
+    struct gpart *restrict gpi = &gparts[pid];
+
+    /* Loop over every particle in the other cell. */
+    for (int pjd = pid + 1; pjd < gcount; pjd++) {
+
+      /* Get a hold of the jth part in ci. */
+      struct gpart *restrict gpj = &gparts[pjd];
+
+      /* Compute the pairwise distance. */
+      float dx[3] = {gpi->x[0] - gpj->x[0],   // x
+                     gpi->x[1] - gpj->x[1],   // y
+                     gpi->x[2] - gpj->x[2]};  // z
+      const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
 
 #ifdef SWIFT_DEBUG_CHECKS
-        /* Check that particles have been drifted to the current time */
-        if (gpi->ti_drift != e->ti_current)
-          error("gpi not drifted to current time");
-        if (gpj->ti_drift != e->ti_current)
-          error("gpj not drifted to current time");
+      /* Check that particles have been drifted to the current time */
+      if (gpi->ti_drift != e->ti_current)
+        error("gpi not drifted to current time");
+      if (gpj->ti_drift != e->ti_current)
+        error("gpj not drifted to current time");
 #endif
 
-        /* Interact ! */
-        runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpj, gpi);
+      /* Interact ! */
+      if (gpart_is_active(gpi, e) && gpart_is_active(gpj, e)) {
+
+        runner_iact_grav_pp(r2, dx, gpi, gpj);
 
 #ifdef SWIFT_DEBUG_CHECKS
+        gpi->num_interacted++;
         gpj->num_interacted++;
 #endif
+
+      } else {
+
+        if (gpart_is_active(gpi, e)) {
+
+          runner_iact_grav_pp_nonsym(r2, dx, gpi, gpj);
+
+#ifdef SWIFT_DEBUG_CHECKS
+          gpi->num_interacted++;
+#endif
+
+        } else if (gpart_is_active(gpj, e)) {
+
+          dx[0] = -dx[0];
+          dx[1] = -dx[1];
+          dx[2] = -dx[2];
+          runner_iact_grav_pp_nonsym(r2, dx, gpj, gpi);
+
+#ifdef SWIFT_DEBUG_CHECKS
+          gpj->num_interacted++;
+#endif
+        }
       }
     }
   }
 
-  TIMER_TOC(timer_dopair_grav_pp);
+#endif
 }
 
 /**
- * @brief Computes the interaction of all the particles in a cell directly
+ * @brief Computes the interaction of all the particles in a cell using the
+ * truncated Newtonian potential.
  *
  * @param r The #runner.
  * @param c The #cell.
  *
  * @todo Use a local cache for the particles.
  */
-void runner_doself_grav_pp(struct runner *r, struct cell *c) {
+void runner_doself_grav_pp_truncated(struct runner *r, struct cell *c) {
 
-  const struct engine *e = r->e;
+  /* Some constants */
+  const struct engine *const e = r->e;
+  const struct space *s = e->s;
+  const double cell_width = s->width[0];
+  const double a_smooth = e->gravity_properties->a_smooth;
+  const double rlr = cell_width * a_smooth;
+  const float rlr_inv = 1. / rlr;
+
+  /* Caches to play with */
+  struct gravity_cache *const ci_cache = &r->ci_gravity_cache;
+
+  /* Cell properties */
   const int gcount = c->gcount;
   struct gpart *restrict gparts = c->gparts;
-  const float a_smooth = e->gravity_properties->a_smooth;
-  const float rlr_inv = 1. / (a_smooth * c->super->width[0]);
+  const int c_active = cell_is_active(c, e);
+  const double loc[3] = {c->loc[0] + 0.5 * c->width[0],
+                         c->loc[1] + 0.5 * c->width[1],
+                         c->loc[2] + 0.5 * c->width[2]};
 
-  TIMER_TIC;
+  /* Anything to do here? */
+  if (!c_active) return;
+
+  /* Check that we fit in cache */
+  if (gcount > ci_cache->count)
+    error("Not enough space in the caches! gcount=%d", gcount);
+
+  /* Compute the padded count */
+  const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;
+
+  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
+
+  /* Ok... Here we go ! */
+
+  /* Loop over all particles in ci... */
+  for (int pid = 0; pid < gcount; pid++) {
+
+    /* Skip inactive particles */
+    if (!gpart_is_active(&gparts[pid], e)) continue;
+
+    const float x_i = ci_cache->x[pid];
+    const float y_i = ci_cache->y[pid];
+    const float z_i = ci_cache->z[pid];
+
+    /* Some powers of the softening length */
+    const float h_i = ci_cache->epsilon[pid];
+    const float h2_i = h_i * h_i;
+    const float h_inv_i = 1.f / h_i;
+    const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i;
+
+    /* Local accumulators for the acceleration */
+    float a_x = 0.f, a_y = 0.f, a_z = 0.f;
+
+    /* Make the compiler understand we are in happy vectorization land */
+    swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+    swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+    swift_assume_size(gcount_padded, VEC_SIZE);
+
+    /* Loop over every other particle in the cell. */
+    for (int pjd = 0; pjd < gcount_padded; pjd++) {
+
+      /* No self interaction */
+      if (pid == pjd) continue;
+
+      /* Get info about j */
+      const float x_j = ci_cache->x[pjd];
+      const float y_j = ci_cache->y[pjd];
+      const float z_j = ci_cache->z[pjd];
+      const float mass_j = ci_cache->m[pjd];
+
+      /* Compute the pairwise (square) distance. */
+      const float dx = x_i - x_j;
+      const float dy = y_i - y_j;
+      const float dz = z_i - z_j;
+      const float r2 = dx * dx + dy * dy + dz * dz;
 
 #ifdef SWIFT_DEBUG_CHECKS
-  if (c->gcount == 0) error("Doing self gravity on an empty cell !");
+      if (r2 == 0.f) error("Interacting particles with 0 distance");
+
+      /* Check that particles have been drifted to the current time */
+      if (gparts[pid].ti_drift != e->ti_current)
+        error("gpi not drifted to current time");
+      if (pjd < gcount && gparts[pjd].ti_drift != e->ti_current)
+        error("gpj not drifted to current time");
 #endif
 
-  /* Anything to do here? */
-  if (!cell_is_active(c, e)) return;
+      /* Get the inverse distance */
+      const float r_inv = 1.f / sqrtf(r2);
+      const float r = r2 * r_inv;
 
-  /* Do we need to start by drifting things ? */
-  if (!cell_are_gpart_drifted(c, e)) cell_drift_gpart(c, e);
+      float f_ij, W_ij, corr_lr;
 
-#if ICHECK > 0
-  for (int pid = 0; pid < gcount; pid++) {
+      if (r2 >= h2_i) {
 
-    /* Get a hold of the ith part in ci. */
-    struct gpart *restrict gp = &gparts[pid];
+        /* Get Newtonian gravity */
+        f_ij = mass_j * r_inv * r_inv * r_inv;
 
-    if (gp->id_or_neg_offset == ICHECK)
-      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
-              gp->id_or_neg_offset, c->loc[0], c->loc[1], c->loc[2],
-              c->width[0], c->gcount);
-  }
+      } else {
+
+        const float ui = r * h_inv_i;
+
+        kernel_grav_eval(ui, &W_ij);
+
+        /* Get softened gravity */
+        f_ij = mass_j * h_inv3_i * W_ij;
+      }
+
+      /* Get long-range correction */
+      const float u_lr = r * rlr_inv;
+      kernel_long_grav_eval(u_lr, &corr_lr);
+      f_ij *= corr_lr;
+
+      /* Store it back */
+      a_x -= f_ij * dx;
+      a_y -= f_ij * dy;
+      a_z -= f_ij * dz;
+
+#ifdef SWIFT_DEBUG_CHECKS
+      /* Update the interaction counter if it's not a padded gpart */
+      if (pjd < gcount) gparts[pid].num_interacted++;
 #endif
+    }
+
+    /* Store everything back in cache */
+    ci_cache->a_x[pid] = a_x;
+    ci_cache->a_y[pid] = a_y;
+    ci_cache->a_z[pid] = a_z;
+  }
+
+  /* Write back to the particles */
+  gravity_cache_write_back(ci_cache, gparts, gcount);
+
+#ifdef MATTHIEU_OLD_STUFF
+  /* Some constants */
+  const struct engine *const e = r->e;
+  const struct space *s = e->s;
+  const double cell_width = s->width[0];
+  const double a_smooth = e->gravity_properties->a_smooth;
+  const double rlr = cell_width * a_smooth;
+  const float rlr_inv = 1. / rlr;
+
+  /* Cell properties */
+  const int gcount = c->gcount;
+  struct gpart *restrict gparts = c->gparts;
 
   /* MATTHIEU: Should we use local DP accumulators ? */
 
@@ -364,7 +1303,7 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) {
       /* Interact ! */
       if (gpart_is_active(gpi, e) && gpart_is_active(gpj, e)) {
 
-        runner_iact_grav_pp(rlr_inv, r2, dx, gpi, gpj);
+        runner_iact_grav_pp_truncated(r2, dx, gpi, gpj, rlr_inv);
 
 #ifdef SWIFT_DEBUG_CHECKS
         gpi->num_interacted++;
@@ -375,7 +1314,7 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) {
 
         if (gpart_is_active(gpi, e)) {
 
-          runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpi, gpj);
+          runner_iact_grav_pp_truncated_nonsym(r2, dx, gpi, gpj, rlr_inv);
 
 #ifdef SWIFT_DEBUG_CHECKS
           gpi->num_interacted++;
@@ -386,7 +1325,7 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) {
           dx[0] = -dx[0];
           dx[1] = -dx[1];
           dx[2] = -dx[2];
-          runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpj, gpi);
+          runner_iact_grav_pp_truncated_nonsym(r2, dx, gpj, gpi, rlr_inv);
 
 #ifdef SWIFT_DEBUG_CHECKS
           gpj->num_interacted++;
@@ -395,6 +1334,53 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) {
       }
     }
   }
+#endif
+}
+
+/**
+ * @brief Computes the interaction of all the particles in a cell directly
+ * (Switching function between truncated and full)
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+void runner_doself_grav_pp(struct runner *r, struct cell *c) {
+
+  /* Some properties of the space */
+  const struct engine *e = r->e;
+  const struct space *s = e->s;
+  const int periodic = s->periodic;
+  const double cell_width = s->width[0];
+  const double a_smooth = e->gravity_properties->a_smooth;
+  const double r_cut_min = e->gravity_properties->r_cut_min;
+  const double min_trunc = cell_width * r_cut_min * a_smooth;
+
+  TIMER_TIC;
+
+#ifdef SWIFT_DEBUG_CHECKS
+  if (c->gcount == 0) error("Doing self gravity on an empty cell !");
+#endif
+
+  /* Anything to do here? */
+  if (!cell_is_active(c, e)) return;
+
+  /* Do we need to start by drifting things ? */
+  if (!cell_are_gpart_drifted(c, e)) cell_drift_gpart(c, e);
+
+  /* Can we use the Newtonian version or do we need the truncated one ? */
+  if (!periodic) {
+    runner_doself_grav_pp_full(r, c);
+  } else {
+
+    /* Get the maximal distance between any two particles */
+    const double max_r = 2 * c->multipole->r_max;
+
+    /* Do we need to use the truncated interactions ? */
+    if (max_r > min_trunc)
+      runner_doself_grav_pp_truncated(r, c);
+    else
+      runner_doself_grav_pp_full(r, c);
+  }
 
   TIMER_TOC(timer_doself_grav_pp);
 }
@@ -415,8 +1401,14 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
 
   /* Some constants */
   const struct engine *e = r->e;
+  const struct space *s = e->s;
+  const int periodic = s->periodic;
+  const double cell_width = s->width[0];
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
   const struct gravity_props *props = e->gravity_properties;
   const double theta_crit_inv = props->theta_crit_inv;
+  const double max_distance = props->a_smooth * props->r_cut_max * cell_width;
+  const double max_distance2 = max_distance * max_distance;
 
 #ifdef SWIFT_DEBUG_CHECKS
 
@@ -436,35 +1428,47 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
     error("cj->multipole not drifted.");
 #endif
 
-#if ICHECK > 0
-  for (int pid = 0; pid < ci->gcount; pid++) {
-
-    /* Get a hold of the ith part in ci. */
-    struct gpart *restrict gp = &ci->gparts[pid];
+  TIMER_TIC;
 
-    if (gp->id_or_neg_offset == ICHECK)
-      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
-              gp->id_or_neg_offset, cj->loc[0], cj->loc[1], cj->loc[2],
-              cj->width[0], cj->gcount);
-  }
+  /* Anything to do here? */
+  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
 
-  for (int pid = 0; pid < cj->gcount; pid++) {
+  /* Recover the multipole information */
+  struct gravity_tensors *const multi_i = ci->multipole;
+  struct gravity_tensors *const multi_j = cj->multipole;
 
-    /* Get a hold of the ith part in ci. */
-    struct gpart *restrict gp = &cj->gparts[pid];
+  /* Get the distance between the CoMs */
+  double dx = multi_i->CoM[0] - multi_j->CoM[0];
+  double dy = multi_i->CoM[1] - multi_j->CoM[1];
+  double dz = multi_i->CoM[2] - multi_j->CoM[2];
 
-    if (gp->id_or_neg_offset == ICHECK)
-      message("id=%lld loc=[ %f %f %f ] size= %f count= %d",
-              gp->id_or_neg_offset, ci->loc[0], ci->loc[1], ci->loc[2],
-              ci->width[0], ci->gcount);
+  /* Apply BC */
+  if (periodic) {
+    dx = nearest(dx, dim[0]);
+    dy = nearest(dy, dim[1]);
+    dz = nearest(dz, dim[2]);
   }
+  const double r2 = dx * dx + dy * dy + dz * dz;
+
+  /* Are we beyond the distance where the truncated forces are 0? */
+  if (periodic && r2 > max_distance2) {
+
+#ifdef SWIFT_DEBUG_CHECKS
+    /* Need to account for the interactions we missed */
+    if (cell_is_active(ci, e))
+      multi_i->pot.num_interacted += multi_j->m_pole.num_gpart;
+    if (cell_is_active(cj, e))
+      multi_j->pot.num_interacted += multi_i->m_pole.num_gpart;
 #endif
+    return;
+  }
 
-  TIMER_TIC;
+  /* OK, we actually need to compute this pair. Let's find the cheapest
+   * option... */
 
   /* Can we use M-M interactions ? */
-  if (gravity_multipole_accept(ci->multipole, cj->multipole, theta_crit_inv,
-                               0)) {
+  if (gravity_multipole_accept(multi_i, multi_j, theta_crit_inv, r2)) {
+
     /* MATTHIEU: make a symmetric M-M interaction function ! */
     runner_dopair_grav_mm(r, ci, cj);
     runner_dopair_grav_mm(r, cj, ci);
@@ -476,8 +1480,8 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
   /* Alright, we'll have to split and recurse. */
   else {
 
-    const double ri_max = ci->multipole->r_max;
-    const double rj_max = cj->multipole->r_max;
+    const double ri_max = multi_i->r_max;
+    const double rj_max = multi_j->r_max;
 
     /* Split the larger of the two cells and start over again */
     if (ri_max > rj_max) {
@@ -543,6 +1547,9 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj,
  */
 void runner_doself_grav(struct runner *r, struct cell *c, int gettimer) {
 
+  /* Some constants */
+  const struct engine *e = r->e;
+
 #ifdef SWIFT_DEBUG_CHECKS
   /* Early abort? */
   if (c->gcount == 0) error("Doing self gravity on an empty cell !");
@@ -550,6 +1557,9 @@ void runner_doself_grav(struct runner *r, struct cell *c, int gettimer) {
 
   TIMER_TIC;
 
+  /* Anything to do here? */
+  if (!cell_is_active(c, e)) return;
+
   /* If the cell is split, interact each progeny with itself, and with
      each of its siblings. */
   if (c->split) {
@@ -617,8 +1627,14 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, int timer) {
 
   /* Some constants */
   const struct engine *e = r->e;
+  const struct space *s = e->s;
   const struct gravity_props *props = e->gravity_properties;
+  const int periodic = s->periodic;
+  const double cell_width = s->width[0];
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
   const double theta_crit_inv = props->theta_crit_inv;
+  const double max_distance = props->a_smooth * props->r_cut_max * cell_width;
+  const double max_distance2 = max_distance * max_distance;
 
   TIMER_TIC;
 
@@ -627,38 +1643,86 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, int timer) {
   const int nr_cells = e->s->nr_cells;
 
   /* Anything to do here? */
-  if (!cell_is_active(ci, e)) return;  // MATTHIEU (should never happen)
+  if (!cell_is_active(ci, e)) return;
 
   /* Check multipole has been drifted */
   if (ci->ti_old_multipole != e->ti_current)
     error("Interacting un-drifted multipole");
 
+  /* Recover the local multipole */
+  struct gravity_tensors *const multi_i = ci->multipole;
+  const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]};
+  const double CoM_rebuild_i[3] = {multi_i->CoM_rebuild[0],
+                                   multi_i->CoM_rebuild[1],
+                                   multi_i->CoM_rebuild[2]};
+
   /* Loop over all the top-level cells and go for a M-M interaction if
    * well-separated */
   for (int i = 0; i < nr_cells; ++i) {
 
-    /* Handle on the top-level cell */
+    /* Handle on the top-level cell and its gravity business */
     struct cell *cj = &cells[i];
+    const struct gravity_tensors *const multi_j = cj->multipole;
 
     /* Avoid stupid cases */
     if (ci == cj || cj->gcount == 0) continue;
 
+    /* Get the distance between the CoMs */
+    double dx = CoM_i[0] - multi_j->CoM[0];
+    double dy = CoM_i[1] - multi_j->CoM[1];
+    double dz = CoM_i[2] - multi_j->CoM[2];
+
+    /* Apply BC */
+    if (periodic) {
+      dx = nearest(dx, dim[0]);
+      dy = nearest(dy, dim[1]);
+      dz = nearest(dz, dim[2]);
+    }
+    const double r2 = dx * dx + dy * dy + dz * dz;
+
+    /* Are we beyond the distance where the truncated forces are 0? */
+    if (periodic && r2 > max_distance2) {
+
+#ifdef SWIFT_DEBUG_CHECKS
+      /* Need to account for the interactions we missed */
+      multi_i->pot.num_interacted += multi_j->m_pole.num_gpart;
+#endif
+      continue;
+    }
+
     /* Check the multipole acceptance criterion */
-    if (gravity_multipole_accept(ci->multipole, cj->multipole, theta_crit_inv,
-                                 0)) {
+    if (gravity_multipole_accept(multi_i, multi_j, theta_crit_inv, r2)) {
 
       /* Go for a (non-symmetric) M-M calculation */
       runner_dopair_grav_mm(r, ci, cj);
-    }
-    /* Is the criterion violated now but was OK at the last rebuild ? */
-    else if (gravity_multipole_accept(ci->multipole, cj->multipole,
-                                      theta_crit_inv, 1)) {
 
-      /* Alright, we have to take charge of that pair in a different way. */
-      // MATTHIEU: We should actually open the tree-node here and recurse.
-      runner_dopair_grav_mm(r, ci, cj);
+    } else {
+
+      /* Let's check whether we need to still operate on this pair */
+
+      /* Get the distance between the CoMs at the last rebuild */
+      double dx = CoM_rebuild_i[0] - multi_j->CoM_rebuild[0];
+      double dy = CoM_rebuild_i[1] - multi_j->CoM_rebuild[1];
+      double dz = CoM_rebuild_i[2] - multi_j->CoM_rebuild[2];
+
+      /* Apply BC */
+      if (periodic) {
+        dx = nearest(dx, dim[0]);
+        dy = nearest(dy, dim[1]);
+        dz = nearest(dz, dim[2]);
+      }
+      const double r2_rebuild = dx * dx + dy * dy + dz * dz;
+
+      /* Is the criterion violated now but was OK at the last rebuild ? */
+      if (gravity_multipole_accept_rebuild(multi_i, multi_j, theta_crit_inv,
+                                           r2_rebuild)) {
+
+        /* Alright, we have to take charge of that pair in a different way. */
+        // MATTHIEU: We should actually open the tree-node here and recurse.
+        runner_dopair_grav_mm(r, ci, cj);
+      }
     }
-  }
+  } /* Loop over top-level cells */
 
   if (timer) TIMER_TOC(timer_dograv_long_range);
 }
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index acf83b95d564ba81db8586fc0fbd3e10c0bc6cd5..552fb91d099c4d10c847abcad1fc5b33e61b8799 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -20,13 +20,12 @@
 /* Config parameters. */
 #include "../config.h"
 
-#include "swift.h"
-
-#include "active.h"
-
 /* This object's header. */
 #include "runner_doiact_vec.h"
 
+/* Local headers. */
+#include "active.h"
+
 #ifdef WITH_VECTORIZATION
 static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);
 
@@ -76,8 +75,8 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
     *icount_align += pad;
 
     /* Initialise masks to true. */
-    vec_init_mask(int_mask);
-    vec_init_mask(int_mask2);
+    vec_init_mask_true(int_mask);
+    vec_init_mask_true(int_mask2);
 
     /* Pad secondary cache so that there are no contributions in the interaction
      * function. */
@@ -124,10 +123,6 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
  * @param v_dx #vector of the x separation between two particles.
  * @param v_dy #vector of the y separation between two particles.
  * @param v_dz #vector of the z separation between two particles.
- * @param v_mj #vector of the mass of particle pj.
- * @param v_vjx #vector of x velocity of pj.
- * @param v_vjy #vector of y velocity of pj.
- * @param v_vjz #vector of z velocity of pj.
  * @param cell_cache #cache of all particles in the cell.
  * @param int_cache (return) secondary #cache of interactions between two
  * particles.
@@ -212,8 +207,8 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
                         v_hi_inv, v_vix, v_viy, v_viz, &icount_align);
 
     mask_t int_mask, int_mask2;
-    vec_init_mask(int_mask);
-    vec_init_mask(int_mask2);
+    vec_init_mask_true(int_mask);
+    vec_init_mask_true(int_mask2);
 
     /* Perform interactions. */
     for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
@@ -310,7 +305,6 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions(
     /* Perform remainder interaction and remove remainder from aligned
      * interaction count. */
     *icount_align = icount - rem;
-
     runner_iact_nonsym_2_vec_force(
         &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align],
         &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], v_vix,
@@ -370,7 +364,7 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions(
     vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum,
     vector *v_sigSum, vector *entropy_dtSum, vector v_hi_inv, vector v_vix,
     vector v_viy, vector v_viz, vector v_rhoi, vector v_grad_hi,
-    vector v_pOrhoi2, vector v_balsara_i, vector v_ci) {
+    vector v_pOrhoi2, vector v_balsara_i, vector v_ci, int num_vec_proc) {
 
 /* Left-pack values needed into the secondary cache using the interaction mask.
  */
@@ -437,7 +431,7 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions(
 #endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */
 
   /* Flush the c2 cache if it has reached capacity. */
-  if (*icount >= (C2_CACHE_SIZE - (2 * VEC_SIZE))) {
+  if (*icount >= (C2_CACHE_SIZE - (num_vec_proc * VEC_SIZE))) {
 
     int icount_align = *icount;
 
@@ -454,7 +448,7 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions(
     vec_init_mask(int_mask2);
 
     /* Perform interactions. */
-    for (int pjd = 0; pjd < icount_align; pjd += (2 * VEC_SIZE)) {
+    for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) {
 
       runner_iact_nonsym_2_vec_force(
           &int_cache->r2q[pjd], &int_cache->dxq[pjd], &int_cache->dyq[pjd],
@@ -473,92 +467,138 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions(
   }
 }
 
-/* @brief Populates the arrays max_di and max_dj with the maximum distances of
+/**
+ * @brief Populates the arrays max_index_i and max_index_j with the maximum
+ * indices of
  * particles into their neighbouring cells. Also finds the first pi that
  * interacts with any particle in cj and the last pj that interacts with any
  * particle in ci.
+ *
  * @param ci #cell pointer to ci
  * @param cj #cell pointer to cj
  * @param sort_i #entry array for particle distance in ci
  * @param sort_j #entry array for particle distance in cj
- * @param ci_cache #cache for cell ci
- * @param cj_cache #cache for cell cj
  * @param dx_max maximum particle movement allowed in cell
  * @param rshift cutoff shift
- * @param max_di array to hold the maximum distances of pi particles into cell
+ * @param hi_max Maximal smoothing length in cell ci
+ * @param hj_max Maximal smoothing length in cell cj
+ * @param di_max Maximal position on the axis that can interact in cell ci
+ * @param dj_min Minimal position on the axis that can interact in cell cj
+ * @param max_index_i array to hold the maximum indices of pi particles into
+ * cell
  * cj
- * @param max_dj array to hold the maximum distances of pj particles into cell
+ * @param max_index_j array to hold the maximum indices of pj particles into
+ * cell
  * cj
  * @param init_pi first pi to interact with a pj particle
  * @param init_pj last pj to interact with a pi particle
+ * @param e The #engine.
  */
-__attribute__((always_inline)) INLINE static void populate_max_d_no_cache(
+__attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
     const struct cell *ci, const struct cell *cj,
     const struct entry *restrict sort_i, const struct entry *restrict sort_j,
-    const float dx_max, const float rshift, float *max_di, float *max_dj,
-    int *init_pi, int *init_pj, const struct engine *e) {
+    const float dx_max, const float rshift, const double hi_max,
+    const double hj_max, const double di_max, const double dj_min,
+    int *max_index_i, int *max_index_j, int *init_pi, int *init_pj,
+    const struct engine *e) {
 
-  struct part *restrict parts_i = ci->parts;
-  struct part *restrict parts_j = cj->parts;
-  struct part *p = &parts_i[sort_i[0].i];
+  const struct part *restrict parts_i = ci->parts;
+  const struct part *restrict parts_j = cj->parts;
 
-  float h, d;
+  int first_pi = 0, last_pj = cj->count - 1;
+  int temp;
+
+  /* Find the leftmost active particle in cell i that interacts with any
+   * particle in cell j. */
+  first_pi = ci->count;
+  int active_id = first_pi - 1;
+  while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min) {
+    first_pi--;
+    /* Store the index of the particle if it is active. */
+    if (part_is_active(&parts_i[sort_i[first_pi].i], e)) active_id = first_pi;
+  }
 
-  /* Get the distance of the last pi and the first pj on the sorted axis.*/
-  const float di_max = sort_i[ci->count - 1].d - rshift;
-  const float dj_min = sort_j[0].d;
+  /* Set the first active pi in range of any particle in cell j. */
+  first_pi = active_id;
 
-  int first_pi = 0, last_pj = cj->count - 1;
+  /* Find the maximum index into cell j for each particle in range in cell i. */
+  if (first_pi < ci->count) {
 
-  /* Find the first active particle in ci to interact with any particle in cj.
-   */
-  /* Populate max_di with distances. */
-  int active_id = ci->count - 1;
-  for (int k = ci->count - 1; k >= 0; k--) {
-    p = &parts_i[sort_i[k].i];
-    h = p->h;
-    d = sort_i[k].d + h * kernel_gamma + dx_max - rshift;
-
-    max_di[k] = d;
-
-    /* If the particle is out of range set the index to
-     * the last active particle within range. */
-    if (d < dj_min) {
-      first_pi = active_id;
-      break;
-    } else {
-      if (part_is_active(p, e)) active_id = k;
+    /* Start from the first particle in cell j. */
+    temp = 0;
+
+    const struct part *pi = &parts_i[sort_i[first_pi].i];
+
+    /* Loop through particles in cell j until they are not in range of pi. */
+    while (temp <= cj->count &&
+           (sort_i[first_pi].d + (pi->h * kernel_gamma + dx_max - rshift) >
+            sort_j[temp].d))
+      temp++;
+
+    max_index_i[first_pi] = temp;
+
+    /* Populate max_index_i for remaining particles that are within range. */
+    for (int i = first_pi + 1; i < ci->count; i++) {
+      temp = max_index_i[i - 1];
+      pi = &parts_i[sort_i[i].i];
+
+      while (temp <= cj->count &&
+             (sort_i[i].d + (pi->h * kernel_gamma + dx_max - rshift) >
+              sort_j[temp].d))
+        temp++;
+
+      max_index_i[i] = temp;
     }
+  } else {
+    /* Make sure that max index is set to first particle in cj.*/
+    max_index_i[ci->count - 1] = 0;
   }
 
-  /* Find the maximum distance of pi particles into cj.*/
-  for (int k = first_pi + 1; k < ci->count; k++) {
-    max_di[k] = fmaxf(max_di[k - 1], max_di[k]);
+  /* Find the rightmost active particle in cell j that interacts with any
+   * particle in cell i. */
+  last_pj = -1;
+  active_id = last_pj;
+  while (last_pj < cj->count &&
+         sort_j[last_pj + 1].d - hj_max - dx_max < di_max) {
+    last_pj++;
+    /* Store the index of the particle if it is active. */
+    if (part_is_active(&parts_j[sort_j[last_pj].i], e)) active_id = last_pj;
   }
 
-  /* Find the last particle in cj to interact with any particle in ci. */
-  /* Populate max_dj with distances. */
-  active_id = 0;
-  for (int k = 0; k < cj->count; k++) {
-    p = &parts_j[sort_j[k].i];
-    h = p->h;
-    d = sort_j[k].d - h * kernel_gamma - dx_max - rshift;
+  /* Set the last active pj in range of any particle in cell i. */
+  last_pj = active_id;
 
-    max_dj[k] = d;
+  /* Find the maximum index into cell i for each particle in range in cell j. */
+  if (last_pj > 0) {
 
-    /* If the particle is out of range set the index to
-     * the last active particle within range. */
-    if (d > di_max) {
-      last_pj = active_id;
-      break;
-    } else {
-      if (part_is_active(p, e)) active_id = k;
-    }
-  }
+    /* Start from the last particle in cell i. */
+    temp = ci->count - 1;
+
+    const struct part *pj = &parts_j[sort_j[last_pj].i];
 
-  /* Find the maximum distance of pj particles into ci.*/
-  for (int k = 1; k <= last_pj; k++) {
-    max_dj[k] = fmaxf(max_dj[k - 1], max_dj[k]);
+    /* Loop through particles in cell i until they are not in range of pj. */
+    while (temp > 0 &&
+           sort_j[last_pj].d - dx_max - (pj->h * kernel_gamma) <
+               sort_i[temp].d - rshift)
+      temp--;
+
+    max_index_j[last_pj] = temp;
+
+    /* Populate max_index_j for remaining particles that are within range. */
+    for (int i = last_pj - 1; i >= 0; i--) {
+      temp = max_index_j[i + 1];
+      pj = &parts_j[sort_j[i].i];
+
+      while (temp > 0 &&
+             sort_j[i].d - dx_max - (pj->h * kernel_gamma) <
+                 sort_i[temp].d - rshift)
+        temp--;
+
+      max_index_j[i] = temp;
+    }
+  } else {
+    /* Make sure that max index is set to last particle in ci.*/
+    max_index_j[0] = ci->count - 1;
   }
 
   *init_pi = first_pi;
@@ -703,7 +743,8 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
 
       /* Form a mask from r2 < hig2 and r2 > 0.*/
-      mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2, v_doi_mask2_self_check;
+      mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2,
+          v_doi_mask2_self_check;
       int doi_mask, doi_mask_self_check, doi_mask2, doi_mask2_self_check;
 
       /* Form r2 > 0 mask and r2 < hig2 mask. */
@@ -711,7 +752,8 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
 
       /* Form r2 > 0 mask and r2 < hig2 mask. */
-      vec_create_mask(v_doi_mask2_self_check, vec_cmp_gt(v_r2_2.v, vec_setzero()));
+      vec_create_mask(v_doi_mask2_self_check,
+                      vec_cmp_gt(v_r2_2.v, vec_setzero()));
       vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v));
 
       /* Form integer masks. */
@@ -720,7 +762,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
 
       doi_mask2_self_check = vec_form_int_mask(v_doi_mask2_self_check);
       doi_mask2 = vec_form_int_mask(v_doi_mask2);
-      
+
       /* Combine the two masks. */
       doi_mask = doi_mask & doi_mask_self_check;
       doi_mask2 = doi_mask2 & doi_mask2_self_check;
@@ -752,8 +794,8 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
     /* Initialise masks to true in case remainder interactions have been
      * performed. */
     mask_t int_mask, int_mask2;
-    vec_init_mask(int_mask);
-    vec_init_mask(int_mask2);
+    vec_init_mask_true(int_mask);
+    vec_init_mask_true(int_mask2);
 
     /* Perform interaction with 2 vectors. */
     for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) {
@@ -954,7 +996,7 @@ for (int pid = 0; pid < count; pid++) {
           doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, cell_cache,
           &int_cache, &icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum,
           &h_dtSum, &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy,
-          v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci);
+          v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, 2);
     }
 
   } /* Loop over all other particles. */
@@ -968,8 +1010,8 @@ for (int pid = 0; pid < count; pid++) {
   /* Initialise masks to true in case remainder interactions have been
    * performed. */
   mask_t int_mask, int_mask2;
-  vec_init_mask(int_mask);
-  vec_init_mask(int_mask2);
+  vec_init_mask_true(int_mask);
+  vec_init_mask_true(int_mask2);
 
   /* Perform interaction with 2 vectors. */
   for (int pjd = 0; pjd < icount_align; pjd += (2 * VEC_SIZE)) {
@@ -1007,9 +1049,12 @@ TIMER_TOC(timer_doself_force);
  * @param r The #runner.
  * @param ci The first #cell.
  * @param cj The second #cell.
 * @param sid The direction of the pair.
+ * @param shift The shift vector to apply to the particles in ci.
  */
 void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
-                                struct cell *cj) {
+                                struct cell *cj, const int sid,
+                                const double *shift) {
 
 #ifdef WITH_VECTORIZATION
   const struct engine *restrict e = r->e;
@@ -1018,29 +1063,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
   TIMER_TIC;
 
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
-
-  if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e))
-    error("Interacting undrifted cells.");
-
-  /* Get the sort ID. */
-  double shift[3] = {0.0, 0.0, 0.0};
-  const int sid = space_getsid(e->s, &ci, &cj, shift);
-
-  /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin)
-    runner_do_sort(r, ci, (1 << sid), 1);
-  if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin)
-    runner_do_sort(r, cj, (1 << sid), 1);
-
   /* Get the cutoff shift. */
   double rshift = 0.0;
   for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
 
   /* Pick-out the sorted lists. */
-  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
-  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
+  const struct entry *restrict sort_i = ci->sort[sid];
+  const struct entry *restrict sort_j = cj->sort[sid];
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Check that the dx_max_sort values in the cell are indeed an upper
@@ -1051,8 +1080,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
                     p->x[1] * runner_shift[sid][1] +
                     p->x[2] * runner_shift[sid][2];
     if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
-        1.0e-6 * max(fabsf(d), ci->dx_max_sort))
-      error("particle shift diff exceeds dx_max_sort.");
+        1.0e-4 * max(fabsf(d), ci->dx_max_sort_old))
+      error(
+          "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d "
+          "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e "
+          "ci->dx_max_sort_old=%e",
+          ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort,
+          ci->dx_max_sort_old);
   }
   for (int pjd = 0; pjd < cj->count; pjd++) {
     const struct part *p = &cj->parts[sort_j[pjd].i];
@@ -1060,8 +1094,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
                     p->x[1] * runner_shift[sid][1] +
                     p->x[2] * runner_shift[sid][2];
     if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
-        1.0e-6 * max(fabsf(d), cj->dx_max_sort))
-      error("particle shift diff exceeds dx_max_sort.");
+        1.0e-4 * max(fabsf(d), cj->dx_max_sort_old))
+      error(
+          "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d "
+          "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e "
+          "cj->dx_max_sort_old=%e",
+          cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort,
+          cj->dx_max_sort_old);
   }
 #endif /* SWIFT_DEBUG_CHECKS */
 
@@ -1113,37 +1152,19 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   }
 
   int first_pi, last_pj;
-  float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-  float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  int *max_index_i __attribute__((aligned(sizeof(int) * VEC_SIZE)));
+  int *max_index_j __attribute__((aligned(sizeof(int) * VEC_SIZE)));
 
-  max_di = r->ci_cache.max_d;
-  max_dj = r->cj_cache.max_d;
+  max_index_i = r->ci_cache.max_index;
+  max_index_j = r->cj_cache.max_index;
 
-  /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
+  /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[].
+   */
   /* Also find the first pi that interacts with any particle in cj and the last
    * pj that interacts with any particle in ci. */
-  populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di,
-                          max_dj, &first_pi, &last_pj, e);
-
-  /* Find the maximum index into cj that is required by a particle in ci. */
-  /* Find the maximum index into ci that is required by a particle in cj. */
-  float di, dj;
-  int max_ind_j = count_j - 1;
-  int max_ind_i = 0;
-
-  dj = sort_j[max_ind_j].d;
-  while (max_ind_j > 0 && max_di[count_i - 1] < dj) {
-    max_ind_j--;
-
-    dj = sort_j[max_ind_j].d;
-  }
-
-  di = sort_i[max_ind_i].d;
-  while (max_ind_i < count_i - 1 && max_dj[0] > di) {
-    max_ind_i++;
-
-    di = sort_i[max_ind_i].d;
-  }
+  populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max,
+                              hj_max, di_max, dj_min, max_index_i, max_index_j,
+                              &first_pi, &last_pj, e);
 
   /* Limits of the outer loops. */
   int first_pi_loop = first_pi;
@@ -1151,8 +1172,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
   /* Take the max/min of both values calculated to work out how many particles
    * to read into the cache. */
-  last_pj = max(last_pj, max_ind_j);
-  first_pi = min(first_pi, max_ind_i);
+  last_pj = max(last_pj, max_index_i[count_i - 1]);
+  first_pi = min(first_pi, max_index_j[0]);
 
   /* Read the needed particles into the two caches. */
   int first_pi_align = first_pi;
@@ -1166,26 +1187,25 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
   if (cell_is_active(ci, e)) {
 
-    /* Loop over the parts in ci. */
-    for (int pid = count_i - 1; pid >= first_pi_loop && max_ind_j >= 0; pid--) {
+    /* Loop over the parts in ci until nothing is within range in cj. */
+    for (int pid = count_i - 1; pid >= first_pi_loop; pid--) {
 
       /* Get a hold of the ith part in ci. */
       struct part *restrict pi = &parts_i[sort_i[pid].i];
       if (!part_is_active(pi, e)) continue;
 
-      /* Determine the exit iteration of the interaction loop. */
-      dj = sort_j[max_ind_j].d;
-      while (max_ind_j > 0 && max_di[pid] < dj) {
-        max_ind_j--;
-
-        dj = sort_j[max_ind_j].d;
-      }
-      int exit_iteration = max_ind_j + 1;
-
       /* Set the cache index. */
       int ci_cache_idx = pid - first_pi_align;
 
+      /* Skip this particle if no particle in cj is within range of it. */
       const float hi = ci_cache->h[ci_cache_idx];
+      const double di_test =
+          sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
+      if (di_test < dj_min) continue;
+
+      /* Determine the exit iteration of the interaction loop. */
+      int exit_iteration = max_index_i[pid];
+
       const float hig2 = hi * hi * kernel_gamma2;
 
       vector pix, piy, piz;
@@ -1294,26 +1314,27 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   }
 
   if (cell_is_active(cj, e)) {
-    /* Loop over the parts in cj. */
-    for (int pjd = 0; pjd <= last_pj_loop && max_ind_i < count_i; pjd++) {
+
+    /* Loop over the parts in cj until nothing is within range in ci. */
+    for (int pjd = 0; pjd <= last_pj_loop; pjd++) {
 
       /* Get a hold of the jth part in cj. */
       struct part *restrict pj = &parts_j[sort_j[pjd].i];
       if (!part_is_active(pj, e)) continue;
 
-      /* Determine the exit iteration of the interaction loop. */
-      di = sort_i[max_ind_i].d;
-      while (max_ind_i < count_i - 1 && max_dj[pjd] > di) {
-        max_ind_i++;
-
-        di = sort_i[max_ind_i].d;
-      }
-      int exit_iteration = max_ind_i;
-
       /* Set the cache index. */
       int cj_cache_idx = pjd;
 
      /* TODO: check whether the rshift term belongs in dj_test below. */
+      /* Skip this particle if no particle in ci is within range of it. */
       const float hj = cj_cache->h[cj_cache_idx];
+      const double dj_test =
+          sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
+      if (dj_test > di_max) continue;
+
+      /* Determine the exit iteration of the interaction loop. */
+      int exit_iteration = max_index_j[pjd];
+
       const float hjg2 = hj * hj * kernel_gamma2;
 
       vector pjx, pjy, pjz;
diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h
index 50d0722d577c38a4cb3cce35a339795b399161fe..09dc76ef04df5d29ea32f4af24efdc09e433aa73 100644
--- a/src/runner_doiact_vec.h
+++ b/src/runner_doiact_vec.h
@@ -37,6 +37,7 @@
 void runner_doself1_density_vec(struct runner *r, struct cell *restrict c);
 void runner_doself2_force_vec(struct runner *r, struct cell *restrict c);
 void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci,
-                                struct cell *restrict cj);
+                                struct cell *restrict cj, const int sid,
+                                const double *shift);
 
 #endif /* SWIFT_RUNNER_VEC_H */
diff --git a/src/scheduler.c b/src/scheduler.c
index b07c403e4ecd960b22b51f24372ca0a3420a453f..4081cde0489b1b439ceb46fc9b4e191541f15bef 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -127,7 +127,8 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
     redo = 0;
 
     /* Non-splittable task? */
-    if ((t->ci == NULL) || (t->type == task_type_pair && t->cj == NULL)) {
+    if ((t->ci == NULL) || (t->type == task_type_pair && t->cj == NULL) ||
+        t->ci->count == 0 || (t->cj != NULL && t->cj->count == 0)) {
       t->type = task_type_none;
       t->subtype = task_subtype_none;
       t->cj = NULL;
@@ -140,7 +141,6 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
 
       /* Get a handle on the cell involved. */
       struct cell *ci = t->ci;
-      const double width = ci->dmin;
 
       /* Foreign task? */
       if (ci->nodeID != s->nodeID) {
@@ -149,18 +149,14 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
       }
 
       /* Is this cell even split and the task does not violate h ? */
-      if (ci->split && 2.f * kernel_gamma * ci->h_max * space_stretch < width) {
+      if (cell_can_split_self_task(ci)) {
 
         /* Make a sub? */
-        if (scheduler_dosub && /* Note division here to avoid overflow */
-            (ci->count > 0 && ci->count < space_subsize / ci->count)) {
+        if (scheduler_dosub && ci->count < space_subsize_self) {
 
           /* convert to a self-subtask. */
           t->type = task_type_sub_self;
 
-          /* Depend on local sorts on this cell. */
-          if (ci->sorts != NULL) scheduler_addunlock(s, ci->sorts, t);
-
           /* Otherwise, make tasks explicitly. */
         } else {
 
@@ -172,7 +168,7 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
           while (ci->progeny[first_child] == NULL) first_child++;
           t->ci = ci->progeny[first_child];
           for (int k = first_child + 1; k < 8; k++)
-            if (ci->progeny[k] != NULL)
+            if (ci->progeny[k] != NULL && ci->progeny[k]->count)
               scheduler_splittask_hydro(
                   scheduler_addtask(s, task_type_self, t->subtype, 0, 0,
                                     ci->progeny[k], NULL),
@@ -180,9 +176,9 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
 
           /* Make a task for each pair of progeny */
           for (int j = 0; j < 8; j++)
-            if (ci->progeny[j] != NULL)
+            if (ci->progeny[j] != NULL && ci->progeny[j]->count)
               for (int k = j + 1; k < 8; k++)
-                if (ci->progeny[k] != NULL)
+                if (ci->progeny[k] != NULL && ci->progeny[k]->count)
                   scheduler_splittask_hydro(
                       scheduler_addtask(s, task_type_pair, t->subtype,
                                         sub_sid_flag[j][k], 0, ci->progeny[j],
@@ -191,16 +187,6 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
         }
       } /* Cell is split */
 
-      /* Otherwise, make sure the self task has a drift task */
-      else {
-
-        lock_lock(&ci->lock);
-
-        if (ci->drift_part == NULL)
-          ci->drift_part = scheduler_addtask(s, task_type_drift_part,
-                                             task_subtype_none, 0, 0, ci, NULL);
-        lock_unlock_blind(&ci->lock);
-      }
     } /* Self interaction */
 
     /* Pair interaction? */
@@ -221,26 +207,17 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
       double shift[3];
       const int sid = space_getsid(s->space, &ci, &cj, shift);
 
-      const double width_i = ci->dmin;
-      const double width_j = cj->dmin;
-
       /* Should this task be split-up? */
-      if (ci->split && cj->split &&
-          2.f * kernel_gamma * space_stretch * ci->h_max < width_i &&
-          2.f * kernel_gamma * space_stretch * cj->h_max < width_j) {
+      if (cell_can_split_pair_task(ci) && cell_can_split_pair_task(cj)) {
 
         /* Replace by a single sub-task? */
-        if (scheduler_dosub &&
-            ci->count * sid_scale[sid] < space_subsize / cj->count &&
+        if (scheduler_dosub && /* Use division to avoid integer overflow. */
+            ci->count * sid_scale[sid] < space_subsize_pair / cj->count &&
             !sort_is_corner(sid)) {
 
           /* Make this task a sub task. */
           t->type = task_type_sub_pair;
 
-          /* Depend on the sort tasks of both cells. */
-          if (ci->sorts != NULL) scheduler_addunlock(s, ci->sorts, t);
-          if (cj->sorts != NULL) scheduler_addunlock(s, cj->sorts, t);
-
           /* Otherwise, split it. */
         } else {
 
@@ -593,44 +570,15 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
         t->type = task_type_none;
 
         for (int j = 0; j < 8; j++)
-          if (ci->progeny[j] != NULL)
+          if (ci->progeny[j] != NULL && ci->progeny[j]->count)
             for (int k = 0; k < 8; k++)
-              if (cj->progeny[k] != NULL) {
+              if (cj->progeny[k] != NULL && cj->progeny[k]->count) {
                 struct task *tl =
                     scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
                                       ci->progeny[j], cj->progeny[k]);
                 scheduler_splittask_hydro(tl, s);
                 tl->flags = space_getsid(s->space, &t->ci, &t->cj, shift);
               }
-
-        /* Otherwise, if not spilt, stitch-up the sorting. */
-      } else {
-
-        /* Create the drift and sort for ci. */
-        lock_lock(&ci->lock);
-        if (ci->drift_part == NULL && ci->nodeID == engine_rank)
-          ci->drift_part = scheduler_addtask(s, task_type_drift_part,
-                                             task_subtype_none, 0, 0, ci, NULL);
-        if (ci->sorts == NULL)
-          ci->sorts = scheduler_addtask(s, task_type_sort, task_subtype_none,
-                                        1 << sid, 0, ci, NULL);
-        else
-          ci->sorts->flags |= (1 << sid);
-        lock_unlock_blind(&ci->lock);
-        scheduler_addunlock(s, ci->sorts, t);
-
-        /* Create the drift and sort for cj. */
-        lock_lock(&cj->lock);
-        if (cj->drift_part == NULL && cj->nodeID == engine_rank)
-          cj->drift_part = scheduler_addtask(s, task_type_drift_part,
-                                             task_subtype_none, 0, 0, cj, NULL);
-        if (cj->sorts == NULL)
-          cj->sorts = scheduler_addtask(s, task_type_sort, task_subtype_none,
-                                        1 << sid, 0, cj, NULL);
-        else
-          cj->sorts->flags |= (1 << sid);
-        lock_unlock_blind(&cj->lock);
-        scheduler_addunlock(s, cj->sorts, t);
       }
     } /* pair interaction? */
   }   /* iterate over the current task. */
@@ -672,54 +620,36 @@ static void scheduler_splittask_gravity(struct task *t, struct scheduler *s) {
         break;
       }
 
-      /* Is this cell even split? */
-      if (ci->split) {
-
-        /* Make a sub? */
-        if (scheduler_dosub && /* Note division here to avoid overflow */
-            (ci->gcount > 0 && ci->gcount < space_subsize / ci->gcount)) {
-
-          /* convert to a self-subtask. */
-          t->type = task_type_sub_self;
-
-          /* Make sure we have a drift task (MATTHIEU temp. fix) */
-          lock_lock(&ci->lock);
-          if (ci->drift_gpart == NULL)
-            ci->drift_gpart = scheduler_addtask(
-                s, task_type_drift_gpart, task_subtype_none, 0, 0, ci, NULL);
-          lock_unlock_blind(&ci->lock);
-
-          /* Otherwise, make tasks explicitly. */
-        } else {
-
-          /* Take a step back (we're going to recycle the current task)... */
-          redo = 1;
-
-          /* Add the self tasks. */
-          int first_child = 0;
-          while (ci->progeny[first_child] == NULL) first_child++;
-          t->ci = ci->progeny[first_child];
-          for (int k = first_child + 1; k < 8; k++)
-            if (ci->progeny[k] != NULL)
-              scheduler_splittask_gravity(
-                  scheduler_addtask(s, task_type_self, t->subtype, 0, 0,
-                                    ci->progeny[k], NULL),
-                  s);
-
-          /* Make a task for each pair of progeny */
-          if (t->subtype != task_subtype_external_grav) {
-            for (int j = 0; j < 8; j++)
-              if (ci->progeny[j] != NULL)
-                for (int k = j + 1; k < 8; k++)
-                  if (ci->progeny[k] != NULL)
-                    scheduler_splittask_gravity(
-                        scheduler_addtask(s, task_type_pair, t->subtype,
-                                          sub_sid_flag[j][k], 0, ci->progeny[j],
-                                          ci->progeny[k]),
-                        s);
-          }
+      /* Should we split this task? */
+      if (ci->split && ci->gcount > space_subsize_self_grav) {
+
+        /* Take a step back (we're going to recycle the current task)... */
+        redo = 1;
+
+        /* Add the self tasks. */
+        int first_child = 0;
+        while (ci->progeny[first_child] == NULL) first_child++;
+        t->ci = ci->progeny[first_child];
+        for (int k = first_child + 1; k < 8; k++)
+          if (ci->progeny[k] != NULL)
+            scheduler_splittask_gravity(
+                scheduler_addtask(s, task_type_self, t->subtype, 0, 0,
+                                  ci->progeny[k], NULL),
+                s);
+
+        /* Make a task for each pair of progeny */
+        if (t->subtype != task_subtype_external_grav) {
+          for (int j = 0; j < 8; j++)
+            if (ci->progeny[j] != NULL)
+              for (int k = j + 1; k < 8; k++)
+                if (ci->progeny[k] != NULL)
+                  scheduler_splittask_gravity(
+                      scheduler_addtask(s, task_type_pair, t->subtype,
+                                        sub_sid_flag[j][k], 0, ci->progeny[j],
+                                        ci->progeny[k]),
+                      s);
         }
-      } /* Cell is split */
+      }
 
       /* Otherwise, make sure the self task has a drift task */
       else {
@@ -747,7 +677,7 @@ static void scheduler_splittask_gravity(struct task *t, struct scheduler *s) {
       }
 
       /* Should this task be split-up? */
-      if (ci->split && cj->split) {
+      if (0 && ci->split && cj->split) {
 
         // MATTHIEU: nothing here for now
 
@@ -813,7 +743,7 @@ void scheduler_splittasks(struct scheduler *s) {
 
   /* Call the mapper on each current task. */
   threadpool_map(s->threadpool, scheduler_splittasks_mapper, s->tasks,
-                 s->nr_tasks, sizeof(struct task), 1000, s);
+                 s->nr_tasks, sizeof(struct task), 0, s);
 }
 
 /**
@@ -823,13 +753,14 @@ void scheduler_splittasks(struct scheduler *s) {
  * @param type The type of the task.
  * @param subtype The sub-type of the task.
  * @param flags The flags of the task.
- * @param wait The number of unsatisfied dependencies of this task.
+ * @param implicit If true, only use this task to unlock dependencies, i.e.
+ *        this task is never enqueued.
  * @param ci The first cell to interact.
  * @param cj The second cell to interact.
  */
 struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
-                               enum task_subtypes subtype, int flags, int wait,
-                               struct cell *ci, struct cell *cj) {
+                               enum task_subtypes subtype, int flags,
+                               int implicit, struct cell *ci, struct cell *cj) {
 
 #ifdef SWIFT_DEBUG_CHECKS
   if (ci == NULL && cj != NULL)
@@ -850,11 +781,11 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
   t->type = type;
   t->subtype = subtype;
   t->flags = flags;
-  t->wait = wait;
+  t->wait = 0;
   t->ci = ci;
   t->cj = cj;
   t->skip = 1; /* Mark tasks as skip by default. */
-  t->implicit = 0;
+  t->implicit = implicit;
   t->weight = 0;
   t->rank = 0;
   t->nr_unlock_tasks = 0;
@@ -1035,9 +966,7 @@ void scheduler_reset(struct scheduler *s, int size) {
   if (size > s->size) {
 
     /* Free existing task lists if necessary. */
-    if (s->tasks != NULL) free(s->tasks);
-    if (s->tasks_ind != NULL) free(s->tasks_ind);
-    if (s->tid_active != NULL) free(s->tid_active);
+    scheduler_free_tasks(s);
 
     /* Allocate the new lists. */
     if (posix_memalign((void *)&s->tasks, task_align,
@@ -1184,11 +1113,6 @@ void scheduler_rewait_mapper(void *map_data, int num_elements,
     if (t->wait < 0)
       error("Task unlocked by more than %d tasks!",
             (1 << (8 * sizeof(t->wait) - 1)) - 1);
-
-    /* Skip sort tasks that have already been performed */
-    if (t->type == task_type_sort && t->flags == 0) {
-      error("Empty sort task encountered.");
-    }
 #endif
 
     /* Sets the waits of the dependances */
@@ -1232,7 +1156,7 @@ void scheduler_start(struct scheduler *s) {
   /* Re-wait the tasks. */
   if (s->active_count > 1000) {
     threadpool_map(s->threadpool, scheduler_rewait_mapper, s->tid_active,
-                   s->active_count, sizeof(int), 1000, s);
+                   s->active_count, sizeof(int), 0, s);
   } else {
     scheduler_rewait_mapper(s->tid_active, s->active_count, s);
   }
@@ -1277,14 +1201,14 @@ void scheduler_start(struct scheduler *s) {
               ci->ti_end_min);
 
         /* Special treatment for sort tasks */
-        if (ci->ti_end_min == ti_current && t->skip &&
+        /* if (ci->ti_end_min == ti_current && t->skip &&
             t->type == task_type_sort && t->flags == 0)
           error(
               "Task (type='%s/%s') should not have been skipped "
               "ti_current=%lld "
               "c->ti_end_min=%lld t->flags=%d",
               taskID_names[t->type], subtaskID_names[t->subtype], ti_current,
-              ci->ti_end_min, t->flags);
+              ci->ti_end_min, t->flags); */
 
       } else { /* pair */
 
@@ -1308,7 +1232,7 @@ void scheduler_start(struct scheduler *s) {
   /* Loop over the tasks and enqueue whoever is ready. */
   if (s->active_count > 1000) {
     threadpool_map(s->threadpool, scheduler_enqueue_mapper, s->tid_active,
-                   s->active_count, sizeof(int), 1000, s);
+                   s->active_count, sizeof(int), 0, s);
   } else {
     scheduler_enqueue_mapper(s->tid_active, s->active_count, s);
   }
@@ -1338,6 +1262,10 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
 
   /* If this is an implicit task, just pretend it's done. */
   if (t->implicit) {
+#ifdef SWIFT_DEBUG_CHECKS
+    t->ti_run = s->space->e->ti_current;
+#endif
+    t->skip = 1;
     for (int j = 0; j < t->nr_unlock_tasks; j++) {
       struct task *t2 = t->unlock_tasks[j];
       if (atomic_dec(&t2->wait) == 1) scheduler_enqueue(s, t2);
@@ -1417,11 +1345,6 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         } else if (t->subtype == task_subtype_xv ||
                    t->subtype == task_subtype_rho ||
                    t->subtype == task_subtype_gradient) {
-#ifdef SWIFT_DEBUG_CHECKS
-          for (int k = 0; k < t->ci->count; k++)
-            if (t->ci->parts[k].ti_drift != s->space->e->ti_current)
-              error("Sending un-drifted particle !");
-#endif
           err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type,
                           t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
           // message( "sending %i parts with tag=%i from %i to %i." ,
@@ -1719,11 +1642,29 @@ void scheduler_print_tasks(const struct scheduler *s, const char *fileName) {
  */
 void scheduler_clean(struct scheduler *s) {
 
-  free(s->tasks);
-  free(s->tasks_ind);
+  scheduler_free_tasks(s);
   free(s->unlocks);
   free(s->unlock_ind);
-  free(s->tid_active);
   for (int i = 0; i < s->nr_queues; ++i) queue_clean(&s->queues[i]);
   free(s->queues);
 }
+
+/**
+ * @brief Free the task arrays allocated by this #scheduler.
+ */
+void scheduler_free_tasks(struct scheduler *s) {
+
+  if (s->tasks != NULL) {
+    free(s->tasks);
+    s->tasks = NULL;
+  }
+  if (s->tasks_ind != NULL) {
+    free(s->tasks_ind);
+    s->tasks_ind = NULL;
+  }
+  if (s->tid_active != NULL) {
+    free(s->tid_active);
+    s->tid_active = NULL;
+  }
+  s->size = 0;
+}
diff --git a/src/scheduler.h b/src/scheduler.h
index 7bf9a40e7cec89eb25dfa6ce7a56912bf3a9e639..ac654580b2af2ffb506dc3fd9f0b988b89effbd0 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -52,7 +52,7 @@
 
 /* Flags . */
 #define scheduler_flag_none 0
-#define scheduler_flag_steal 1
+#define scheduler_flag_steal (1 << 1)
 
 /* Data of a scheduler. */
 struct scheduler {
@@ -133,8 +133,8 @@ void scheduler_reset(struct scheduler *s, int nr_tasks);
 void scheduler_ranktasks(struct scheduler *s);
 void scheduler_reweight(struct scheduler *s, int verbose);
 struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
-                               enum task_subtypes subtype, int flags, int wait,
-                               struct cell *ci, struct cell *cj);
+                               enum task_subtypes subtype, int flags,
+                               int implicit, struct cell *ci, struct cell *cj);
 void scheduler_splittasks(struct scheduler *s);
 struct task *scheduler_done(struct scheduler *s, struct task *t);
 struct task *scheduler_unlock(struct scheduler *s, struct task *t);
@@ -143,5 +143,6 @@ void scheduler_set_unlocks(struct scheduler *s);
 void scheduler_dump_queue(struct scheduler *s);
 void scheduler_print_tasks(const struct scheduler *s, const char *fileName);
 void scheduler_clean(struct scheduler *s);
+void scheduler_free_tasks(struct scheduler *s);
 
 #endif /* SWIFT_SCHEDULER_H */
diff --git a/src/serial_io.c b/src/serial_io.c
index a7e342f0a90fcf4c57f334526ff91b1923de4585..eb1e0e23fb34fd8d6a21230d9e38cfe82c47df1d 100644
--- a/src/serial_io.c
+++ b/src/serial_io.c
@@ -59,19 +59,15 @@
  * @brief Reads a data array from a given HDF5 group.
  *
  * @param grp The group from which to read.
- * @param name The name of the array to read.
- * @param type The #DATA_TYPE of the attribute.
- * @param N The number of particles.
- * @param dim The dimension of the data (1 for scalar, 3 for vector)
- * @param part_c A (char*) pointer on the first occurrence of the field of
- *interest in the parts array
- * @param partSize The size in bytes of the particle structure.
- * @param importance If COMPULSORY, the data must be present in the IC file. If
- *OPTIONAL, the array will be zeroed when the data is not present.
+ * @param props The #io_props of the field to read
+ * @param N The number of particles to read on this rank.
+ * @param N_total The total number of particles on all ranks.
+ * @param offset The offset position where this rank starts reading.
+ * @param internal_units The #unit_system used internally
+ * @param ic_units The #unit_system used in the ICs
  *
  * @todo A better version using HDF5 hyper-slabs to read the file directly into
- *the part array
- * will be written once the structures have been stabilized.
+ * the part array will be written once the structures have been stabilized.
  */
 void readArray(hid_t grp, const struct io_props props, size_t N,
                long long N_total, long long offset,
@@ -274,16 +270,17 @@ void prepareArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile,
  * @param fileName The name of the file in which the data is written
  * @param xmfFile The FILE used to write the XMF description
  * @param partTypeGroupName The name of the group containing the particles in
- *the HDF5 file.
- * @param name The name of the array to write.
- * @param type The #DATA_TYPE of the array.
+ * the HDF5 file.
+ * @param props The #io_props of the field to write
  * @param N The number of particles to write.
- * @param dim The dimension of the data (1 for scalar, 3 for vector)
- * @param part_c A (char*) pointer on the first occurrence of the field of
- *interest in the parts array
- * @param partSize The size in bytes of the particle structure.
- * @param us The unit_system currently in use
- * @param convFactor The UnitConversionFactor for this arrayo
+ * @param N_total The total number of particles on all ranks.
+ * @param offset The offset position where this rank starts writing.
+ * @param mpi_rank The MPI rank of this node
+ * @param internal_units The #unit_system used internally
+ * @param snapshot_units The #unit_system used in the snapshots
+ *
+ * @todo A better version using HDF5 hyper-slabs to write the file directly from
+ * the part array will be written once the structures have been stabilized.
  */
 void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile,
                 char* partTypeGroupName, const struct io_props props, size_t N,
@@ -741,7 +738,7 @@ void write_output_serial(struct engine* e, const char* baseName,
 
   /* File name */
   char fileName[FILENAME_BUFFER_SIZE];
-  snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%03i.hdf5", baseName,
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%04i.hdf5", baseName,
            outputCount);
 
   /* Compute offset in the file and total number of particles */
diff --git a/src/single_io.c b/src/single_io.c
index 0b091a5997504e5f5a4cc3b8af7ca06c994e993c..194563352dff5570b8703f828fac95bccbf7409f 100644
--- a/src/single_io.c
+++ b/src/single_io.c
@@ -64,8 +64,7 @@
  * @param ic_units The #unit_system used in the ICs
  *
  * @todo A better version using HDF5 hyper-slabs to read the file directly into
- *the part array
- * will be written once the structures have been stabilized.
+ * the part array will be written once the structures have been stabilized.
  */
 void readArray(hid_t h_grp, const struct io_props prop, size_t N,
                const struct unit_system* internal_units,
@@ -607,7 +606,7 @@ void write_output_single(struct engine* e, const char* baseName,
 
   /* File name */
   char fileName[FILENAME_BUFFER_SIZE];
-  snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%03i.hdf5", baseName,
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%04i.hdf5", baseName,
            outputCount);
 
   /* First time, we need to create the XMF file */
diff --git a/src/space.c b/src/space.c
index b1612876b6339fb29648a87e9aec93a1d8f64664..52a34248cd9bf38e03e476e4937fa601b0ee9222 100644
--- a/src/space.c
+++ b/src/space.c
@@ -60,9 +60,10 @@
 
 /* Split size. */
 int space_splitsize = space_splitsize_default;
-int space_subsize = space_subsize_default;
+int space_subsize_pair = space_subsize_pair_default;
+int space_subsize_self = space_subsize_self_default;
+int space_subsize_self_grav = space_subsize_self_grav_default;
 int space_maxsize = space_maxsize_default;
-int space_maxcount = space_maxcount_default;
 
 /**
  * @brief Interval stack necessary for parallel particle sorting.
@@ -214,6 +215,8 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements,
     c->scount = 0;
     c->init_grav = NULL;
     c->extra_ghost = NULL;
+    c->ghost_in = NULL;
+    c->ghost_out = NULL;
     c->ghost = NULL;
     c->kick1 = NULL;
     c->kick2 = NULL;
@@ -227,10 +230,15 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements,
     c->grav_long_range = NULL;
     c->grav_down = NULL;
     c->super = c;
-    if (c->sort != NULL) {
-      free(c->sort);
-      c->sort = NULL;
-    }
+    c->parts = NULL;
+    c->xparts = NULL;
+    c->gparts = NULL;
+    c->sparts = NULL;
+    for (int i = 0; i < 13; i++)
+      if (c->sort[i] != NULL) {
+        free(c->sort[i]);
+        c->sort[i] = NULL;
+      }
 #if WITH_MPI
     c->recv_xv = NULL;
     c->recv_rho = NULL;
@@ -245,6 +253,15 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements,
   }
 }
 
+/**
+ * @brief Free up any allocated cells.
+ */
+void space_free_cells(struct space *s) {
+  threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper, s->cells_top,
+                 s->nr_cells, sizeof(struct cell), 0, s);
+  s->maxdepth = 0;
+}
+
 /**
  * @brief Re-build the top-level cell grid.
  *
@@ -308,14 +325,22 @@ void space_regrid(struct space *s, int verbose) {
         "small,\n"
         " - the (minimal) time-step is too large leading to particles with "
         "predicted smoothing lengths too large for the box size,\n"
-        " - particle with velocities so large that they move by more than two "
+        " - particles with velocities so large that they move by more than two "
         "box sizes per time-step.\n");
 
-  /* Check if we have enough cells for gravity. */
-  if (s->gravity && (cdim[0] < 8 || cdim[1] < 8 || cdim[2] < 8))
+  /* Check if we have enough cells for periodic gravity. */
+  if (s->gravity && s->periodic && (cdim[0] < 8 || cdim[1] < 8 || cdim[2] < 8))
     error(
-        "Must have at least 8 cells in each spatial dimension when gravity "
-        "is switched on.");
+        "Must have at least 8 cells in each spatial dimension when periodic "
+        "gravity is switched on.\nThis error is often caused by any of the "
+        "followings:\n"
+        " - too few particles to generate a sensible grid,\n"
+        " - the initial value of 'Scheduler:max_top_level_cells' is too "
+        "small,\n"
+        " - the (minimal) time-step is too large leading to particles with "
+        "predicted smoothing lengths too large for the box size,\n"
+        " - particles with velocities so large that they move by more than two "
+        "box sizes per time-step.\n");
 
 /* In MPI-Land, changing the top-level cell size requires that the
  * global partition is recomputed and the particles redistributed.
@@ -357,19 +382,21 @@ void space_regrid(struct space *s, int verbose) {
 
 /* Be verbose about this. */
 #ifdef SWIFT_DEBUG_CHECKS
-    message("re)griding space cdim=(%d %d %d)", cdim[0], cdim[1], cdim[2]);
+    message("(re)griding space cdim=(%d %d %d)", cdim[0], cdim[1], cdim[2]);
     fflush(stdout);
 #endif
 
     /* Free the old cells, if they were allocated. */
     if (s->cells_top != NULL) {
-      threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper,
-                     s->cells_top, s->nr_cells, sizeof(struct cell), 100, s);
+      space_free_cells(s);
       free(s->cells_top);
       free(s->multipoles_top);
-      s->maxdepth = 0;
     }
 
+    /* Also free the task arrays, these will be regenerated and we can use the
+     * memory while copying the particle arrays. */
+    if (s->e != NULL) scheduler_free_tasks(&s->e->sched);
+
     /* Set the new cell dimensions only if smaller. */
     for (int k = 0; k < 3; k++) {
       s->cdim[k] = cdim[k];
@@ -476,9 +503,7 @@ void space_regrid(struct space *s, int verbose) {
   else { /* Otherwise, just clean up the cells. */
 
     /* Free the old cells, if they were allocated. */
-    threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper,
-                   s->cells_top, s->nr_cells, sizeof(struct cell), 100, s);
-    s->maxdepth = 0;
+    space_free_cells(s);
   }
 
   if (verbose)
@@ -499,7 +524,7 @@ void space_rebuild(struct space *s, int verbose) {
 
 /* Be verbose about this. */
 #ifdef SWIFT_DEBUG_CHECKS
-  if (s->e->nodeID == 0 || verbose) message("re)building space");
+  if (s->e->nodeID == 0 || verbose) message("(re)building space");
   fflush(stdout);
 #endif
 
@@ -910,14 +935,16 @@ void space_rebuild(struct space *s, int verbose) {
     c->ti_old_part = ti_old;
     c->ti_old_gpart = ti_old;
     c->ti_old_multipole = ti_old;
-    c->parts = finger;
-    c->xparts = xfinger;
-    c->gparts = gfinger;
-    c->sparts = sfinger;
-    finger = &finger[c->count];
-    xfinger = &xfinger[c->count];
-    gfinger = &gfinger[c->gcount];
-    sfinger = &sfinger[c->scount];
+    if (c->nodeID == engine_rank) {
+      c->parts = finger;
+      c->xparts = xfinger;
+      c->gparts = gfinger;
+      c->sparts = sfinger;
+      finger = &finger[c->count];
+      xfinger = &xfinger[c->count];
+      gfinger = &gfinger[c->gcount];
+      sfinger = &sfinger[c->scount];
+    }
   }
   // message( "hooking up cells took %.3f %s." ,
   // clocks_from_ticks(getticks() - tic), clocks_getunit());
@@ -954,7 +981,7 @@ void space_split(struct space *s, struct cell *cells, int nr_cells,
   const ticks tic = getticks();
 
   threadpool_map(&s->e->threadpool, space_split_mapper, cells, nr_cells,
-                 sizeof(struct cell), 1, s);
+                 sizeof(struct cell), 0, s);
 
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -962,28 +989,33 @@ void space_split(struct space *s, struct cell *cells, int nr_cells,
 }
 
 /**
- * @brief Runs through the top-level cells and checks whether tasks associated
- * with them can be split. If not, try to sanitize the cells.
+ * @brief #threadpool mapper function to sanitize the cells
  *
- * @param s The #space to act upon.
+ * @param map_data Pointers towards the top-level cells.
+ * @param num_cells The number of top-level cells.
+ * @param extra_data Unused parameter.
  */
-void space_sanitize(struct space *s) {
-
-  s->sanitized = 1;
+void space_sanitize_mapper(void *map_data, int num_cells, void *extra_data) {
+  /* Unpack the inputs. */
+  struct cell *cells_top = (struct cell *)map_data;
 
-  for (int k = 0; k < s->nr_cells; k++) {
+  for (int ind = 0; ind < num_cells; ind++) {
+    struct cell *c = &cells_top[ind];
+    cell_sanitize(c, 0);
+  }
+}
 
-    struct cell *c = &s->cells_top[k];
-    const double min_width = c->dmin;
+/**
+ * @brief Runs through the top-level cells and sanitize their h values
+ *
+ * @param s The #space to act upon.
+ */
+void space_sanitize(struct space *s) {
 
-    /* Do we have a problem ? */
-    if (c->h_max * kernel_gamma * space_stretch > min_width * 0.5 &&
-        c->count > space_maxcount) {
+  if (s->e->nodeID == 0) message("Cleaning up unreasonable values of h");
 
-      /* Ok, clean-up the mess */
-      cell_sanitize(c);
-    }
-  }
+  threadpool_map(&s->e->threadpool, space_sanitize_mapper, s->cells_top,
+                 s->nr_cells, sizeof(struct cell), 0, NULL);
 }
 
 /**
@@ -1166,7 +1198,7 @@ void space_parts_get_cell_index(struct space *s, int *ind, struct cell *cells,
   data.ind = ind;
 
   threadpool_map(&s->e->threadpool, space_parts_get_cell_index_mapper, s->parts,
-                 s->nr_parts, sizeof(struct part), 1000, &data);
+                 s->nr_parts, sizeof(struct part), 0, &data);
 
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -1193,7 +1225,7 @@ void space_gparts_get_cell_index(struct space *s, int *gind, struct cell *cells,
   data.ind = gind;
 
   threadpool_map(&s->e->threadpool, space_gparts_get_cell_index_mapper,
-                 s->gparts, s->nr_gparts, sizeof(struct gpart), 1000, &data);
+                 s->gparts, s->nr_gparts, sizeof(struct gpart), 0, &data);
 
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -1220,7 +1252,7 @@ void space_sparts_get_cell_index(struct space *s, int *sind, struct cell *cells,
   data.ind = sind;
 
   threadpool_map(&s->e->threadpool, space_sparts_get_cell_index_mapper,
-                 s->sparts, s->nr_sparts, sizeof(struct spart), 1000, &data);
+                 s->sparts, s->nr_sparts, sizeof(struct spart), 0, &data);
 
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -1783,10 +1815,11 @@ void space_gparts_sort_mapper(void *map_data, int num_elements,
  */
 void space_map_clearsort(struct cell *c, void *data) {
 
-  if (c->sort != NULL) {
-    free(c->sort);
-    c->sort = NULL;
-  }
+  for (int i = 0; i < 13; i++)
+    if (c->sort[i] != NULL) {
+      free(c->sort[i]);
+      c->sort[i] = NULL;
+    }
 }
 
 /**
@@ -2284,7 +2317,8 @@ void space_recycle(struct space *s, struct cell *c) {
     error("Failed to destroy spinlocks.");
 
   /* Clear this cell's sort arrays. */
-  if (c->sort != NULL) free(c->sort);
+  for (int i = 0; i < 13; i++)
+    if (c->sort[i] != NULL) free(c->sort[i]);
 
   /* Lock the space. */
   lock_lock(&s->lock);
@@ -2336,7 +2370,8 @@ void space_recycle_list(struct space *s, struct cell *cell_list_begin,
       error("Failed to destroy spinlocks.");
 
     /* Clear this cell's sort arrays. */
-    if (c->sort != NULL) free(c->sort);
+    for (int i = 0; i < 13; i++)
+      if (c->sort[i] != NULL) free(c->sort[i]);
 
     /* Count this cell. */
     count += 1;
@@ -2480,7 +2515,7 @@ void space_synchronize_particle_positions(struct space *s) {
       (s->nr_gparts > 0 && s->nr_sparts > 0))
     threadpool_map(&s->e->threadpool,
                    space_synchronize_particle_positions_mapper, s->gparts,
-                   s->nr_gparts, sizeof(struct gpart), 1000, (void *)s);
+                   s->nr_gparts, sizeof(struct gpart), 0, (void *)s);
 }
 
 /**
@@ -2630,7 +2665,6 @@ void space_init(struct space *s, const struct swift_params *params,
   s->dim[0] = dim[0];
   s->dim[1] = dim[1];
   s->dim[2] = dim[2];
-  s->sanitized = 0;
   s->periodic = periodic;
   s->gravity = gravity;
   s->nr_parts = Npart;
@@ -2677,15 +2711,21 @@ void space_init(struct space *s, const struct swift_params *params,
   /* Get the constants for the scheduler */
   space_maxsize = parser_get_opt_param_int(params, "Scheduler:cell_max_size",
                                            space_maxsize_default);
-  space_subsize = parser_get_opt_param_int(params, "Scheduler:cell_sub_size",
-                                           space_subsize_default);
+  space_subsize_pair = parser_get_opt_param_int(
+      params, "Scheduler:cell_sub_size_pair", space_subsize_pair_default);
+  space_subsize_self = parser_get_opt_param_int(
+      params, "Scheduler:cell_sub_size_self", space_subsize_self_default);
+  space_subsize_self_grav =
+      parser_get_opt_param_int(params, "Scheduler:cell_sub_size_self_grav",
+                               space_subsize_self_grav_default);
   space_splitsize = parser_get_opt_param_int(
       params, "Scheduler:cell_split_size", space_splitsize_default);
-  space_maxcount = parser_get_opt_param_int(params, "Scheduler:cell_max_count",
-                                            space_maxcount_default);
+
   if (verbose)
-    message("max_size set to %d, sub_size set to %d, split_size set to %d",
-            space_maxsize, space_subsize, space_splitsize);
+    message(
+        "max_size set to %d, sub_size_pair set to %d, sub_size_self set to %d, "
+        "split_size set to %d",
+        space_maxsize, space_subsize_pair, space_subsize_self, space_splitsize);
 
   /* Apply h scaling */
   const double scaling =
diff --git a/src/space.h b/src/space.h
index e8e8600349c97ff8a60f0fcf2964d6ec514a7589..dbbba714c2b3c9841905b2ba54e4f2d854b820a6 100644
--- a/src/space.h
+++ b/src/space.h
@@ -30,19 +30,22 @@
 #include <stddef.h>
 
 /* Includes. */
-#include "cell.h"
 #include "hydro_space.h"
 #include "lock.h"
 #include "parser.h"
 #include "part.h"
 #include "space.h"
 
+/* Avoid cyclic inclusions */
+struct cell;
+
 /* Some constants. */
 #define space_cellallocchunk 1000
 #define space_splitsize_default 400
 #define space_maxsize_default 8000000
-#define space_subsize_default 64000000
-#define space_maxcount_default 10000
+#define space_subsize_pair_default 256000000
+#define space_subsize_self_default 32000
+#define space_subsize_self_grav_default 32000
 #define space_max_top_level_cells_default 12
 #define space_stretch 1.10f
 #define space_maxreldx 0.1f
@@ -53,8 +56,9 @@
 /* Split size. */
 extern int space_splitsize;
 extern int space_maxsize;
-extern int space_subsize;
-extern int space_maxcount;
+extern int space_subsize_pair;
+extern int space_subsize_self;
+extern int space_subsize_self_grav;
 
 /**
  * @brief The space in which the cells and particles reside.
@@ -139,9 +143,6 @@ struct space {
   /*! Number of queues in the system. */
   int nr_queues;
 
-  /*! Has this space already been sanitized ? */
-  int sanitized;
-
   /*! The associated engine. */
   struct engine *e;
 
@@ -225,5 +226,6 @@ void space_check_timesteps(struct space *s);
 void space_replicate(struct space *s, int replicate, int verbose);
 void space_reset_task_counters(struct space *s);
 void space_clean(struct space *s);
+void space_free_cells(struct space *s);
 
 #endif /* SWIFT_SPACE_H */
diff --git a/src/statistics.c b/src/statistics.c
index 57d60bcb1b247c9616c859b7ac8a475acdcd878f..5a3f1ff4f9a2232a14817e7e55fd2cff5bdcd80e 100644
--- a/src/statistics.c
+++ b/src/statistics.c
@@ -271,12 +271,12 @@ void stats_collect(const struct space *s, struct statistics *stats) {
   /* Run parallel collection of statistics for parts */
   if (s->nr_parts > 0)
     threadpool_map(&s->e->threadpool, stats_collect_part_mapper, s->parts,
-                   s->nr_parts, sizeof(struct part), 10000, &extra_data);
+                   s->nr_parts, sizeof(struct part), 0, &extra_data);
 
   /* Run parallel collection of statistics for gparts */
   if (s->nr_gparts > 0)
     threadpool_map(&s->e->threadpool, stats_collect_gpart_mapper, s->gparts,
-                   s->nr_gparts, sizeof(struct gpart), 10000, &extra_data);
+                   s->nr_gparts, sizeof(struct gpart), 0, &extra_data);
 }
 
 /**
diff --git a/src/swift.h b/src/swift.h
index 20397eb24df478cba65a0e35d686b402f1d8ee70..1d1a7c7d04b3662c524504c292aa7d9eee2c3d09 100644
--- a/src/swift.h
+++ b/src/swift.h
@@ -57,6 +57,7 @@
 #include "sourceterms.h"
 #include "space.h"
 #include "task.h"
+#include "threadpool.h"
 #include "timeline.h"
 #include "timers.h"
 #include "tools.h"
diff --git a/src/task.h b/src/task.h
index 052f3e8036381441e283d3f7847d09e98ec1dac2..dee888c9f16dd69785a31371da15078af4e0af0c 100644
--- a/src/task.h
+++ b/src/task.h
@@ -36,6 +36,7 @@
  * @brief The different task types.
  *
  * Be sure to update the taskID_names array in tasks.c if you modify this list!
+ * Also update the python task plotting scripts!
  */
 enum task_types {
   task_type_none = 0,
@@ -162,6 +163,9 @@ struct task {
   /*! ID of the queue or runner owning this task */
   short int rid;
 
+  /*! Information about the direction of the pair task */
+  short int sid;
+
   /*! Start and end time of this task */
   ticks tic, toc;
 #endif
diff --git a/src/threadpool.c b/src/threadpool.c
index c11fd8121bb02f36fce1796d79a7eb55a38102c4..465756f71d88df81921a880edf8cdb1ee17f6026 100644
--- a/src/threadpool.c
+++ b/src/threadpool.c
@@ -26,13 +26,139 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#ifdef SWIFT_DEBUG_THREADPOOL
+#include <dlfcn.h>
+#endif
 
 /* This object's header. */
 #include "threadpool.h"
 
 /* Local headers. */
 #include "atomic.h"
+#include "clocks.h"
 #include "error.h"
+#include "minmax.h"
+
+#ifdef SWIFT_DEBUG_THREADPOOL
+/**
+ * @brief Store a log entry of the given chunk.
+ */
+void threadpool_log(struct threadpool *tp, int tid, size_t chunk_size,
+                    ticks tic, ticks toc) {
+  struct mapper_log *log = &tp->logs[tid > 0 ? tid : 0];
+
+  /* Check if we need to re-allocate the log buffer. */
+  if (log->count == log->size) {
+    log->size *= 2;
+    struct mapper_log_entry *new_log;
+    if ((new_log = (struct mapper_log_entry *)malloc(
+             sizeof(struct mapper_log_entry) * log->size)) == NULL)
+      error("Failed to re-allocate mapper log.");
+    memcpy(new_log, log->log, sizeof(struct mapper_log_entry) * log->count);
+    free(log->log);
+    log->log = new_log;
+  }
+
+  /* Store the new entry. */
+  struct mapper_log_entry *entry = &log->log[log->count];
+  entry->tid = tid;
+  entry->chunk_size = chunk_size;
+  entry->tic = tic;
+  entry->toc = toc;
+  entry->map_function = tp->map_function;
+  log->count++;
+}
+
+void threadpool_dump_log(struct threadpool *tp, const char *filename,
+                         int reset) {
+
+  /* Open the output file. */
+  FILE *fd;
+  if ((fd = fopen(filename, "w")) == NULL)
+    error("Failed to create log file '%s'.", filename);
+
+  /* Create a buffer of function names. */
+  const int max_names = 100;
+  struct name_entry {
+    threadpool_map_function map_function;
+    const char *name;
+  };
+  struct name_entry names[max_names];
+  bzero(names, sizeof(struct name_entry) * max_names);
+
+  /* Write a header. */
+  fprintf(fd, "# map_function thread_id chunk_size tic toc\n");
+  fprintf(fd, "# {'num_threads': %i, 'cpufreq': %lli}\n", tp->num_threads,
+          clocks_get_cpufreq());
+
+  /* Loop over the per-tid logs and dump them. */
+  for (int k = 0; k < tp->num_threads; k++) {
+    struct mapper_log *log = &tp->logs[k];
+
+    /* Loop over the log entries and dump them. */
+    for (int i = 0; i < log->count; i++) {
+
+      struct mapper_log_entry *entry = &log->log[i];
+
+      /* Look for the function pointer in the buffer. */
+      int nid = 0;
+      while (nid < max_names && names[nid].map_function != entry->map_function)
+        nid++;
+
+      /* If the name was not found, make a new entry. */
+      if (nid == max_names) {
+        for (int j = 1; j < max_names; j++) names[j - 1] = names[j];
+        names[0].map_function = entry->map_function;
+        Dl_info dl_info;
+        dladdr(entry->map_function, &dl_info);
+        names[0].name = dl_info.dli_sname;
+        nid = 0;
+      }
+
+      /* Log a line to the file. */
+      fprintf(fd, "%s %i %i %lli %lli\n", names[nid].name, entry->tid,
+              entry->chunk_size, entry->tic, entry->toc);
+    }
+
+    /* Clear the log if requested. */
+    if (reset) log->count = 0;
+  }
+
+  /* Close the file. */
+  fclose(fd);
+}
+#endif  // SWIFT_DEBUG_THREADPOOL
+
+/**
+ * @brief Runner main loop, get a chunk and call the mapper function.
+ */
+void threadpool_chomp(struct threadpool *tp, int tid) {
+
+  /* Loop until we can't get a chunk. */
+  while (1) {
+    /* Desired chunk size. */
+    size_t chunk_size =
+        (tp->map_data_size - tp->map_data_count) / (2 * tp->num_threads);
+    if (chunk_size > tp->map_data_chunk) chunk_size = tp->map_data_chunk;
+    if (chunk_size < 1) chunk_size = 1;
+
+    /* Get a chunk and check its size. */
+    size_t task_ind = atomic_add(&tp->map_data_count, chunk_size);
+    if (task_ind >= tp->map_data_size) break;
+    if (task_ind + chunk_size > tp->map_data_size)
+      chunk_size = tp->map_data_size - task_ind;
+
+/* Call the mapper function. */
+#ifdef SWIFT_DEBUG_THREADPOOL
+    ticks tic = getticks();
+#endif
+    tp->map_function((char *)tp->map_data + (tp->map_data_stride * task_ind),
+                     chunk_size, tp->map_extra_data);
+#ifdef SWIFT_DEBUG_THREADPOOL
+    threadpool_log(tp, tid, chunk_size, tic, getticks());
+#endif
+  }
+}
 
 void *threadpool_runner(void *data) {
 
@@ -43,39 +169,17 @@ void *threadpool_runner(void *data) {
   while (1) {
 
     /* Let the controller know that this thread is waiting. */
-    pthread_mutex_lock(&tp->thread_mutex);
-    tp->num_threads_waiting += 1;
-    if (tp->num_threads_waiting == tp->num_threads) {
-      pthread_cond_signal(&tp->control_cond);
-    }
+    pthread_barrier_wait(&tp->wait_barrier);
 
     /* Wait for the controller. */
-    pthread_cond_wait(&tp->thread_cond, &tp->thread_mutex);
-    tp->num_threads_waiting -= 1;
-    tp->num_threads_running += 1;
-    if (tp->num_threads_running == tp->num_threads) {
-      pthread_cond_signal(&tp->control_cond);
-    }
-    pthread_mutex_unlock(&tp->thread_mutex);
-
-    /* The index of the mapping task we will work on next. */
-    while (1) {
-      /* Desired chunk size. */
-      size_t chunk_size =
-          (tp->map_data_size - tp->map_data_count) / (2 * tp->num_threads);
-      if (chunk_size > tp->map_data_chunk) chunk_size = tp->map_data_chunk;
-      if (chunk_size < 1) chunk_size = 1;
-
-      /* Get a chunk and check its size. */
-      size_t task_ind = atomic_add(&tp->map_data_count, chunk_size);
-      if (task_ind >= tp->map_data_size) break;
-      if (task_ind + chunk_size > tp->map_data_size)
-        chunk_size = tp->map_data_size - task_ind;
-
-      /* Call the mapper function. */
-      tp->map_function((char *)tp->map_data + (tp->map_data_stride * task_ind),
-                       chunk_size, tp->map_extra_data);
-    }
+    pthread_barrier_wait(&tp->run_barrier);
+
+    /* If no map function is specified, just die. We use this as a mechanism
+       to shut down threads without leaving the barriers in an invalid state. */
+    if (tp->map_function == NULL) pthread_exit(NULL);
+
+    /* Do actual work. */
+    threadpool_chomp(tp, atomic_inc(&tp->num_threads_running));
   }
 }
 
@@ -89,18 +193,28 @@ void threadpool_init(struct threadpool *tp, int num_threads) {
 
   /* Initialize the thread counters. */
   tp->num_threads = num_threads;
-  tp->num_threads_waiting = 0;
+
+#ifdef SWIFT_DEBUG_THREADPOOL
+  if ((tp->logs = (struct mapper_log *)malloc(sizeof(struct mapper_log) *
+                                              num_threads)) == NULL)
+    error("Failed to allocate mapper logs.");
+  for (int k = 0; k < num_threads; k++) {
+    tp->logs[k].size = threadpool_log_initial_size;
+    tp->logs[k].count = 0;
+    if ((tp->logs[k].log = (struct mapper_log_entry *)malloc(
+             sizeof(struct mapper_log_entry) * tp->logs[k].size)) == NULL)
+      error("Failed to allocate mapper log.");
+  }
+#endif
 
   /* If there is only a single thread, do nothing more as of here as
      we will just do work in the (blocked) calling thread. */
   if (num_threads == 1) return;
 
-  /* Init the threadpool mutexes. */
-  if (pthread_mutex_init(&tp->thread_mutex, NULL) != 0)
-    error("Failed to initialize mutexex.");
-  if (pthread_cond_init(&tp->control_cond, NULL) != 0 ||
-      pthread_cond_init(&tp->thread_cond, NULL) != 0)
-    error("Failed to initialize condition variables.");
+  /* Init the barriers. */
+  if (pthread_barrier_init(&tp->wait_barrier, NULL, num_threads) != 0 ||
+      pthread_barrier_init(&tp->run_barrier, NULL, num_threads) != 0)
+    error("Failed to initialize barriers.");
 
   /* Set the task counter to zero. */
   tp->map_data_size = 0;
@@ -109,24 +223,21 @@ void threadpool_init(struct threadpool *tp, int num_threads) {
   tp->map_data_chunk = 0;
   tp->map_function = NULL;
 
-  /* Allocate the threads. */
-  if ((tp->threads = (pthread_t *)malloc(sizeof(pthread_t) * num_threads)) ==
-      NULL) {
+  /* Allocate the threads, one less than requested since the calling thread
+     works as well. */
+  if ((tp->threads = (pthread_t *)malloc(sizeof(pthread_t) *
+                                         (num_threads - 1))) == NULL) {
     error("Failed to allocate thread array.");
   }
 
   /* Create and start the threads. */
-  pthread_mutex_lock(&tp->thread_mutex);
-  for (int k = 0; k < num_threads; k++) {
+  for (int k = 0; k < num_threads - 1; k++) {
     if (pthread_create(&tp->threads[k], NULL, &threadpool_runner, tp) != 0)
       error("Failed to create threadpool runner thread.");
   }
 
   /* Wait for all the threads to be up and running. */
-  while (tp->num_threads_waiting < tp->num_threads) {
-    pthread_cond_wait(&tp->control_cond, &tp->thread_mutex);
-  }
-  pthread_mutex_unlock(&tp->thread_mutex);
+  pthread_barrier_wait(&tp->wait_barrier);
 }
 
 /**
@@ -140,7 +251,8 @@ void threadpool_init(struct threadpool *tp, int num_threads) {
  * @param map_data The data on which the mapping function will be called.
  * @param N Number of elements in @c map_data.
  * @param stride Size, in bytes, of each element of @c map_data.
- * @param chunk Number of map data elements to pass to the function at a time.
+ * @param chunk Number of map data elements to pass to the function at a time,
+ *        or zero to choose the number automatically.
  * @param extra_data Addtitional pointer that will be passed to the mapping
  *        function, may contain additional data.
  */
@@ -148,37 +260,86 @@ void threadpool_map(struct threadpool *tp, threadpool_map_function map_function,
                     void *map_data, size_t N, int stride, int chunk,
                     void *extra_data) {
 
+#ifdef SWIFT_DEBUG_THREADPOOL
+  ticks tic = getticks();
+#endif
+
   /* If we just have a single thread, call the map function directly. */
   if (tp->num_threads == 1) {
     map_function(map_data, N, extra_data);
+#ifdef SWIFT_DEBUG_THREADPOOL
+    tp->map_function = map_function;
+    threadpool_log(tp, 0, N, tic, getticks());
+#endif
     return;
   }
 
   /* Set the map data and signal the threads. */
-  pthread_mutex_lock(&tp->thread_mutex);
   tp->map_data_stride = stride;
   tp->map_data_size = N;
   tp->map_data_count = 0;
-  tp->map_data_chunk = chunk;
+  tp->map_data_chunk =
+      chunk ? chunk
+            : max((int)(N / (tp->num_threads * threadpool_default_chunk_ratio)),
+                  1);
   tp->map_function = map_function;
   tp->map_data = map_data;
   tp->map_extra_data = extra_data;
   tp->num_threads_running = 0;
-  pthread_cond_broadcast(&tp->thread_cond);
 
   /* Wait for all the threads to be up and running. */
-  while (tp->num_threads_running < tp->num_threads) {
-    pthread_cond_wait(&tp->control_cond, &tp->thread_mutex);
-  }
+  pthread_barrier_wait(&tp->run_barrier);
+
+  /* Do some work while I'm at it. */
+  threadpool_chomp(tp, tp->num_threads - 1);
 
   /* Wait for all threads to be done. */
-  while (tp->num_threads_waiting < tp->num_threads) {
-    pthread_cond_wait(&tp->control_cond, &tp->thread_mutex);
-  }
-  pthread_mutex_unlock(&tp->thread_mutex);
+  pthread_barrier_wait(&tp->wait_barrier);
+
+#ifdef SWIFT_DEBUG_THREADPOOL
+  /* Log the total call time to thread id -1. */
+  threadpool_log(tp, -1, N, tic, getticks());
+#endif
 }
 
+/**
+ * @brief Re-sets the log for this #threadpool.
+ */
+#ifdef SWIFT_DEBUG_THREADPOOL
+void threadpool_reset_log(struct threadpool *tp) {
+  for (int k = 0; k < tp->num_threads; k++) tp->logs[k].count = 0;
+}
+#endif
+
 /**
  * @brief Frees up the memory allocated for this #threadpool.
  */
-void threadpool_clean(struct threadpool *tp) { free(tp->threads); }
+void threadpool_clean(struct threadpool *tp) {
+
+  if (tp->num_threads > 1) {
+    /* Destroy the runner threads by calling them with a NULL mapper function
+     * and waiting for all the threads to terminate. This ensures that no
+     * thread is still waiting at a barrier. */
+    tp->map_function = NULL;
+    pthread_barrier_wait(&tp->run_barrier);
+    for (int k = 0; k < tp->num_threads - 1; k++) {
+      void *retval;
+      pthread_join(tp->threads[k], &retval);
+    }
+
+    /* Release the barriers. */
+    if (pthread_barrier_destroy(&tp->wait_barrier) != 0 ||
+        pthread_barrier_destroy(&tp->run_barrier) != 0)
+      error("Failed to destroy threadpool barriers.");
+
+    /* Clean up memory. */
+    free(tp->threads);
+  }
+
+#ifdef SWIFT_DEBUG_THREADPOOL
+  for (int k = 0; k < tp->num_threads; k++) {
+    free(tp->logs[k].log);
+  }
+  free(tp->logs);
+#endif
+}
diff --git a/src/threadpool.h b/src/threadpool.h
index f9c7eeffb700adc579ec05902193b888cdd6363d..019403f658a22d36c4a6e1ec1ae1fdc47c62658d 100644
--- a/src/threadpool.h
+++ b/src/threadpool.h
@@ -25,10 +25,44 @@
 /* Some standard headers. */
 #include <pthread.h>
 
+/* Local includes. */
+#include "cycle.h"
+
+/* Local defines. */
+#define threadpool_log_initial_size 1000
+#define threadpool_default_chunk_ratio 7
+
 /* Function type for mappings. */
 typedef void (*threadpool_map_function)(void *map_data, int num_elements,
                                         void *extra_data);
 
+/* Data for threadpool logging. */
+struct mapper_log_entry {
+
+  /* ID of the thread executing the chunk. */
+  int tid;
+
+  /* Size of the chunk processed. */
+  int chunk_size;
+
+  /* Pointer to the mapper function. */
+  threadpool_map_function map_function;
+
+  /* Start and end time of processing this chunk. */
+  ticks tic, toc;
+};
+
+struct mapper_log {
+  /* Log of threadpool mapper calls. */
+  struct mapper_log_entry *log;
+
+  /* Size of the allocated log. */
+  int size;
+
+  /* Number of entries in the log. */
+  int count;
+};
+
 /* Data of a threadpool. */
 struct threadpool {
 
@@ -36,8 +70,8 @@ struct threadpool {
   pthread_t *threads;
 
   /* This is where threads go to rest. */
-  pthread_mutex_t thread_mutex;
-  pthread_cond_t control_cond, thread_cond;
+  pthread_barrier_t wait_barrier;
+  pthread_barrier_t run_barrier;
 
   /* Current map data and count. */
   void *map_data, *map_extra_data;
@@ -49,7 +83,11 @@ struct threadpool {
   int num_threads;
 
   /* Counter for the number of threads that are done. */
-  volatile int num_threads_waiting, num_threads_running;
+  volatile int num_threads_running;
+
+#ifdef SWIFT_DEBUG_THREADPOOL
+  struct mapper_log *logs;
+#endif
 };
 
 /* Function prototypes. */
@@ -58,5 +96,10 @@ void threadpool_map(struct threadpool *tp, threadpool_map_function map_function,
                     void *map_data, size_t N, int stride, int chunk,
                     void *extra_data);
 void threadpool_clean(struct threadpool *tp);
+#ifdef SWIFT_DEBUG_THREADPOOL
+void threadpool_reset_log(struct threadpool *tp);
+void threadpool_dump_log(struct threadpool *tp, const char *filename,
+                         int reset);
+#endif
 
 #endif /* SWIFT_THREADPOOL_H */
diff --git a/src/tools.c b/src/tools.c
index 73684c82662870d368f7dd360c84635654f06434..7d69ebc6c476312081d8a8c34c76c6592da5cab0 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -32,11 +32,13 @@
 #include "tools.h"
 
 /* Local includes. */
+#include "active.h"
 #include "cell.h"
 #include "error.h"
 #include "gravity.h"
 #include "hydro.h"
 #include "part.h"
+#include "periodic.h"
 #include "runner.h"
 
 /**
@@ -181,6 +183,8 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
 
   float r2, hi, hj, hig2, hjg2, dx[3];
   struct part *pi, *pj;
+  const double dim[3] = {r->e->s->dim[0], r->e->s->dim[1], r->e->s->dim[2]};
+  const struct engine *e = r->e;
 
   /* Implements a double-for loop and checks every interaction */
   for (int i = 0; i < ci->count; ++i) {
@@ -189,6 +193,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
     hi = pi->h;
     hig2 = hi * hi * kernel_gamma2;
 
+    /* Skip inactive particles. */
+    if (!part_is_active(pi, e)) continue;
+
     for (int j = 0; j < cj->count; ++j) {
 
       pj = &cj->parts[j];
@@ -197,6 +204,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
       r2 = 0.0f;
       for (int k = 0; k < 3; k++) {
         dx[k] = ci->parts[i].x[k] - cj->parts[j].x[k];
+        dx[k] = nearest(dx[k], dim[k]);
         r2 += dx[k] * dx[k];
       }
 
@@ -216,6 +224,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
     hj = pj->h;
     hjg2 = hj * hj * kernel_gamma2;
 
+    /* Skip inactive particles. */
+    if (!part_is_active(pj, e)) continue;
+
     for (int i = 0; i < ci->count; ++i) {
 
       pi = &ci->parts[i];
@@ -224,6 +235,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
       r2 = 0.0f;
       for (int k = 0; k < 3; k++) {
         dx[k] = cj->parts[j].x[k] - ci->parts[i].x[k];
+        dx[k] = nearest(dx[k], dim[k]);
         r2 += dx[k] * dx[k];
       }
 
@@ -241,6 +253,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) {
 
   float r2, hi, hj, hig2, hjg2, dx[3];
   struct part *pi, *pj;
+  const double dim[3] = {r->e->s->dim[0], r->e->s->dim[1], r->e->s->dim[2]};
 
   /* Implements a double-for loop and checks every interaction */
   for (int i = 0; i < ci->count; ++i) {
@@ -259,6 +272,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) {
       r2 = 0.0f;
       for (int k = 0; k < 3; k++) {
         dx[k] = ci->parts[i].x[k] - cj->parts[j].x[k];
+        dx[k] = nearest(dx[k], dim[k]);
         r2 += dx[k] * dx[k];
       }
 
@@ -288,6 +302,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) {
       r2 = 0.0f;
       for (int k = 0; k < 3; k++) {
         dx[k] = cj->parts[j].x[k] - ci->parts[i].x[k];
+        dx[k] = nearest(dx[k], dim[k]);
         r2 += dx[k] * dx[k];
       }
 
@@ -304,6 +319,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) {
 void self_all_density(struct runner *r, struct cell *ci) {
   float r2, hi, hj, hig2, hjg2, dxi[3];  //, dxj[3];
   struct part *pi, *pj;
+  const struct engine *e = r->e;
 
   /* Implements a double-for loop and checks every interaction */
   for (int i = 0; i < ci->count; ++i) {
@@ -328,14 +344,14 @@ void self_all_density(struct runner *r, struct cell *ci) {
       }
 
       /* Hit or miss? */
-      if (r2 < hig2) {
+      if (r2 < hig2 && part_is_active(pi, e)) {
 
         /* Interact */
         runner_iact_nonsym_density(r2, dxi, hi, hj, pi, pj);
       }
 
       /* Hit or miss? */
-      if (r2 < hjg2) {
+      if (r2 < hjg2 && part_is_active(pj, e)) {
 
         dxi[0] = -dxi[0];
         dxi[1] = -dxi[1];
@@ -423,7 +439,7 @@ void pairs_single_grav(double *dim, long long int pid,
       fdx[i] = dx[i];
     }
     r2 = fdx[0] * fdx[0] + fdx[1] * fdx[1] + fdx[2] * fdx[2];
-    runner_iact_grav_pp(0.f, r2, fdx, &pi, &pj);
+    runner_iact_grav_pp(r2, fdx, &pi, &pj);
     a[0] += pi.a_grav[0];
     a[1] += pi.a_grav[1];
     a[2] += pi.a_grav[2];
@@ -748,7 +764,7 @@ void gravity_n2(struct gpart *gparts, const int gcount,
                 const struct gravity_props *gravity_properties, float rlr) {
 
   const float rlr_inv = 1. / rlr;
-  const float r_cut = gravity_properties->r_cut;
+  const float r_cut = gravity_properties->r_cut_max;
   const float max_d = r_cut * rlr;
   const float max_d2 = max_d * max_d;
 
@@ -783,7 +799,7 @@ void gravity_n2(struct gpart *gparts, const int gcount,
       if (r2 < max_d2 || 1) {
 
         /* Apply the gravitational acceleration. */
-        runner_iact_grav_pp(rlr_inv, r2, dx, gpi, gpj);
+        runner_iact_grav_pp(r2, dx, gpi, gpj);
       }
     }
   }
diff --git a/src/vector.h b/src/vector.h
index 48b9af924b64219f6e7d85292b23a87c348f9ea4..6a7c6837989025785c1f9134004f2ebcc226a205 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -23,8 +23,12 @@
 /* Have I already read this file? */
 #ifndef VEC_MACRO
 
+/* Config parameters. */
 #include "../config.h"
 
+/* Local headers */
+#include "inline.h"
+
 #ifdef WITH_VECTORIZATION
 
 /* Need to check whether compiler supports this (IBM does not)
@@ -64,7 +68,9 @@
 #define vec_sub(a, b) _mm512_sub_ps(a, b)
 #define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b)
 #define vec_mul(a, b) _mm512_mul_ps(a, b)
+#define vec_div(a, b) _mm512_div_ps(a, b)
 #define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
+#define vec_fnma(a, b, c) _mm512_fnmadd_ps(a, b, c)
 #define vec_sqrt(a) _mm512_sqrt_ps(a)
 #define vec_rcp(a) _mm512_rcp14_ps(a)
 #define vec_rsqrt(a) _mm512_rsqrt14_ps(a)
@@ -77,15 +83,16 @@
 #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
 #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
 #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
-#define vec_cmp_result(a) a
-#define vec_form_int_mask(a) a
+#define vec_cmp_result(a) ({ a; })
+#define vec_form_int_mask(a) ({ a; })
 #define vec_and(a, b) _mm512_and_ps(a, b)
-#define vec_mask_and(a, b) a &b
-#define vec_and_mask(a, mask) _mm512_maskz_expand_ps(mask, a)
-#define vec_init_mask(mask) mask = 0xFFFF
-#define vec_zero_mask(mask) mask = 0
-#define vec_create_mask(mask, cond) mask = cond
-#define vec_pad_mask(mask, pad) mask = mask >> (pad)
+#define vec_mask_and(a, b) _mm512_kand(a, b)
+#define vec_and_mask(a, mask) _mm512_maskz_mov_ps(mask, a)
+#define vec_init_mask_true(mask) ({ mask = 0xFFFF; })
+#define vec_zero_mask(mask) ({ mask = 0; })
+#define vec_create_mask(mask, cond) ({ mask = cond; })
+#define vec_pad_mask(mask, pad) ({ mask = mask >> (pad); })
+#define vec_blend(mask, a, b) _mm512_mask_blend_ps(mask, a, b)
 #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
@@ -159,6 +166,7 @@
 #define vec_sub(a, b) _mm256_sub_ps(a, b)
 #define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask.v))
 #define vec_mul(a, b) _mm256_mul_ps(a, b)
+#define vec_div(a, b) _mm256_div_ps(a, b)
 #define vec_sqrt(a) _mm256_sqrt_ps(a)
 #define vec_rcp(a) _mm256_rcp_ps(a)
 #define vec_rsqrt(a) _mm256_rsqrt_ps(a)
@@ -176,11 +184,12 @@
 #define vec_and(a, b) _mm256_and_ps(a, b)
 #define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v)
 #define vec_and_mask(a, mask) _mm256_and_ps(a, mask.v)
-#define vec_init_mask(mask) mask.m = vec_setint1(0xFFFFFFFF)
+#define vec_init_mask_true(mask) mask.m = vec_setint1(0xFFFFFFFF)
 #define vec_create_mask(mask, cond) mask.v = cond
 #define vec_zero_mask(mask) mask.v = vec_setzero()
 #define vec_pad_mask(mask, pad) \
   for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0
+#define vec_blend(mask, a, b) _mm256_blendv_ps(a, b, mask.v)
 #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
@@ -222,6 +231,7 @@
 /* Check if we have AVX2 intrinsics alongside AVX */
 #ifdef HAVE_AVX2
 #define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c)
+#define vec_fnma(a, b, c) _mm256_fnmadd_ps(a, b, c)
 
 /* Used in VEC_FORM_PACKED_MASK */
 #define identity_indices 0x0706050403020100
@@ -250,6 +260,11 @@
 #define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
 #endif
 
+/* Create a negated FMA using vec_sub and vec_mul if AVX2 is not present. */
+#ifndef vec_fnma
+#define vec_fnma(a, b, c) vec_sub(c, vec_mul(a, b))
+#endif
+
 /* Form a packed mask without intrinsics if AVX2 is not present. */
 #ifndef VEC_FORM_PACKED_MASK
 
@@ -313,6 +328,7 @@
 #define vec_add(a, b) _mm_add_ps(a, b)
 #define vec_sub(a, b) _mm_sub_ps(a, b)
 #define vec_mul(a, b) _mm_mul_ps(a, b)
+#define vec_div(a, b) _mm_div_ps(a, b)
 #define vec_sqrt(a) _mm_sqrt_ps(a)
 #define vec_rcp(a) _mm_rcp_ps(a)
 #define vec_rsqrt(a) _mm_rsqrt_ps(a)
diff --git a/src/version.c b/src/version.c
index 54a416f6b0745a523382f338fa838018e5254b1e..46c31103c953ce2ff70b9e346f88470008dd8266 100644
--- a/src/version.c
+++ b/src/version.c
@@ -142,10 +142,7 @@ const char *configuration_options(void) {
   static int initialised = 0;
   static const char *config = SWIFT_CONFIG_FLAGS;
   if (!initialised) {
-    if (strlen(config) < 1024 - 2)
-      sprintf(buf, "'%s'", config);
-    else
-      error("SWIFT_CONFIG_FLAGS string longer than buffer");
+    snprintf(buf, 1024, "'%s'", config);
     initialised = 1;
   }
   return buf;
@@ -161,10 +158,7 @@ const char *compilation_cflags(void) {
   static int initialised = 0;
   static const char *cflags = SWIFT_CFLAGS;
   if (!initialised) {
-    if (strlen(cflags) < 1024 - 2)
-      sprintf(buf, "'%s'", cflags);
-    else
-      error("SWIFT_CFLAGS string longer than buffer");
+    snprintf(buf, 1024, "'%s'", cflags);
     initialised = 1;
   }
   return buf;
diff --git a/src/xmf.c b/src/xmf.c
index ca4ffe5157599dd5a45295dcfa59f9420753f5cf..67682b4794ade773c39a748eddf765e392c74865 100644
--- a/src/xmf.c
+++ b/src/xmf.c
@@ -1,6 +1,7 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2017 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ *                    Peter W. Draper   (p.w.draper@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -21,7 +22,9 @@
 #include "../config.h"
 
 /* Some standard headers. */
+#include <libgen.h>
 #include <stdio.h>
+#include <string.h>
 
 /* This object's header. */
 #include "xmf.h"
@@ -30,6 +33,21 @@
 #include "common_io.h"
 #include "error.h"
 
+/**
+ * @brief Return the basename of an HDF5 path.
+ *
+ * Need the basename as XML paths are relative to the container, and the XMF
+ * file is written with the same basename as the HDF5 snapshots.
+ *
+ * @param hdfFileName The HDF5 file name whose basename is required.
+ * @return the basename part of hdfFileName.
+ */
+static const char* xmf_basename(const char* hdfFileName) {
+  static char buffer[FILENAME_BUFFER_SIZE];
+  strcpy(buffer, hdfFileName);
+  return basename(buffer);
+}
+
 /**
  * @brief Prepare the XMF file corresponding to a snapshot.
  *
@@ -135,7 +153,7 @@ void xmf_write_outputfooter(FILE* xmfFile, int output, float time) {
   /* Write end of the section of this time step */
 
   fprintf(xmfFile,
-          "\n</Grid> <!-- End of meta-data for output=%03i, time=%f -->\n",
+          "\n</Grid> <!-- End of meta-data for output=%04i, time=%f -->\n",
           output, time);
   fprintf(xmfFile, "\n</Grid> <!-- timeSeries -->\n");
   fprintf(xmfFile, "</Domain>\n");
@@ -154,6 +172,7 @@ void xmf_write_outputfooter(FILE* xmfFile, int output, float time) {
  */
 void xmf_write_groupheader(FILE* xmfFile, char* hdfFileName, size_t N,
                            enum part_type ptype) {
+
   fprintf(xmfFile, "\n<Grid Name=\"%s\" GridType=\"Uniform\">\n",
           part_type_names[ptype]);
   fprintf(xmfFile,
@@ -163,7 +182,7 @@ void xmf_write_groupheader(FILE* xmfFile, char* hdfFileName, size_t N,
           "<DataItem Dimensions=\"%zu 3\" NumberType=\"Double\" "
           "Precision=\"8\" "
           "Format=\"HDF\">%s:/PartType%d/Coordinates</DataItem>\n",
-          N, hdfFileName, (int)ptype);
+          N, xmf_basename(hdfFileName), (int)ptype);
   fprintf(xmfFile,
           "</Geometry>\n <!-- Done geometry for %s, start of particle fields "
           "list -->\n",
@@ -251,13 +270,13 @@ void xmf_write_line(FILE* xmfFile, const char* fileName,
     fprintf(xmfFile,
             "<DataItem Dimensions=\"%zu\" NumberType=\"%s\" "
             "Precision=\"%d\" Format=\"HDF\">%s:%s/%s</DataItem>\n",
-            N, xmf_type(type), xmf_precision(type), fileName, partTypeGroupName,
-            name);
+            N, xmf_type(type), xmf_precision(type), xmf_basename(fileName),
+            partTypeGroupName, name);
   else
     fprintf(xmfFile,
             "<DataItem Dimensions=\"%zu %d\" NumberType=\"%s\" "
             "Precision=\"%d\" Format=\"HDF\">%s:%s/%s</DataItem>\n",
-            N, dim, xmf_type(type), xmf_precision(type), fileName,
+            N, dim, xmf_type(type), xmf_precision(type), xmf_basename(fileName),
             partTypeGroupName, name);
   fprintf(xmfFile, "</Attribute>\n");
 }
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 7c45ead22f77da7e0aa53e03051c7351cc97f550..9cd6e9ab9e09935d39bf416dfbb65b83a874b382 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -21,20 +21,24 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(FFTW_LIBS
 
 # List of programs and scripts to run in the test suite
 TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetry \
-        testPair.sh testPairPerturbed.sh test27cells.sh test27cellsPerturbed.sh  \
+        testActivePair.sh test27cells.sh test27cellsPerturbed.sh  \
         testParser.sh testSPHStep test125cells.sh test125cellsPerturbed.sh testFFT \
         testAdiabaticIndex testRiemannExact testRiemannTRRS testRiemannHLLC \
-        testMatrixInversion testThreadpool testDump testLogger \
-        testVoronoi1D testVoronoi2D testVoronoi3D
+        testMatrixInversion testThreadpool testDump testLogger testInteractions.sh \
+        testVoronoi1D testVoronoi2D testVoronoi3D \
+	testPeriodicBC.sh testPeriodicBCPerturbed.sh
 
 # List of test programs to compile
 check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \
-		 testSPHStep testPair test27cells test125cells testParser \
+		 testSPHStep testActivePair test27cells test125cells testParser \
                  testKernel testFFT testInteractions testMaths \
                  testSymmetry testThreadpool benchmarkInteractions \
                  testAdiabaticIndex testRiemannExact testRiemannTRRS \
                  testRiemannHLLC testMatrixInversion testDump testLogger \
-		 testVoronoi1D testVoronoi2D testVoronoi3D
+		 testVoronoi1D testVoronoi2D testVoronoi3D testPeriodicBC
+
+# Rebuild tests when SWIFT is updated.
+$(check_PROGRAMS): ../src/.libs/libswiftsim.a
 
 # Sources for the individual programs
 testGreetings_SOURCES = testGreetings.c
@@ -51,10 +55,12 @@ testSPHStep_SOURCES = testSPHStep.c
 
 testSingle_SOURCES = testSingle.c
 
-testPair_SOURCES = testPair.c
+testActivePair_SOURCES = testActivePair.c
 
 test27cells_SOURCES = test27cells.c
 
+testPeriodicBC_SOURCES = testPeriodicBC.c
+
 test125cells_SOURCES = test125cells.c
 
 testParser_SOURCES = testParser.c
@@ -90,10 +96,10 @@ testDump_SOURCES = testDump.c
 testLogger_SOURCES = testLogger.c
 
 # Files necessary for distribution
-EXTRA_DIST = testReading.sh makeInput.py testPair.sh testPairPerturbed.sh \
-	     test27cells.sh test27cellsPerturbed.sh testParser.sh \
-	     test125cells.sh test125cellsPerturbed.sh testParserInput.yaml difffloat.py \
-	     tolerance_125_normal.dat tolerance_125_perturbed.dat \
-             tolerance_27_normal.dat tolerance_27_perturbed.dat \
-	     tolerance_pair_normal.dat tolerance_pair_perturbed.dat \
-	     fft_params.yml
+EXTRA_DIST = testReading.sh makeInput.py testActivePair.sh \
+	     test27cells.sh test27cellsPerturbed.sh testParser.sh testPeriodicBC.sh \
+	     testPeriodicBCPerturbed.sh test125cells.sh test125cellsPerturbed.sh testParserInput.yaml \
+	     difffloat.py tolerance_125_normal.dat tolerance_125_perturbed.dat \
+             tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat tolerance_27_perturbed_h2.dat \
+	     tolerance_testInteractions.dat tolerance_pair_active.dat \
+	     fft_params.yml tolerance_periodic_BC_normal.dat tolerance_periodic_BC_perturbed.dat
diff --git a/tests/benchmarkInteractions.c b/tests/benchmarkInteractions.c
index ec3710e05e0151cdff13f2205bcd06bda45a34be..2cc1f830f9827a4805d8f201294e20e8334f4b09 100644
--- a/tests/benchmarkInteractions.c
+++ b/tests/benchmarkInteractions.c
@@ -31,6 +31,7 @@
 #define IACT runner_iact_nonsym_density
 #define IACT_VEC runner_iact_nonsym_2_vec_density
 #define IACT_NAME "test_nonsym_density"
+#define NUM_VEC_PROC_INT 2
 #endif
 
 #ifdef SYM_DENSITY
@@ -53,8 +54,9 @@
 
 #ifndef IACT
 #define IACT runner_iact_nonsym_density
-#define IACT_VEC runner_iact_nonsym_2_vec_density
+#define IACT_VEC runner_iact_nonsym_1_vec_density
 #define IACT_NAME "test_nonsym_density"
+#define NUM_VEC_PROC_INT 1
 #endif
 
 /**
@@ -125,7 +127,7 @@ struct part *make_particles(size_t count, double *offset, double spacing,
  */
 void prepare_force(struct part *parts, size_t count) {
 
-#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH)
+#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) && !defined(MINIMAL_SPH)
   struct part *p;
   for (size_t i = 0; i < count; ++i) {
     p = &parts[i];
@@ -389,19 +391,35 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
 
     hi_inv_vec = vec_reciprocal(hi_vec);
 
-    mask_t mask, mask2;
-    vec_init_mask(mask);
-    vec_init_mask(mask2);
-
+    mask_t mask;
+    vec_init_mask_true(mask);
+#if (NUM_VEC_PROC_INT == 2)
+    mask_t mask2;
+    vec_init_mask_true(mask2);
+#endif
     const ticks vec_tic = getticks();
 
-    for (size_t i = 0; i < count; i += 2 * VEC_SIZE) {
+    for (size_t i = 0; i < count; i += NUM_VEC_PROC_INT * VEC_SIZE) {
 
+/* Interleave two vectors for interaction. */
+#if (NUM_VEC_PROC_INT == 2)
       IACT_VEC(&(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec),
                (vix_vec), (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]),
                &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, &wcountSum,
                &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum,
                mask, mask2, 0);
+#else /* Only use one vector for interaction. */
+      vector r2, dx, dy, dz;
+      r2.v = vec_load(&(r2q[i]));
+      dx.v = vec_load(&(dxq[i]));
+      dy.v = vec_load(&(dyq[i]));
+      dz.v = vec_load(&(dzq[i]));
+
+      IACT_VEC(&r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec),
+               (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]),
+               &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
+               &curlvxSum, &curlvySum, &curlvzSum, mask);
+#endif
     }
 
     VEC_HADD(rhoSum, piq[0]->rho);
diff --git a/tests/difffloat.py b/tests/difffloat.py
index 0bdc706a1c44ee6c42c54ad37e93f634742e06bc..ddcf7bcb29758afa3429dea8bcf50e1c5c0477dc 100644
--- a/tests/difffloat.py
+++ b/tests/difffloat.py
@@ -42,11 +42,12 @@ if len(sys.argv) >= 4:
 if len(sys.argv) >= 5:
     number_to_check = int(sys.argv[4])
 
-if len(sys.argv) == 6:
-    ignoreSmallRhoDh = int(sys.argv[5])
-else:
-    ignoreSmallRhoDh = 0
-    
+# Get the particle properties being compared from the header.
+with open(file1, 'r') as f:
+  line = f.readline()
+  if 'ID' in line:
+    part_props = line.split()[1:]
+
 data1 = loadtxt(file1)
 data2 = loadtxt(file2)
 if fileTol != "":
@@ -63,7 +64,7 @@ n_lines = shape(data1)[0]
 n_columns = shape(data1)[1]
 
 if fileTol != "":
-    if n_linesTol != 2:
+    if n_linesTol != 3:
         print "Incorrect number of lines in tolerance file '%s'."%fileTol
     if n_columnsTol != n_columns:
         print "Incorrect number of columns in tolerance file '%s'."%fileTol
@@ -73,10 +74,12 @@ if fileTol == "":
     print "Relative difference tolerance:", rel_tol
     absTol = ones(n_columns) * abs_tol
     relTol = ones(n_columns) * rel_tol
+    limTol = zeros(n_columns)
 else:
     print "Tolerances read from file"
     absTol = dataTol[0,:]
     relTol = dataTol[1,:]
+    limTol = dataTol[2,:]
 
 n_lines_to_check = 0
 if number_to_check > 0:
@@ -100,20 +103,17 @@ for i in range(n_lines_to_check):
             rel_diff = 0.
 
         if( abs_diff > 1.1*absTol[j]):
-            print "Absolute difference larger than tolerance (%e) for particle %d, column %d:"%(absTol[j], i,j)
+            print "Absolute difference larger than tolerance (%e) for particle %d, column %s:"%(absTol[j], data1[i,0], part_props[j])
             print "%10s:           a = %e"%("File 1", data1[i,j])
             print "%10s:           b = %e"%("File 2", data2[i,j])
             print "%10s:       |a-b| = %e"%("Difference", abs_diff)
             print ""
             error = True
 
-        if abs(data1[i,j]) < 4e-6 and abs(data2[i,j]) < 4e-6 : continue
+        if abs(data1[i,j]) + abs(data2[i,j]) < limTol[j] : continue
 
-        # Ignore pathological cases with rho_dh
-        if ignoreSmallRhoDh and j == 8 and abs(data1[i,j]) < 2e-4: continue
-        
         if( rel_diff > 1.1*relTol[j]):
-            print "Relative difference larger than tolerance (%e) for particle %d, column %d:"%(relTol[j], i,j)
+            print "Relative difference larger than tolerance (%e) for particle %d, column %s:"%(relTol[j], data1[i,0], part_props[j])
             print "%10s:           a = %e"%("File 1", data1[i,j])
             print "%10s:           b = %e"%("File 2", data2[i,j])
             print "%10s: |a-b|/|a+b| = %e"%("Difference", rel_diff)
diff --git a/tests/test125cells.c b/tests/test125cells.c
index 5cd8c82a3fb1850b34d157befa70ae75240c7012..023ce145846a30baf79a42877199e6a3028cd75c 100644
--- a/tests/test125cells.c
+++ b/tests/test125cells.c
@@ -349,13 +349,11 @@ struct cell *make_cell(size_t n, const double offset[3], double size, double h,
   cell->ti_old_part = 8;
   cell->ti_end_min = 8;
   cell->ti_end_max = 8;
-  cell->ti_sort = 0;
 
   // shuffle_particles(cell->parts, cell->count);
 
   cell->sorted = 0;
-  cell->sort = NULL;
-  cell->sortsize = 0;
+  for (int k = 0; k < 13; k++) cell->sort[k] = NULL;
 
   return cell;
 }
@@ -363,7 +361,8 @@ struct cell *make_cell(size_t n, const double offset[3], double size, double h,
 void clean_up(struct cell *ci) {
   free(ci->parts);
   free(ci->xparts);
-  free(ci->sort);
+  for (int k = 0; k < 13; k++)
+    if (ci->sort[k] != NULL) free(ci->sort[k]);
   free(ci);
 }
 
@@ -445,6 +444,8 @@ void dump_particle_fields(char *fileName, struct cell *main_cell,
 
 /* Just a forward declaration... */
 void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
+void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
+                                   struct cell *cj);
 void runner_doself1_density(struct runner *r, struct cell *ci);
 void runner_dopair2_force(struct runner *r, struct cell *ci, struct cell *cj);
 void runner_doself2_force(struct runner *r, struct cell *ci);
@@ -565,8 +566,8 @@ int main(int argc, char *argv[]) {
   prog_const.const_newton_G = 1.f;
 
   struct hydro_props hp;
-  hp.target_neighbours = pow_dimension(h) * kernel_norm;
-  hp.delta_neighbours = 4.;
+  hp.eta_neighbours = h;
+  hp.h_tolerance = 1e0;
   hp.h_max = FLT_MAX;
   hp.max_smoothing_iterations = 1;
   hp.CFL_condition = 0.1;
@@ -637,11 +638,20 @@ int main(int argc, char *argv[]) {
     }
 
     /* First, sort stuff */
-    for (int j = 0; j < 125; ++j) runner_do_sort(&runner, cells[j], 0x1FFF, 0);
+    for (int j = 0; j < 125; ++j)
+      runner_do_sort(&runner, cells[j], 0x1FFF, 0, 0);
 
 /* Do the density calculation */
 #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION))
 
+/* Initialise the particle cache. */
+#ifdef WITH_VECTORIZATION
+    runner.ci_cache.count = 0;
+    cache_init(&runner.ci_cache, 512);
+    runner.cj_cache.count = 0;
+    cache_init(&runner.cj_cache, 512);
+#endif
+
     /* Run all the pairs (only once !)*/
     for (int i = 0; i < 5; i++) {
       for (int j = 0; j < 5; j++) {
@@ -664,7 +674,7 @@ int main(int argc, char *argv[]) {
 
                 struct cell *cj = cells[iii * 25 + jjj * 5 + kkk];
 
-                if (cj > ci) runner_dopair1_density(&runner, ci, cj);
+                if (cj > ci) runner_dopair1_branch_density(&runner, ci, cj);
               }
             }
           }
diff --git a/tests/test27cells.c b/tests/test27cells.c
index a0f541d17100a13079580aabbef065fa5adbd5e1..7ba1eec9ad279f09f63021e332dac1cfd5cc1505 100644
--- a/tests/test27cells.c
+++ b/tests/test27cells.c
@@ -30,11 +30,9 @@
 /* Local headers. */
 #include "swift.h"
 
-#define ACC_THRESHOLD 1e-5
-
 #if defined(WITH_VECTORIZATION)
 #define DOSELF1 runner_doself1_density_vec
-#define DOPAIR1 runner_dopair1_density_vec
+#define DOPAIR1 runner_dopair1_branch_density
 #define DOSELF1_NAME "runner_doself1_density_vec"
 #define DOPAIR1_NAME "runner_dopair1_density_vec"
 #endif
@@ -45,7 +43,7 @@
 #endif
 
 #ifndef DOPAIR1
-#define DOPAIR1 runner_dopair1_density
+#define DOPAIR1 runner_dopair1_branch_density
 #define DOPAIR1_NAME "runner_dopair1_density"
 #endif
 
@@ -64,18 +62,20 @@ enum velocity_types {
  * @param offset The position of the cell offset from (0,0,0).
  * @param size The cell size.
  * @param h The smoothing length of the particles in units of the inter-particle
- *separation.
+ * separation.
  * @param density The density of the fluid.
  * @param partId The running counter of IDs.
  * @param pert The perturbation to apply to the particles in the cell in units
- *of the inter-particle separation.
+ * of the inter-particle separation.
  * @param vel The type of velocity field (0, random, divergent, rotating)
+ * @param h_pert The perturbation to apply to the smoothing length.
  */
 struct cell *make_cell(size_t n, double *offset, double size, double h,
                        double density, long long *partId, double pert,
-                       enum velocity_types vel) {
+                       enum velocity_types vel, double h_pert) {
   const size_t count = n * n * n;
   const double volume = size * size * size;
+  float h_max = 0.f;
   struct cell *cell = malloc(sizeof(struct cell));
   bzero(cell, sizeof(struct cell));
 
@@ -121,7 +121,11 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
             part->v[2] = 0.f;
             break;
         }
-        part->h = size * h / (float)n;
+        if (h_pert)
+          part->h = size * h * random_uniform(1.f, h_pert) / (float)n;
+        else
+          part->h = size * h / (float)n;
+        h_max = fmaxf(h_max, part->h);
         part->id = ++(*partId);
 
 #if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
@@ -156,7 +160,7 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
 
   /* Cell properties */
   cell->split = 0;
-  cell->h_max = h;
+  cell->h_max = h_max;
   cell->count = count;
   cell->dx_max_part = 0.;
   cell->dx_max_sort = 0.;
@@ -170,20 +174,19 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
   cell->ti_old_part = 8;
   cell->ti_end_min = 8;
   cell->ti_end_max = 8;
-  cell->ti_sort = 8;
 
   shuffle_particles(cell->parts, cell->count);
 
   cell->sorted = 0;
-  cell->sort = NULL;
-  cell->sortsize = 0;
+  for (int k = 0; k < 13; k++) cell->sort[k] = NULL;
 
   return cell;
 }
 
 void clean_up(struct cell *ci) {
   free(ci->parts);
-  free(ci->sort);
+  for (int k = 0; k < 13; k++)
+    if (ci->sort[k] != NULL) free(ci->sort[k]);
   free(ci);
 }
 
@@ -202,6 +205,10 @@ void zero_particle_fields(struct cell *c) {
 void end_calculation(struct cell *c) {
   for (int pid = 0; pid < c->count; pid++) {
     hydro_end_density(&c->parts[pid]);
+
+    /* Recover the common "Neighbour number" definition */
+    c->parts[pid].density.wcount *= pow_dimension(c->parts[pid].h);
+    c->parts[pid].density.wcount *= kernel_norm;
   }
 }
 
@@ -288,33 +295,11 @@ void dump_particle_fields(char *fileName, struct cell *main_cell,
   fclose(file);
 }
 
-/**
- * @brief Compares the vectorised result against
- * the serial result of the interaction.
- *
- * @param serial_parts Particle array that has been interacted serially
- * @param vec_parts Particle array to be interacted using vectors
- * @param count No. of particles that have been interacted
- * @param threshold Level of accuracy needed
- *
- * @return Non-zero value if difference found, 0 otherwise
- */
-int check_results(struct part *serial_parts, struct part *vec_parts, int count,
-                  double threshold) {
-  int result = 0;
-
-  for (int i = 0; i < count; i++)
-    result += compare_particles(serial_parts[i], vec_parts[i], threshold);
-
-  return result;
-}
-
 /* Just a forward declaration... */
 void runner_doself1_density(struct runner *r, struct cell *ci);
 void runner_doself1_density_vec(struct runner *r, struct cell *ci);
-void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
-void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
-                                struct cell *cj);
+void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
+                                   struct cell *cj);
 
 /* And go... */
 int main(int argc, char *argv[]) {
@@ -322,8 +307,7 @@ int main(int argc, char *argv[]) {
   engine_pin();
   size_t runs = 0, particles = 0;
   double h = 1.23485, size = 1., rho = 1.;
-  double perturbation = 0.;
-  double threshold = ACC_THRESHOLD;
+  double perturbation = 0., h_pert = 0.;
   char outputFileNameExtension[200] = "";
   char outputFileName[200] = "";
   enum velocity_types vel = velocity_zero;
@@ -339,11 +323,14 @@ int main(int argc, char *argv[]) {
   srand(0);
 
   char c;
-  while ((c = getopt(argc, argv, "m:s:h:n:r:t:d:f:v:a:")) != -1) {
+  while ((c = getopt(argc, argv, "m:s:h:p:n:r:t:d:f:v:")) != -1) {
     switch (c) {
       case 'h':
         sscanf(optarg, "%lf", &h);
         break;
+      case 'p':
+        sscanf(optarg, "%lf", &h_pert);
+        break;
       case 's':
         sscanf(optarg, "%lf", &size);
         break;
@@ -365,9 +352,6 @@ int main(int argc, char *argv[]) {
       case 'v':
         sscanf(optarg, "%d", (int *)&vel);
         break;
-      case 'a':
-        sscanf(optarg, "%lf", &threshold);
-        break;
       case '?':
         error("Unknown option.");
         break;
@@ -382,6 +366,7 @@ int main(int argc, char *argv[]) {
         "runner_doself1_density()."
         "\n\nOptions:"
         "\n-h DISTANCE=1.2348 - Smoothing length in units of <x>"
+        "\n-p                 - Random fractional change in h, h=h*random(1,p)"
         "\n-m rho             - Physical density in the cell"
         "\n-s size            - Physical size of the cell"
         "\n-d pert            - Perturbation to apply to the particles [0,1["
@@ -415,7 +400,11 @@ int main(int argc, char *argv[]) {
   space.dim[2] = 3.;
 
   struct hydro_props hp;
+  hp.eta_neighbours = h;
+  hp.h_tolerance = 1e0;
   hp.h_max = FLT_MAX;
+  hp.max_smoothing_iterations = 1;
+  hp.CFL_condition = 0.1;
 
   struct engine engine;
   engine.s = &space;
@@ -435,12 +424,13 @@ int main(int argc, char *argv[]) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 3; ++k) {
         double offset[3] = {i * size, j * size, k * size};
-        cells[i * 9 + j * 3 + k] = make_cell(particles, offset, size, h, rho,
-                                             &partId, perturbation, vel);
+        cells[i * 9 + j * 3 + k] =
+            make_cell(particles, offset, size, h, rho, &partId, perturbation,
+                      vel, h_pert);
 
         runner_do_drift_part(&runner, cells[i * 9 + j * 3 + k], 0);
 
-        runner_do_sort(&runner, cells[i * 9 + j * 3 + k], 0x1FFF, 0);
+        runner_do_sort(&runner, cells[i * 9 + j * 3 + k], 0x1FFF, 0, 0);
       }
     }
   }
@@ -504,10 +494,6 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  /* Store the vectorised particle results. */
-  struct part vec_parts[main_cell->count];
-  for (int i = 0; i < main_cell->count; i++) vec_parts[i] = main_cell->parts[i];
-
   /* Output timing */
   ticks corner_time = timings[0] + timings[2] + timings[6] + timings[8] +
                       timings[18] + timings[20] + timings[24] + timings[26];
@@ -552,10 +538,6 @@ int main(int argc, char *argv[]) {
   sprintf(outputFileName, "brute_force_27_%s.dat", outputFileNameExtension);
   dump_particle_fields(outputFileName, main_cell, cells);
 
-  /* Check serial results against the vectorised results. */
-  if (check_results(main_cell->parts, vec_parts, main_cell->count, threshold))
-    message("Differences found...");
-
   /* Output timing */
   message("Brute force calculation took : %15lli ticks.", toc - tic);
 
diff --git a/tests/test27cells.sh.in b/tests/test27cells.sh.in
index 4312ce55e13097d4ae40c289b9c5caa885ff37cc..059a7a208aa8e570ad5035fac16ffd201bf3dddd 100755
--- a/tests/test27cells.sh.in
+++ b/tests/test27cells.sh.in
@@ -1,13 +1,14 @@
 #!/bin/bash
 
+# Test for particles with the same smoothing length
 for v in {0..3}
 do
     echo ""
 	
     rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat
 
-    echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -a 1e-4"
-    ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -a 1e-4
+    echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v"
+    ./test27cells -n 6 -r 1 -d 0 -f standard -v $v
 
     if [ -e brute_force_27_standard.dat ]
     then
@@ -27,4 +28,60 @@ do
     
 done
 
+# Test for particles with random smoothing lengths
+for v in {0..3}
+do
+    echo ""
+	
+    rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat
+
+    echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.1"
+    ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.1
+
+    if [ -e brute_force_27_standard.dat ]
+    then
+	if python @srcdir@/difffloat.py brute_force_27_standard.dat swift_dopair_27_standard.dat @srcdir@/tolerance_27_perturbed_h.dat 6
+	then
+	    echo "Accuracy test passed"
+	else
+	    echo "Accuracy test failed"
+	    exit 1
+	fi
+    else
+	echo "Error Missing test output file"
+	exit 1
+    fi
+
+    echo "------------"
+    
+done
+
+# Test for particles with a larger range of random smoothing lengths
+for v in {0..3}
+do
+    echo ""
+	
+    rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat
+
+    echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3"
+    ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3
+
+    if [ -e brute_force_27_standard.dat ]
+    then
+	if python @srcdir@/difffloat.py brute_force_27_standard.dat swift_dopair_27_standard.dat @srcdir@/tolerance_27_perturbed_h2.dat 6
+	then
+	    echo "Accuracy test passed"
+	else
+	    echo "Accuracy test failed"
+	    exit 1
+	fi
+    else
+	echo "Error Missing test output file"
+	exit 1
+    fi
+
+    echo "------------"
+    
+done
+
 exit $?
diff --git a/tests/test27cellsPerturbed.sh.in b/tests/test27cellsPerturbed.sh.in
index 2f2e1db76346ca8f0ea4c2365ee349e232a1ce53..f875504e541588377ca6e40fe55681ebec3466f6 100755
--- a/tests/test27cellsPerturbed.sh.in
+++ b/tests/test27cellsPerturbed.sh.in
@@ -1,17 +1,18 @@
 #!/bin/bash
 
+# Test for particles with the same smoothing length
 for v in {0..3}
 do
     echo ""
 
     rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat
 
-    echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -a 5e-4"
-    ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -a 5e-4
+    echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v"
+    ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v
 
     if [ -e brute_force_27_perturbed.dat ]
     then
-	if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed.dat 6 1
+	if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed.dat 6
 	then
 	    echo "Accuracy test passed"
 	else
@@ -27,4 +28,59 @@ do
 
 done
 
+# Test for particles with random smoothing lengths
+for v in {0..3}
+do
+    echo ""
+
+    rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat
+
+    echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.1"
+    ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.1
+
+    if [ -e brute_force_27_perturbed.dat ]
+    then
+	if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed_h.dat 6
+	then
+	    echo "Accuracy test passed"
+	else
+	    echo "Accuracy test failed"
+	    exit 1
+	fi
+    else
+	echo "Error Missing test output file"
+	exit 1
+    fi
+
+    echo "------------"
+
+done
+
+# Test for particles with a larger range of random smoothing lengths
+for v in {0..3}
+do
+    echo ""
+
+    rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat
+
+    echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3"
+    ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3
+
+    if [ -e brute_force_27_perturbed.dat ]
+    then
+	if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed_h2.dat 6
+	then
+	    echo "Accuracy test passed"
+	else
+	    echo "Accuracy test failed"
+	    exit 1
+	fi
+    else
+	echo "Error Missing test output file"
+	exit 1
+    fi
+
+    echo "------------"
+
+done
 exit $?
diff --git a/tests/testActivePair.c b/tests/testActivePair.c
new file mode 100644
index 0000000000000000000000000000000000000000..1e0111b4f0e480d0f66463b4c2264cdd89bd28c8
--- /dev/null
+++ b/tests/testActivePair.c
@@ -0,0 +1,510 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#include "../config.h"
+
+/* Some standard headers. */
+#include <fenv.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+/* Local headers. */
+#include "swift.h"
+
+/**
+ * @brief Constructs a cell and all of its particle in a valid state prior to
+ * a DOPAIR or DOSELF calculation.
+ *
+ * @param n The cube root of the number of particles.
+ * @param offset The position of the cell offset from (0,0,0).
+ * @param size The cell size.
+ * @param h The smoothing length of the particles in units of the inter-particle
+ * separation.
+ * @param density The density of the fluid.
+ * @param partId The running counter of IDs.
+ * @param pert The perturbation to apply to the particles in the cell in units
+ * of the inter-particle separation.
+ * @param h_pert The perturbation to apply to the smoothing length.
+ * @param fraction_active The fraction of particles that should be active in the
+ * cell.
+ */
+struct cell *make_cell(size_t n, double *offset, double size, double h,
+                       double density, long long *partId, double pert,
+                       double h_pert, double fraction_active) {
+  const size_t count = n * n * n;
+  const double volume = size * size * size;
+  float h_max = 0.f;
+  struct cell *cell = malloc(sizeof(struct cell));
+  bzero(cell, sizeof(struct cell));
+
+  if (posix_memalign((void **)&cell->parts, part_align,
+                     count * sizeof(struct part)) != 0) {
+    error("couldn't allocate particles, no. of particles: %d", (int)count);
+  }
+  bzero(cell->parts, count * sizeof(struct part));
+
+  /* Construct the parts */
+  struct part *part = cell->parts;
+  for (size_t x = 0; x < n; ++x) {
+    for (size_t y = 0; y < n; ++y) {
+      for (size_t z = 0; z < n; ++z) {
+        part->x[0] =
+            offset[0] +
+            size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[1] =
+            offset[1] +
+            size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[2] =
+            offset[2] +
+            size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->v[0] = random_uniform(-0.05, 0.05);
+        part->v[1] = random_uniform(-0.05, 0.05);
+        part->v[2] = random_uniform(-0.05, 0.05);
+
+        if (h_pert)
+          part->h = size * h * random_uniform(1.f, h_pert) / (float)n;
+        else
+          part->h = size * h / (float)n;
+        h_max = fmaxf(h_max, part->h);
+        part->id = ++(*partId);
+
+#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
+        part->conserved.mass = density * volume / count;
+
+#ifdef SHADOWFAX_SPH
+        double anchor[3] = {0., 0., 0.};
+        double side[3] = {1., 1., 1.};
+        voronoi_cell_init(&part->cell, part->x, anchor, side);
+#endif
+
+#else
+        part->mass = density * volume / count;
+#endif
+
+#if defined(HOPKINS_PE_SPH)
+        part->entropy = 1.f;
+        part->entropy_one_over_gamma = 1.f;
+#endif
+        if (random_uniform(0, 1.f) < fraction_active)
+          part->time_bin = 1;
+        else
+          part->time_bin = num_time_bins + 1;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        part->ti_drift = 8;
+        part->ti_kick = 8;
+#endif
+
+        ++part;
+      }
+    }
+  }
+
+  /* Cell properties */
+  cell->split = 0;
+  cell->h_max = h_max;
+  cell->count = count;
+  cell->dx_max_part = 0.;
+  cell->dx_max_sort = 0.;
+  cell->width[0] = size;
+  cell->width[1] = size;
+  cell->width[2] = size;
+  cell->loc[0] = offset[0];
+  cell->loc[1] = offset[1];
+  cell->loc[2] = offset[2];
+
+  cell->ti_old_part = 8;
+  cell->ti_end_min = 8;
+  cell->ti_end_max = 8;
+
+  shuffle_particles(cell->parts, cell->count);
+
+  cell->sorted = 0;
+  for (int k = 0; k < 13; k++) cell->sort[k] = NULL;
+
+  return cell;
+}
+
+void clean_up(struct cell *ci) {
+  free(ci->parts);
+  for (int k = 0; k < 13; k++)
+    if (ci->sort[k] != NULL) free(ci->sort[k]);
+  free(ci);
+}
+
+/**
+ * @brief Initializes all particles field to be ready for a density calculation
+ */
+void zero_particle_fields(struct cell *c) {
+  for (int pid = 0; pid < c->count; pid++) {
+    hydro_init_part(&c->parts[pid], NULL);
+  }
+}
+
+/**
+ * @brief Ends the loop by adding the appropriate coefficients
+ */
+void end_calculation(struct cell *c) {
+  for (int pid = 0; pid < c->count; pid++) {
+    hydro_end_density(&c->parts[pid]);
+
+    /* Recover the common "Neighbour number" definition */
+    c->parts[pid].density.wcount *= pow_dimension(c->parts[pid].h);
+    c->parts[pid].density.wcount *= kernel_norm;
+  }
+}
+
+/**
+ * @brief Dump all the particles to a file
+ */
+void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
+  FILE *file = fopen(fileName, "a");
+
+  /* Write header */
+  fprintf(file, "# %4s %13s\n", "ID", "wcount");
+
+  fprintf(file, "# ci --------------------------------------------\n");
+
+  for (int pid = 0; pid < ci->count; pid++) {
+    fprintf(file, "%6llu %13e\n", ci->parts[pid].id,
+            ci->parts[pid].density.wcount);
+  }
+
+  fprintf(file, "# cj --------------------------------------------\n");
+
+  for (int pjd = 0; pjd < cj->count; pjd++) {
+    fprintf(file, "%6llu %13e\n", cj->parts[pjd].id,
+            cj->parts[pjd].density.wcount);
+  }
+
+  fclose(file);
+}
+
+/* Just a forward declaration... */
+void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
+void runner_doself1_density_vec(struct runner *r, struct cell *ci);
+void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
+                                   struct cell *cj);
+
+/**
+ * @brief Computes the pair interactions of two cells using SWIFT and a brute
+ * force implementation.
+ */
+void test_pair_interactions(struct runner *runner, struct cell **ci,
+                            struct cell **cj, char *swiftOutputFileName,
+                            char *bruteForceOutputFileName) {
+
+  runner_do_sort(runner, *ci, 0x1FFF, 0, 0);
+  runner_do_sort(runner, *cj, 0x1FFF, 0, 0);
+
+  /* Zero the fields */
+  zero_particle_fields(*ci);
+  zero_particle_fields(*cj);
+
+  /* Run the test */
+  runner_dopair1_branch_density(runner, *ci, *cj);
+
+  /* Let's get physical ! */
+  end_calculation(*ci);
+  end_calculation(*cj);
+
+  /* Dump if necessary */
+  dump_particle_fields(swiftOutputFileName, *ci, *cj);
+
+  /* Now perform a brute-force version for accuracy tests */
+
+  /* Zero the fields */
+  zero_particle_fields(*ci);
+  zero_particle_fields(*cj);
+
+  /* Run the brute-force test */
+  pairs_all_density(runner, *ci, *cj);
+
+  /* Let's get physical ! */
+  end_calculation(*ci);
+  end_calculation(*cj);
+
+  dump_particle_fields(bruteForceOutputFileName, *ci, *cj);
+}
+
+/**
+ * @brief Computes the pair interactions of two cells in various configurations.
+ */
+void test_all_pair_interactions(struct runner *runner, double *offset2,
+                                size_t particles, double size, double h,
+                                double rho, long long *partId,
+                                double perturbation, double h_pert,
+                                char *swiftOutputFileName,
+                                char *bruteForceOutputFileName) {
+
+  double offset1[3] = {0, 0, 0};
+  struct cell *ci, *cj;
+
+  /* All active particles. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 1.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 1.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* Half particles are active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* All particles inactive. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* 10% of particles active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.1);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.1);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One active cell one inactive cell. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 1.0);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One active cell one inactive cell. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 1.0);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* Smaller cells, all active. */
+  ci = make_cell(2, offset1, size, h, rho, partId, perturbation, h_pert, 1.0);
+  cj = make_cell(2, offset2, size, h, rho, partId, perturbation, h_pert, 1.0);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* Different numbers of particles in each cell. */
+  ci = make_cell(10, offset1, size, h, rho, partId, perturbation, h_pert, 0.5);
+  cj = make_cell(3, offset2, size, h, rho, partId, perturbation, h_pert, 0.75);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One cell inactive and the other only half active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  clean_up(ci);
+  clean_up(cj);
+
+  /* One cell inactive and the other only half active. */
+  ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
+                 0.);
+  cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert,
+                 0.5);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName);
+
+  /* Clean things to make the sanitizer happy ... */
+  clean_up(ci);
+  clean_up(cj);
+}
+
+int main(int argc, char *argv[]) {
+  size_t particles = 0, runs = 0, type = 0;
+  double h = 1.23485, size = 1., rho = 1.;
+  double perturbation = 0.1, h_pert = 1.1;
+  struct space space;
+  struct engine engine;
+  struct runner *runner;
+  char c;
+  static long long partId = 0;
+  char outputFileNameExtension[200] = "";
+  char swiftOutputFileName[200] = "";
+  char bruteForceOutputFileName[200] = "";
+
+  /* Initialize CPU frequency, this also starts time. */
+  unsigned long long cpufreq = 0;
+  clocks_set_cpufreq(cpufreq);
+
+  /* Choke on FP-exceptions */
+  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  /* Generate a RNG seed from time. */
+  unsigned int seed = time(NULL);
+
+  while ((c = getopt(argc, argv, "h:p:n:r:t:d:s:f:")) != -1) {
+    switch (c) {
+      case 'h':
+        sscanf(optarg, "%lf", &h);
+        break;
+      case 'p':
+        sscanf(optarg, "%lf", &h_pert);
+        break;
+      case 'n':
+        sscanf(optarg, "%zu", &particles);
+        break;
+      case 'r':
+        sscanf(optarg, "%zu", &runs);
+        break;
+      case 't':
+        sscanf(optarg, "%zu", &type);
+        break;
+      case 'd':
+        sscanf(optarg, "%lf", &perturbation);
+        break;
+      case 's':
+        sscanf(optarg, "%u", &seed);
+        break;
+      case 'f':
+        strcpy(outputFileNameExtension, optarg);
+        break;
+      case '?':
+        error("Unknown option.");
+        break;
+    }
+  }
+
+  if (h < 0 || particles == 0 || runs == 0 || type > 2) {
+    printf(
+        "\nUsage: %s -n PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n"
+        "\nGenerates a cell pair, filled with particles on a Cartesian grid."
+        "\nThese are then interacted using runner_dopair1_density."
+        "\n\nOptions:"
+        "\n-t TYPE=0          - cells share face (0), edge (1) or corner (2)"
+        "\n-h DISTANCE=1.2348 - smoothing length"
+        "\n-p                 - Random fractional change in h, h=h*random(1,p)"
+        "\n-d pert            - perturbation to apply to the particles [0,1["
+        "\n-s seed            - seed for RNG"
+        "\n-f fileName        - part of the file name used to save the dumps\n",
+        argv[0]);
+    exit(1);
+  }
+
+  /* Seed RNG. */
+  message("Seed used for RNG: %d", seed);
+  srand(seed);
+
+  space.periodic = 0;
+  space.dim[0] = 3.;
+  space.dim[1] = 3.;
+  space.dim[2] = 3.;
+
+  engine.s = &space;
+  engine.time = 0.1f;
+  engine.ti_current = 8;
+  engine.max_active_bin = num_time_bins;
+
+  if (posix_memalign((void **)&runner, SWIFT_STRUCT_ALIGNMENT,
+                     sizeof(struct runner)) != 0) {
+    error("couldn't allocate runner");
+  }
+
+  runner->e = &engine;
+
+  /* Create output file names. */
+  sprintf(swiftOutputFileName, "swift_dopair_%s.dat", outputFileNameExtension);
+  sprintf(bruteForceOutputFileName, "brute_force_%s.dat",
+          outputFileNameExtension);
+
+  /* Delete files if they already exist. */
+  remove(swiftOutputFileName);
+  remove(bruteForceOutputFileName);
+
+#ifdef WITH_VECTORIZATION
+  runner->ci_cache.count = 0;
+  cache_init(&runner->ci_cache, 512);
+  runner->cj_cache.count = 0;
+  cache_init(&runner->cj_cache, 512);
+#endif
+
+  double offset[3] = {1., 0., 0.};
+
+  /* Test a pair of cells face-on. */
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName);
+
+  /* Test a pair of cells edge-on. */
+  offset[0] = 1.;
+  offset[1] = 1.;
+  offset[2] = 0.;
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName);
+
+  /* Test a pair of cells corner-on. */
+  offset[0] = 1.;
+  offset[1] = 1.;
+  offset[2] = 1.;
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName);
+  return 0;
+}
diff --git a/tests/testActivePair.sh.in b/tests/testActivePair.sh.in
new file mode 100755
index 0000000000000000000000000000000000000000..ff8d027a469bd9bc78286b843cf2dffd3ef27ad3
--- /dev/null
+++ b/tests/testActivePair.sh.in
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+echo ""
+
+rm -f brute_force_active.dat swift_dopair_active.dat
+
+./testActivePair -n 6 -r 1 -d 0 -f active
+
+python @srcdir@/difffloat.py brute_force_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat
+
+exit $?
diff --git a/tests/testInteractions.c b/tests/testInteractions.c
index 4ce7fe40554d24551750629fa47c0bee7acdb6da..54d1f38733a1f1647331166f1a37b40ed3511419 100644
--- a/tests/testInteractions.c
+++ b/tests/testInteractions.c
@@ -17,12 +17,6 @@
  *
  ******************************************************************************/
 
-#include "../config.h"
-
-#ifndef WITH_VECTORIZATION
-int main() { return 0; }
-#else
-
 #include <fenv.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -30,15 +24,17 @@ int main() { return 0; }
 #include <unistd.h>
 #include "swift.h"
 
+#ifdef WITH_VECTORIZATION
+
 #define array_align sizeof(float) * VEC_SIZE
 #define ACC_THRESHOLD 1e-5
 
-/* Typdef function pointers for serial and vectorised versions of the
- * interaction functions. */
-typedef void (*serial_interaction)(float, float *, float, float, struct part *,
-                                   struct part *);
-typedef void (*vec_interaction)(float *, float *, float *, float *,
-                                struct part **, struct part **);
+#ifndef IACT
+#define IACT runner_iact_nonsym_density
+#define IACT_VEC runner_iact_nonsym_1_vec_density
+#define IACT_NAME "test_nonsym_density"
+#define NUM_VEC_PROC_INT 1
+#endif
 
 /**
  * @brief Constructs an array of particles in a valid state prior to
@@ -74,7 +70,10 @@ struct part *make_particles(size_t count, double *offset, double spacing,
 
   p->h = h;
   p->id = ++(*partId);
+
+#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH)
   p->mass = 1.0f;
+#endif
 
   /* Place rest of particles around the test particle
    * with random position within a unit sphere. */
@@ -93,7 +92,9 @@ struct part *make_particles(size_t count, double *offset, double spacing,
 
     p->h = h;
     p->id = ++(*partId);
+#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH)
     p->mass = 1.0f;
+#endif
   }
   return particles;
 }
@@ -103,6 +104,7 @@ struct part *make_particles(size_t count, double *offset, double spacing,
  */
 void prepare_force(struct part *parts, size_t count) {
 
+#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) && !defined(MINIMAL_SPH)
   struct part *p;
   for (size_t i = 0; i < count; ++i) {
     p = &parts[i];
@@ -113,6 +115,7 @@ void prepare_force(struct part *parts, size_t count) {
     p->force.v_sig = 0.0f;
     p->force.h_dt = 0.0f;
   }
+#endif
 }
 
 /**
@@ -122,25 +125,26 @@ void dump_indv_particle_fields(char *fileName, struct part *p) {
 
   FILE *file = fopen(fileName, "a");
 
+  /* Write header */
   fprintf(file,
-          "%6llu %10f %10f %10f %10f %10f %10f %10e %10e %10e %13e %13e %13e "
-          "%13e %13e %13e %13e "
-          "%13e %13e %13e %10f\n",
+          "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
+          "%13e %13e %13e\n",
           p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2],
-          p->a_hydro[0], p->a_hydro[1], p->a_hydro[2], p->rho,
-          p->density.rho_dh, p->density.wcount, p->density.wcount_dh,
-          p->force.h_dt, p->force.v_sig,
-#if defined(GADGET2_SPH)
-          p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
-          p->density.rot_v[2], p->entropy_dt
-#elif defined(DEFAULT_SPH)
-          p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
-          p->density.rot_v[2], 0.
+          hydro_get_density(p),
+#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
+          0.f,
 #else
+          p->density.rho_dh,
+#endif
+          p->density.wcount, p->density.wcount_dh,
+#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH)
           p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
           p->density.rot_v[2]
+#else
+          0., 0., 0., 0.
 #endif
           );
+
   fclose(file);
 }
 
@@ -152,13 +156,10 @@ void write_header(char *fileName) {
   FILE *file = fopen(fileName, "w");
   /* Write header */
   fprintf(file,
-          "# %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s %13s %13s %13s "
-          "%13s %13s %13s %13s"
-          "%13s %13s %13s %13s\n",
-          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "a_x", "a_y",
-          "a_z", "rho", "rho_dh", "wcount", "wcount_dh", "dh/dt", "v_sig",
-          "div_v", "curl_vx", "curl_vy", "curl_vz", "dS/dt");
-  fprintf(file, "\n# PARTICLES BEFORE INTERACTION:\n");
+          "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s "
+          "%13s %13s %13s\n",
+          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh",
+          "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz");
   fclose(file);
 }
 
@@ -187,8 +188,8 @@ int check_results(struct part serial_test_part, struct part *serial_parts,
 }
 
 /*
- * @brief Calls the serial and vectorised version of an interaction
- * function given by the function pointers.
+ * @brief Calls the serial and vectorised version of the non-symmetrical density
+ * interaction.
  *
  * @param test_part Particle that will be updated
  * @param parts Particle array to be interacted
@@ -196,16 +197,15 @@ int check_results(struct part serial_test_part, struct part *serial_parts,
  * @param serial_inter_func Serial interaction function to be called
  * @param vec_inter_func Vectorised interaction function to be called
  * @param runs No. of times to call interactions
+ * @param num_vec_proc No. of vectors to use to process interaction
  *
  */
 void test_interactions(struct part test_part, struct part *parts, size_t count,
-                       serial_interaction serial_inter_func,
-                       vec_interaction vec_inter_func, char *filePrefix,
-                       size_t runs) {
+                       char *filePrefix, int runs, int num_vec_proc) {
 
-  ticks serial_time = 0, vec_time = 0;
+  ticks serial_time = 0;
+  ticks vec_time = 0;
 
-  FILE *file;
   char serial_filename[200] = "";
   char vec_filename[200] = "";
 
@@ -217,64 +217,68 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
   write_header(serial_filename);
   write_header(vec_filename);
 
-  /* Test particle at the center of a unit sphere. */
   struct part pi_serial, pi_vec;
-
-  /* Remaining particles in the sphere that will interact with test particle. */
   struct part pj_serial[count], pj_vec[count];
 
-  /* Stores the separation, smoothing length and pointers to particles
-   * needed for the vectorised interaction. */
+  float r2[count] __attribute__((aligned(array_align)));
+  float dx[3 * count] __attribute__((aligned(array_align)));
+
+  struct part *piq[count], *pjq[count];
+  for (size_t k = 0; k < count; k++) {
+    piq[k] = NULL;
+    pjq[k] = NULL;
+  }
+
   float r2q[count] __attribute__((aligned(array_align)));
   float hiq[count] __attribute__((aligned(array_align)));
-  float hjq[count] __attribute__((aligned(array_align)));
-  float dxq[3 * count] __attribute__((aligned(array_align)));
-  struct part *piq[count], *pjq[count];
+  float dxq[count] __attribute__((aligned(array_align)));
+
+  float dyq[count] __attribute__((aligned(array_align)));
+  float dzq[count] __attribute__((aligned(array_align)));
+  float mjq[count] __attribute__((aligned(array_align)));
+  float vixq[count] __attribute__((aligned(array_align)));
+  float viyq[count] __attribute__((aligned(array_align)));
+  float vizq[count] __attribute__((aligned(array_align)));
+  float vjxq[count] __attribute__((aligned(array_align)));
+  float vjyq[count] __attribute__((aligned(array_align)));
+  float vjzq[count] __attribute__((aligned(array_align)));
 
   /* Call serial interaction a set number of times. */
-  for (size_t k = 0; k < runs; k++) {
+  for (int k = 0; k < runs; k++) {
     /* Reset particle to initial setup */
     pi_serial = test_part;
     for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i];
 
-    /* Only dump data on first run. */
-    if (k == 0) {
-      /* Dump state of particles before serial interaction. */
-      dump_indv_particle_fields(serial_filename, &pi_serial);
-      for (size_t i = 0; i < count; i++)
-        dump_indv_particle_fields(serial_filename, &pj_serial[i]);
-    }
-
     /* Perform serial interaction */
     for (size_t i = 0; i < count; i++) {
       /* Compute the pairwise distance. */
-      float r2 = 0.0f;
-      float dx[3];
-      for (size_t k = 0; k < 3; k++) {
-        dx[k] = pi_serial.x[k] - pj_serial[i].x[k];
-        r2 += dx[k] * dx[k];
+      r2[i] = 0.0f;
+      for (int k = 0; k < 3; k++) {
+        int ind = (3 * i) + k;
+        dx[ind] = pi_serial.x[k] - pj_serial[i].x[k];
+        r2[i] += dx[ind] * dx[ind];
       }
+    }
 
-      const ticks tic = getticks();
-
-      serial_inter_func(r2, dx, pi_serial.h, pj_serial[i].h, &pi_serial,
-                        &pj_serial[i]);
-
-      serial_time += getticks() - tic;
+    const ticks tic = getticks();
+/* Perform serial interaction */
+#ifdef __ICC
+#pragma novector
+#endif
+    for (size_t i = 0; i < count; i++) {
+      IACT(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h, &pi_serial,
+           &pj_serial[i]);
     }
+    serial_time += getticks() - tic;
   }
 
-  file = fopen(serial_filename, "a");
-  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
-  fclose(file);
-
   /* Dump result of serial interaction. */
   dump_indv_particle_fields(serial_filename, &pi_serial);
   for (size_t i = 0; i < count; i++)
     dump_indv_particle_fields(serial_filename, &pj_serial[i]);
 
   /* Call vector interaction a set number of times. */
-  for (size_t k = 0; k < runs; k++) {
+  for (int k = 0; k < runs; k++) {
     /* Reset particle to initial setup */
     pi_vec = test_part;
     for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i];
@@ -284,45 +288,92 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
       /* Compute the pairwise distance. */
       float r2 = 0.0f;
       float dx[3];
-      for (size_t k = 0; k < 3; k++) {
+      for (int k = 0; k < 3; k++) {
         dx[k] = pi_vec.x[k] - pj_vec[i].x[k];
         r2 += dx[k] * dx[k];
       }
 
       r2q[i] = r2;
-      dxq[3 * i + 0] = dx[0];
-      dxq[3 * i + 1] = dx[1];
-      dxq[3 * i + 2] = dx[2];
+      dxq[i] = dx[0];
       hiq[i] = pi_vec.h;
-      hjq[i] = pj_vec[i].h;
       piq[i] = &pi_vec;
       pjq[i] = &pj_vec[i];
-    }
 
-    /* Only dump data on first run. */
-    if (k == 0) {
-      /* Dump state of particles before vector interaction. */
-      dump_indv_particle_fields(vec_filename, piq[0]);
-      for (size_t i = 0; i < count; i++)
-        dump_indv_particle_fields(vec_filename, pjq[i]);
+      dyq[i] = dx[1];
+      dzq[i] = dx[2];
+      mjq[i] = pj_vec[i].mass;
+      vixq[i] = pi_vec.v[0];
+      viyq[i] = pi_vec.v[1];
+      vizq[i] = pi_vec.v[2];
+      vjxq[i] = pj_vec[i].v[0];
+      vjyq[i] = pj_vec[i].v[1];
+      vjzq[i] = pj_vec[i].v[2];
     }
 
+    /* Perform vector interaction. */
+    vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec;
+    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
+        curlvySum, curlvzSum;
+    mask_t mask, mask2;
+
+    rhoSum.v = vec_set1(0.f);
+    rho_dhSum.v = vec_set1(0.f);
+    wcountSum.v = vec_set1(0.f);
+    wcount_dhSum.v = vec_set1(0.f);
+    div_vSum.v = vec_set1(0.f);
+    curlvxSum.v = vec_set1(0.f);
+    curlvySum.v = vec_set1(0.f);
+    curlvzSum.v = vec_set1(0.f);
+
+    hi_vec.v = vec_load(&hiq[0]);
+    vix_vec.v = vec_load(&vixq[0]);
+    viy_vec.v = vec_load(&viyq[0]);
+    viz_vec.v = vec_load(&vizq[0]);
+
+    hi_inv_vec = vec_reciprocal(hi_vec);
+    vec_init_mask_true(mask);
+    vec_init_mask_true(mask2);
+
     const ticks vec_tic = getticks();
 
-    /* Perform vector interaction. */
-    for (size_t i = 0; i < count; i += VEC_SIZE) {
-      vec_inter_func(&(r2q[i]), &(dxq[3 * i]), &(hiq[i]), &(hjq[i]), &(piq[i]),
-                     &(pjq[i]));
+    for (size_t i = 0; i < count; i += num_vec_proc * VEC_SIZE) {
+
+      /* Interleave two vectors for interaction. */
+      if (num_vec_proc == 2) {
+        runner_iact_nonsym_2_vec_density(
+            &(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec), (vix_vec),
+            (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]),
+            &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
+            &curlvxSum, &curlvySum, &curlvzSum, mask, mask2, 0);
+      } else { /* Only use one vector for interaction. */
+
+        vector r2, dx, dy, dz;
+        r2.v = vec_load(&(r2q[i]));
+        dx.v = vec_load(&(dxq[i]));
+        dy.v = vec_load(&(dyq[i]));
+        dz.v = vec_load(&(dzq[i]));
+
+        runner_iact_nonsym_1_vec_density(
+            &r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec), (viz_vec),
+            &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum,
+            &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
+            &curlvzSum, mask);
+      }
     }
 
+    VEC_HADD(rhoSum, piq[0]->rho);
+    VEC_HADD(rho_dhSum, piq[0]->density.rho_dh);
+    VEC_HADD(wcountSum, piq[0]->density.wcount);
+    VEC_HADD(wcount_dhSum, piq[0]->density.wcount_dh);
+    VEC_HADD(div_vSum, piq[0]->density.div_v);
+    VEC_HADD(curlvxSum, piq[0]->density.rot_v[0]);
+    VEC_HADD(curlvySum, piq[0]->density.rot_v[1]);
+    VEC_HADD(curlvzSum, piq[0]->density.rot_v[2]);
+
     vec_time += getticks() - vec_tic;
   }
 
-  file = fopen(vec_filename, "a");
-  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
-  fclose(file);
-
-  /* Dump result of vector interaction. */
+  /* Dump result of vector interaction. */
   dump_indv_particle_fields(vec_filename, piq[0]);
   for (size_t i = 0; i < count; i++)
     dump_indv_particle_fields(vec_filename, pjq[i]);
@@ -334,6 +385,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
   message("The serial interactions took     : %15lli ticks.",
           serial_time / runs);
   message("The vectorised interactions took : %15lli ticks.", vec_time / runs);
+  message("Speed up: %15fx.", (double)(serial_time) / vec_time);
 }
 
 /* And go... */
@@ -386,62 +438,22 @@ int main(int argc, char *argv[]) {
 
   /* Build the infrastructure */
   static long long partId = 0;
-  struct part density_test_particle, force_test_particle;
-  struct part *density_particles =
-      make_particles(count, offset, spacing, h, &partId);
-  struct part *force_particles =
-      make_particles(count, offset, spacing, h, &partId);
-  prepare_force(force_particles, count);
-
-  /* Define which interactions to call */
-  serial_interaction serial_inter_func = &runner_iact_nonsym_density;
-  vec_interaction vec_inter_func = &runner_iact_nonsym_vec_density;
-
-  density_test_particle = density_particles[0];
+  struct part test_particle;
+  struct part *particles = make_particles(count, offset, spacing, h, &partId);
+
+  test_particle = particles[0];
   /* Call the non-sym density test. */
-  message("Testing non-symmetrical density interaction...");
-  test_interactions(density_test_particle, &density_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_nonsym_density",
-                    runs);
-
-  density_particles = make_particles(count, offset, spacing, h, &partId);
-
-  /* Re-assign function pointers. */
-  serial_inter_func = &runner_iact_density;
-  vec_inter_func = &runner_iact_vec_density;
-
-  density_test_particle = density_particles[0];
-  /* Call the symmetrical density test. */
-  message("Testing symmetrical density interaction...");
-  test_interactions(density_test_particle, &density_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_sym_density",
-                    runs);
-
-  /* Re-assign function pointers. */
-  serial_inter_func = &runner_iact_nonsym_force;
-  vec_inter_func = &runner_iact_nonsym_vec_force;
-
-  force_test_particle = force_particles[0];
-  /* Call the test non-sym force test. */
-  message("Testing non-symmetrical force interaction...");
-  test_interactions(force_test_particle, &force_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_nonsym_force",
-                    runs);
-
-  force_particles = make_particles(count, offset, spacing, h, &partId);
-  prepare_force(force_particles, count);
-
-  /* Re-assign function pointers. */
-  serial_inter_func = &runner_iact_force;
-  vec_inter_func = &runner_iact_vec_force;
-
-  force_test_particle = force_particles[0];
-  /* Call the test symmetrical force test. */
-  message("Testing symmetrical force interaction...");
-  test_interactions(force_test_particle, &force_particles[1], count - 1,
-                    serial_inter_func, vec_inter_func, "test_sym_force", runs);
+  message("Testing %s interaction...", IACT_NAME);
+  test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs,
+                    1);
+  test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs,
+                    2);
 
   return 0;
 }
 
-#endif /* WITH_VECTORIZATION */
+#else
+
+int main() { return 1; }
+
+#endif
diff --git a/tests/testInteractions.sh.in b/tests/testInteractions.sh.in
new file mode 100644
index 0000000000000000000000000000000000000000..4b002c56e37eff417c673ddac2e44b3edf17683a
--- /dev/null
+++ b/tests/testInteractions.sh.in
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+echo ""
+
+rm -f test_nonsym_density_serial.dat test_nonsym_density_vec.dat
+
+echo "Running ./testInteractions"
+
+./testInteractions
+
+if [ $? != 0 ]; then
+  echo "testInteractions is redundant when vectorisation is disabled"
+else
+  if [ -e test_nonsym_density_serial.dat ]
+  then
+    if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_vec.dat @srcdir@/tolerance_testInteractions.dat
+    then
+      echo "Accuracy test passed"
+    else
+      echo "Accuracy test failed"
+      exit 1
+    fi
+  else
+    echo "Error Missing test output file"
+    exit 1
+  fi
+fi
+
+echo "------------"
diff --git a/tests/testKernel.c b/tests/testKernel.c
index a3731188e51b1235fe84f36eab7c270c788f7dea..0658639070526f28ce1bceefc54d3f2d7a3ae765 100644
--- a/tests/testKernel.c
+++ b/tests/testKernel.c
@@ -68,7 +68,7 @@ int main() {
       vx.f[j] = (i + j) * 2.25f / numPoints;
     }
 
-    vx_h.v = vec_mul(vx.v, vec_set1(1.f/h));
+    vx_h.v = vec_mul(vx.v, vec_set1(1.f / h));
 
     kernel_deval_1_vec(&vx_h, &W_vec, &dW_vec);
 
@@ -106,8 +106,8 @@ int main() {
       vx_2.f[j] = (i + j) * 2.25f / numPoints;
     }
 
-    vx_h.v = vec_mul(vx.v, vec_set1(1.f/h));
-    vx_h_2.v = vec_mul(vx_2.v, vec_set1(1.f/h));
+    vx_h.v = vec_mul(vx.v, vec_set1(1.f / h));
+    vx_h_2.v = vec_mul(vx_2.v, vec_set1(1.f / h));
 
     kernel_deval_2_vec(&vx_h, &W_vec, &dW_vec, &vx_h_2, &W_vec_2, &dW_vec_2);
 
diff --git a/tests/testPair.c b/tests/testPair.c
deleted file mode 100644
index 92987d2fdb625fec6e186a280837f145787f599b..0000000000000000000000000000000000000000
--- a/tests/testPair.c
+++ /dev/null
@@ -1,322 +0,0 @@
-/*******************************************************************************
- * This file is part of SWIFT.
- * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- ******************************************************************************/
-
-#include <fenv.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include "swift.h"
-
-/* n is both particles per axis and box size:
- * particles are generated on a mesh with unit spacing
- */
-struct cell *make_cell(size_t n, double *offset, double size, double h,
-                       double density, unsigned long long *partId,
-                       double pert) {
-  const size_t count = n * n * n;
-  const double volume = size * size * size;
-  struct cell *cell = malloc(sizeof(struct cell));
-  bzero(cell, sizeof(struct cell));
-
-  if (posix_memalign((void **)&cell->parts, part_align,
-                     count * sizeof(struct part)) != 0) {
-    error("couldn't allocate particles, no. of particles: %d", (int)count);
-  }
-  bzero(cell->parts, count * sizeof(struct part));
-
-  /* Construct the parts */
-  struct part *part = cell->parts;
-  for (size_t x = 0; x < n; ++x) {
-    for (size_t y = 0; y < n; ++y) {
-      for (size_t z = 0; z < n; ++z) {
-        part->x[0] =
-            offset[0] +
-            size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-        part->x[1] =
-            offset[1] +
-            size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-        part->x[2] =
-            offset[2] +
-            size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
-        // part->v[0] = part->x[0] - 1.5;
-        // part->v[1] = part->x[1] - 1.5;
-        // part->v[2] = part->x[2] - 1.5;
-        part->v[0] = random_uniform(-0.05, 0.05);
-        part->v[1] = random_uniform(-0.05, 0.05);
-        part->v[2] = random_uniform(-0.05, 0.05);
-        part->h = size * h / (float)n;
-        part->id = ++(*partId);
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-        part->conserved.mass = density * volume / count;
-#else
-        part->mass = density * volume / count;
-#endif
-        part->time_bin = 1;
-
-#ifdef SWIFT_DEBUG_CHECKS
-        part->ti_drift = 8;
-        part->ti_kick = 8;
-#endif
-
-        ++part;
-      }
-    }
-  }
-
-  /* Cell properties */
-  cell->split = 0;
-  cell->h_max = h;
-  cell->count = count;
-  cell->dx_max_part = 0.;
-  cell->dx_max_sort = 0.;
-  cell->width[0] = n;
-  cell->width[1] = n;
-  cell->width[2] = n;
-  cell->loc[0] = offset[0];
-  cell->loc[1] = offset[1];
-  cell->loc[2] = offset[2];
-
-  cell->ti_old_part = 8;
-  cell->ti_end_min = 8;
-  cell->ti_end_max = 8;
-
-  shuffle_particles(cell->parts, cell->count);
-
-  cell->sorted = 0;
-  cell->sort = NULL;
-  cell->sortsize = 0;
-
-  return cell;
-}
-
-void clean_up(struct cell *ci) {
-  free(ci->parts);
-  free(ci->sort);
-  free(ci);
-}
-
-/**
- * @brief Initializes all particles field to be ready for a density calculation
- */
-void zero_particle_fields(struct cell *c) {
-  for (int pid = 0; pid < c->count; pid++) {
-    hydro_init_part(&c->parts[pid], NULL);
-  }
-}
-
-/**
- * @brief Dump all the particles to a file
- */
-void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
-  FILE *file = fopen(fileName, "w");
-
-  /* Write header */
-  fprintf(file,
-          "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s "
-          "%13s %13s %13s\n",
-          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh",
-          "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz");
-
-  fprintf(file, "# ci --------------------------------------------\n");
-
-  for (int pid = 0; pid < ci->count; pid++) {
-    fprintf(file,
-            "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
-            "%13e %13e %13e\n",
-            ci->parts[pid].id, ci->parts[pid].x[0], ci->parts[pid].x[1],
-            ci->parts[pid].x[2], ci->parts[pid].v[0], ci->parts[pid].v[1],
-            ci->parts[pid].v[2], hydro_get_density(&ci->parts[pid]),
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-            0.f,
-#else
-            ci->parts[pid].density.rho_dh,
-#endif
-            ci->parts[pid].density.wcount, ci->parts[pid].density.wcount_dh,
-#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH)
-            ci->parts[pid].density.div_v, ci->parts[pid].density.rot_v[0],
-            ci->parts[pid].density.rot_v[1], ci->parts[pid].density.rot_v[2]
-#else
-            0., 0., 0., 0.
-#endif
-            );
-  }
-
-  fprintf(file, "# cj --------------------------------------------\n");
-
-  for (int pjd = 0; pjd < cj->count; pjd++) {
-    fprintf(file,
-            "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
-            "%13e %13e %13e\n",
-            cj->parts[pjd].id, cj->parts[pjd].x[0], cj->parts[pjd].x[1],
-            cj->parts[pjd].x[2], cj->parts[pjd].v[0], cj->parts[pjd].v[1],
-            cj->parts[pjd].v[2], hydro_get_density(&cj->parts[pjd]),
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-            0.f,
-#else
-            cj->parts[pjd].density.rho_dh,
-#endif
-            cj->parts[pjd].density.wcount, cj->parts[pjd].density.wcount_dh,
-#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH)
-            cj->parts[pjd].density.div_v, cj->parts[pjd].density.rot_v[0],
-            cj->parts[pjd].density.rot_v[1], cj->parts[pjd].density.rot_v[2]
-#else
-            0., 0., 0., 0.
-#endif
-            );
-  }
-
-  fclose(file);
-}
-
-/* Just a forward declaration... */
-void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
-
-int main(int argc, char *argv[]) {
-  size_t particles = 0, runs = 0, volume, type = 0;
-  double offset[3] = {0, 0, 0}, h = 1.1255, size = 1., rho = 1.;
-  double perturbation = 0.;
-  struct cell *ci, *cj;
-  struct space space;
-  struct engine engine;
-  struct runner runner;
-  char c;
-  static unsigned long long partId = 0;
-  char outputFileNameExtension[200] = "";
-  char outputFileName[200] = "";
-  ticks tic, toc, time;
-
-  /* Initialize CPU frequency, this also starts time. */
-  unsigned long long cpufreq = 0;
-  clocks_set_cpufreq(cpufreq);
-
-  srand(0);
-
-  while ((c = getopt(argc, argv, "h:p:r:t:d:f:")) != -1) {
-    switch (c) {
-      case 'h':
-        sscanf(optarg, "%lf", &h);
-        break;
-      case 'p':
-        sscanf(optarg, "%zu", &particles);
-        break;
-      case 'r':
-        sscanf(optarg, "%zu", &runs);
-        break;
-      case 't':
-        sscanf(optarg, "%zu", &type);
-        break;
-      case 'd':
-        sscanf(optarg, "%lf", &perturbation);
-        break;
-      case 'f':
-        strcpy(outputFileNameExtension, optarg);
-        break;
-      case '?':
-        error("Unknown option.");
-        break;
-    }
-  }
-
-  if (h < 0 || particles == 0 || runs == 0 || type > 2) {
-    printf(
-        "\nUsage: %s -p PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n"
-        "\nGenerates a cell pair, filled with particles on a Cartesian grid."
-        "\nThese are then interacted using runner_dopair1_density."
-        "\n\nOptions:"
-        "\n-t TYPE=0          - cells share face (0), edge (1) or corner (2)"
-        "\n-h DISTANCE=1.1255 - smoothing length"
-        "\n-d pert            - perturbation to apply to the particles [0,1["
-        "\n-f fileName        - part of the file name used to save the dumps\n",
-        argv[0]);
-    exit(1);
-  }
-
-  space.periodic = 0;
-
-  engine.s = &space;
-  engine.time = 0.1f;
-  engine.ti_current = 8;
-  engine.max_active_bin = num_time_bins;
-  runner.e = &engine;
-
-  volume = particles * particles * particles;
-  message("particles: %zu B\npositions: 0 B", 2 * volume * sizeof(struct part));
-
-  ci = make_cell(particles, offset, size, h, rho, &partId, perturbation);
-  for (size_t i = 0; i < type + 1; ++i) offset[i] = 1.;
-  cj = make_cell(particles, offset, size, h, rho, &partId, perturbation);
-
-  runner_do_sort(&runner, ci, 0x1FFF, 0);
-  runner_do_sort(&runner, cj, 0x1FFF, 0);
-
-  time = 0;
-  for (size_t i = 0; i < runs; ++i) {
-    /* Zero the fields */
-    zero_particle_fields(ci);
-    zero_particle_fields(cj);
-
-    tic = getticks();
-
-#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION)
-    /* Run the test */
-    runner_dopair1_density(&runner, ci, cj);
-#endif
-
-    toc = getticks();
-    time += toc - tic;
-
-    /* Dump if necessary */
-    if (i % 50 == 0) {
-      sprintf(outputFileName, "swift_dopair_%s.dat", outputFileNameExtension);
-      dump_particle_fields(outputFileName, ci, cj);
-    }
-  }
-
-  /* Output timing */
-  message("SWIFT calculation took       %lli ticks.", time / runs);
-
-  /* Now perform a brute-force version for accuracy tests */
-
-  /* Zero the fields */
-  zero_particle_fields(ci);
-  zero_particle_fields(cj);
-
-  tic = getticks();
-
-#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION)
-  /* Run the brute-force test */
-  pairs_all_density(&runner, ci, cj);
-#endif
-
-  toc = getticks();
-
-  /* Dump */
-  sprintf(outputFileName, "brute_force_%s.dat", outputFileNameExtension);
-  dump_particle_fields(outputFileName, ci, cj);
-
-  /* Output timing */
-  message("Brute force calculation took %lli ticks.", toc - tic);
-
-  /* Clean things to make the sanitizer happy ... */
-  clean_up(ci);
-  clean_up(cj);
-
-  return 0;
-}
diff --git a/tests/testPair.sh.in b/tests/testPair.sh.in
deleted file mode 100755
index bd7051b060c4acab6cf5a164af1914715856849b..0000000000000000000000000000000000000000
--- a/tests/testPair.sh.in
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-echo ""
-
-rm -f brute_force_standard.dat swift_dopair_standard.dat
-
-./testPair -p 6 -r 1 -d 0 -f standard
-
-python @srcdir@/difffloat.py brute_force_standard.dat swift_dopair_standard.dat @srcdir@/tolerance_pair_normal.dat
-
-exit $?
diff --git a/tests/testPairPerturbed.sh.in b/tests/testPairPerturbed.sh.in
deleted file mode 100755
index 9f214e25a098448a906f9da307ea569e327cfdea..0000000000000000000000000000000000000000
--- a/tests/testPairPerturbed.sh.in
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-echo ""
-
-rm -f brute_force_perturbed.dat swift_dopair_perturbed.dat
-
-./testPair -p 6 -r 1 -d 0.1 -f perturbed
-
-python @srcdir@/difffloat.py brute_force_perturbed.dat swift_dopair_perturbed.dat @srcdir@/tolerance_pair_perturbed.dat
-
-exit $?
diff --git a/tests/testPeriodicBC.c b/tests/testPeriodicBC.c
new file mode 100644
index 0000000000000000000000000000000000000000..6fa2dc607b996b9e8508338a9806633c5a4d1a89
--- /dev/null
+++ b/tests/testPeriodicBC.c
@@ -0,0 +1,587 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
+#include <fenv.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Local headers. */
+#include "swift.h"
+
+#define ACC_THRESHOLD 1e-5
+
+#if defined(WITH_VECTORIZATION)
+#define DOSELF1 runner_doself1_density_vec
+#define DOPAIR1 runner_dopair1_branch_density
+#define DOSELF1_NAME "runner_doself1_density_vec"
+#define DOPAIR1_NAME "runner_dopair1_density_vec"
+#endif
+
+#ifndef DOSELF1
+#define DOSELF1 runner_doself1_density
+#define DOSELF1_NAME "runner_doself1_density"
+#endif
+
+#ifndef DOPAIR1
+#define DOPAIR1 runner_dopair1_branch_density
+#define DOPAIR1_NAME "runner_dopair1_density"
+#endif
+
+enum velocity_types {
+  velocity_zero,
+  velocity_random,
+  velocity_divergent,
+  velocity_rotating
+};
+
+/**
+ * @brief Constructs a cell and all of its particle in a valid state prior to
+ * a DOPAIR or DOSELF calcuation.
+ *
+ * @param n The cube root of the number of particles.
+ * @param offset The position of the cell offset from (0,0,0).
+ * @param size The cell size.
+ * @param h The smoothing length of the particles in units of the inter-particle
+ *separation.
+ * @param density The density of the fluid.
+ * @param partId The running counter of IDs.
+ * @param pert The perturbation to apply to the particles in the cell in units
+ * of the inter-particle separation.
+ * @param vel The type of velocity field (0, random, divergent, rotating)
+ */
+struct cell *make_cell(size_t n, double *offset, double size, double h,
+                       double density, long long *partId, double pert,
+                       enum velocity_types vel) {
+  const size_t count = n * n * n;
+  const double volume = size * size * size;
+  struct cell *cell = malloc(sizeof(struct cell));
+  bzero(cell, sizeof(struct cell));
+
+  if (posix_memalign((void **)&cell->parts, part_align,
+                     count * sizeof(struct part)) != 0) {
+    error("couldn't allocate particles, no. of particles: %d", (int)count);
+  }
+  bzero(cell->parts, count * sizeof(struct part));
+
+  float h_max = 0.f;
+
+  /* Construct the parts */
+  struct part *part = cell->parts;
+  for (size_t x = 0; x < n; ++x) {
+    for (size_t y = 0; y < n; ++y) {
+      for (size_t z = 0; z < n; ++z) {
+        part->x[0] =
+            offset[0] +
+            size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[1] =
+            offset[1] +
+            size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[2] =
+            offset[2] +
+            size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        switch (vel) {
+          case velocity_zero:
+            part->v[0] = 0.f;
+            part->v[1] = 0.f;
+            part->v[2] = 0.f;
+            break;
+          case velocity_random:
+            part->v[0] = random_uniform(-0.05, 0.05);
+            part->v[1] = random_uniform(-0.05, 0.05);
+            part->v[2] = random_uniform(-0.05, 0.05);
+            break;
+          case velocity_divergent:
+            part->v[0] = part->x[0] - 1.5 * size;
+            part->v[1] = part->x[1] - 1.5 * size;
+            part->v[2] = part->x[2] - 1.5 * size;
+            break;
+          case velocity_rotating:
+            part->v[0] = part->x[1];
+            part->v[1] = -part->x[0];
+            part->v[2] = 0.f;
+            break;
+        }
+        part->h = size * h / (float)n;
+        h_max = fmax(h_max, part->h);
+        part->id = ++(*partId);
+
+#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
+        part->conserved.mass = density * volume / count;
+
+#ifdef SHADOWFAX_SPH
+        double anchor[3] = {0., 0., 0.};
+        double side[3] = {1., 1., 1.};
+        voronoi_cell_init(&part->cell, part->x, anchor, side);
+#endif
+
+#else
+        part->mass = density * volume / count;
+#endif
+
+#if defined(HOPKINS_PE_SPH)
+        part->entropy = 1.f;
+        part->entropy_one_over_gamma = 1.f;
+#endif
+
+        part->time_bin = 1;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        part->ti_drift = 8;
+        part->ti_kick = 8;
+#endif
+
+        ++part;
+      }
+    }
+  }
+
+  /* Cell properties */
+  cell->split = 0;
+  cell->h_max = h_max;
+  cell->count = count;
+  cell->dx_max_part = 0.;
+  cell->dx_max_sort = 0.;
+  cell->width[0] = size;
+  cell->width[1] = size;
+  cell->width[2] = size;
+  cell->loc[0] = offset[0];
+  cell->loc[1] = offset[1];
+  cell->loc[2] = offset[2];
+
+  cell->ti_old_part = 8;
+  cell->ti_end_min = 8;
+  cell->ti_end_max = 8;
+
+  shuffle_particles(cell->parts, cell->count);
+
+  cell->sorted = 0;
+  for (int k = 0; k < 13; k++) cell->sort[k] = NULL;
+
+  return cell;
+}
+
+void clean_up(struct cell *ci) {
+  free(ci->parts);
+  for (int k = 0; k < 13; k++)
+    if (ci->sort[k] != NULL) free(ci->sort[k]);
+  free(ci);
+}
+
+/**
+ * @brief Initializes all particles field to be ready for a density calculation
+ */
+void zero_particle_fields(struct cell *c) {
+  for (int pid = 0; pid < c->count; pid++) {
+    hydro_init_part(&c->parts[pid], NULL);
+  }
+}
+
+/**
+ * @brief Ends the loop by adding the appropriate coefficients
+ */
+void end_calculation(struct cell *c) {
+  for (int pid = 0; pid < c->count; pid++) {
+    hydro_end_density(&c->parts[pid]);
+  }
+}
+
+/**
+ * @brief Dump all the particles to a file
+ */
+void dump_particle_fields(char *fileName, struct cell *main_cell, int i, int j,
+                          int k) {
+  FILE *file = fopen(fileName, "a");
+
+  /* Write header */
+  fprintf(file,
+          "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s "
+          "%13s %13s %13s\n",
+          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh",
+          "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz");
+
+  fprintf(file, "# Centre cell at (i,j,k)=(%d, %d, %d) ---------------------\n",
+          i, j, k);
+
+  /* Write main cell */
+  for (int pid = 0; pid < main_cell->count; pid++) {
+    fprintf(file,
+            "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
+            "%13e %13e %13e\n",
+            main_cell->parts[pid].id, main_cell->parts[pid].x[0],
+            main_cell->parts[pid].x[1], main_cell->parts[pid].x[2],
+            main_cell->parts[pid].v[0], main_cell->parts[pid].v[1],
+            main_cell->parts[pid].v[2],
+            hydro_get_density(&main_cell->parts[pid]),
+#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
+            0.f,
+#else
+            main_cell->parts[pid].density.rho_dh,
+#endif
+            main_cell->parts[pid].density.wcount,
+            main_cell->parts[pid].density.wcount_dh,
+#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH)
+            main_cell->parts[pid].density.div_v,
+            main_cell->parts[pid].density.rot_v[0],
+            main_cell->parts[pid].density.rot_v[1],
+            main_cell->parts[pid].density.rot_v[2]
+#else
+            0., 0., 0., 0.
+#endif
+            );
+  }
+  fclose(file);
+}
+
+/**
+ * @brief Compares the vectorised result against
+ * the serial result of the interaction.
+ *
+ * @param serial_parts Particle array that has been interacted serially
+ * @param vec_parts Particle array to be interacted using vectors
+ * @param count No. of particles that have been interacted
+ * @param threshold Level of accuracy needed
+ *
+ * @return Non-zero value if difference found, 0 otherwise
+ */
+int check_results(struct part *serial_parts, struct part *vec_parts, int count,
+                  double threshold) {
+  int result = 0;
+
+  for (int i = 0; i < count; i++)
+    result += compare_particles(serial_parts[i], vec_parts[i], threshold);
+
+  return result;
+}
+
+/* Just a forward declaration... */
+void runner_doself1_density(struct runner *r, struct cell *ci);
+void runner_doself1_density_vec(struct runner *r, struct cell *ci);
+void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
+                                   struct cell *cj);
+
+void test_boundary_conditions(struct cell **cells, struct runner runner,
+                              const int loc_i, const int loc_j, const int loc_k,
+                              const int dim, char *swiftOutputFileName,
+                              char *bruteForceOutputFileName) {
+
+  /* Store the main cell for future use */
+  struct cell *main_cell = cells[loc_i * (dim * dim) + loc_j * dim + loc_k];
+
+  /* Zero the fields */
+  for (int j = 0; j < 512; ++j) zero_particle_fields(cells[j]);
+
+/* Run all the pairs */
+#if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION))
+
+#ifdef WITH_VECTORIZATION
+  runner.ci_cache.count = 0;
+  cache_init(&runner.ci_cache, 512);
+  runner.cj_cache.count = 0;
+  cache_init(&runner.cj_cache, 512);
+#endif
+
+  /* Now loop over all the neighbours of this cell
+   * and perform the pair interactions. */
+  for (int ii = -1; ii < 2; ii++) {
+    int iii = loc_i + ii;
+    iii = (iii + dim) % dim;
+    for (int jj = -1; jj < 2; jj++) {
+      int jjj = loc_j + jj;
+      jjj = (jjj + dim) % dim;
+      for (int kk = -1; kk < 2; kk++) {
+        int kkk = loc_k + kk;
+        kkk = (kkk + dim) % dim;
+
+        /* Get the neighbouring cell */
+        struct cell *cj = cells[iii * (dim * dim) + jjj * dim + kkk];
+
+        if (cj != main_cell) DOPAIR1(&runner, main_cell, cj);
+      }
+    }
+  }
+
+  /* And now the self-interaction */
+
+  DOSELF1(&runner, main_cell);
+
+#endif
+
+  /* Let's get physical ! */
+  end_calculation(main_cell);
+
+  /* Dump particles from the main cell. */
+  dump_particle_fields(swiftOutputFileName, main_cell, loc_i, loc_j, loc_k);
+
+  /* Now perform a brute-force version for accuracy tests */
+
+  /* Zero the fields */
+  for (int i = 0; i < 512; ++i) zero_particle_fields(cells[i]);
+
+#if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION))
+
+  /* Now loop over all the neighbours of this cell
+   * and perform the pair interactions. */
+  for (int ii = -1; ii < 2; ii++) {
+    int iii = loc_i + ii;
+    iii = (iii + dim) % dim;
+    for (int jj = -1; jj < 2; jj++) {
+      int jjj = loc_j + jj;
+      jjj = (jjj + dim) % dim;
+      for (int kk = -1; kk < 2; kk++) {
+        int kkk = loc_k + kk;
+        kkk = (kkk + dim) % dim;
+
+        /* Get the neighbouring cell */
+        struct cell *cj = cells[iii * (dim * dim) + jjj * dim + kkk];
+
+        if (cj != main_cell) pairs_all_density(&runner, main_cell, cj);
+      }
+    }
+  }
+
+  /* And now the self-interaction */
+  self_all_density(&runner, main_cell);
+
+#endif
+
+  /* Let's get physical ! */
+  end_calculation(main_cell);
+
+  /* Dump */
+  dump_particle_fields(bruteForceOutputFileName, main_cell, loc_i, loc_j,
+                       loc_k);
+}
+
+/* And go... */
+int main(int argc, char *argv[]) {
+
+  engine_pin();
+  size_t runs = 0, particles = 0;
+  double h = 1.23485, size = 1., rho = 1.;
+  double perturbation = 0.;
+  double threshold = ACC_THRESHOLD;
+  char outputFileNameExtension[200] = "";
+  char swiftOutputFileName[200] = "";
+  char bruteForceOutputFileName[200] = "";
+  enum velocity_types vel = velocity_zero;
+
+  /* Initialize CPU frequency, this also starts time. */
+  unsigned long long cpufreq = 0;
+  clocks_set_cpufreq(cpufreq);
+
+  /* Choke on FP-exceptions */
+  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  /* Get some randomness going */
+  srand(0);
+
+  char c;
+  while ((c = getopt(argc, argv, "m:s:h:n:r:t:d:f:v:a:")) != -1) {
+    switch (c) {
+      case 'h':
+        sscanf(optarg, "%lf", &h);
+        break;
+      case 's':
+        sscanf(optarg, "%lf", &size);
+        break;
+      case 'n':
+        sscanf(optarg, "%zu", &particles);
+        break;
+      case 'r':
+        sscanf(optarg, "%zu", &runs);
+        break;
+      case 'd':
+        sscanf(optarg, "%lf", &perturbation);
+        break;
+      case 'm':
+        sscanf(optarg, "%lf", &rho);
+        break;
+      case 'f':
+        strcpy(outputFileNameExtension, optarg);
+        break;
+      case 'v':
+        sscanf(optarg, "%d", (int *)&vel);
+        break;
+      case 'a':
+        sscanf(optarg, "%lf", &threshold);
+        break;
+      case '?':
+        error("Unknown option.");
+        break;
+    }
+  }
+
+  if (h < 0 || particles == 0 || runs == 0) {
+    printf(
+        "\nUsage: %s -n PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n"
+        "\nGenerates a cube of 512 cells, filled with particles on a Cartesian grid."
+        "\nThese are then interacted using runner_dopair1_density() and "
+        "runner_doself1_density()."
+        "\n\nOptions:"
+        "\n-h DISTANCE=1.23485 - Smoothing length in units of <x>"
+        "\n-m rho             - Physical density in the cell"
+        "\n-s size            - Physical size of the cell"
+        "\n-d pert            - Perturbation to apply to the particles [0,1["
+        "\n-v type (0,1,2,3)  - Velocity field: (zero, random, divergent, "
+        "rotating)"
+        "\n-f fileName        - Part of the file name used to save the dumps\n",
+        argv[0]);
+    exit(1);
+  }
+
+  /* Help users... */
+  message("DOSELF1 function called: %s", DOSELF1_NAME);
+  message("DOPAIR1 function called: %s", DOPAIR1_NAME);
+  message("Vector size: %d", VEC_SIZE);
+  message("Adiabatic index: ga = %f", hydro_gamma);
+  message("Hydro implementation: %s", SPH_IMPLEMENTATION);
+  message("Smoothing length: h = %f", h * size);
+  message("Kernel:               %s", kernel_name);
+  message("Neighbour target: N = %f", pow_dimension(h) * kernel_norm);
+  message("Density target: rho = %f", rho);
+  message("div_v target:   div = %f", vel == 2 ? 3.f : 0.f);
+  message("curl_v target: curl = [0., 0., %f]", vel == 3 ? -2.f : 0.f);
+
+  printf("\n");
+
+  /* Build the infrastructure */
+  struct space space;
+  space.periodic = 1;
+  space.dim[0] = 8.;
+  space.dim[1] = 8.;
+  space.dim[2] = 8.;
+
+  struct hydro_props hp;
+  hp.h_max = FLT_MAX;
+
+  struct engine engine;
+  engine.s = &space;
+  engine.time = 0.1f;
+  engine.ti_current = 8;
+  engine.max_active_bin = num_time_bins;
+  engine.hydro_properties = &hp;
+
+  struct runner runner;
+  runner.e = &engine;
+
+  /* Construct some cells */
+  struct cell *cells[512];
+  const int dim = 8;
+  static long long partId = 0;
+  for (int i = 0; i < dim; ++i) {
+    for (int j = 0; j < dim; ++j) {
+      for (int k = 0; k < dim; ++k) {
+        double offset[3] = {i * size, j * size, k * size};
+        cells[i * (dim * dim) + j * dim + k] = make_cell(
+            particles, offset, size, h, rho, &partId, perturbation, vel);
+
+        runner_do_drift_part(&runner, cells[i * (dim * dim) + j * dim + k], 0);
+
+        runner_do_sort(&runner, cells[i * (dim * dim) + j * dim + k], 0x1FFF, 0,
+                       0);
+      }
+    }
+  }
+
+  /* Create output file names. */
+  sprintf(swiftOutputFileName, "swift_periodic_BC_%s.dat",
+          outputFileNameExtension);
+  sprintf(bruteForceOutputFileName, "brute_force_periodic_BC_%s.dat",
+          outputFileNameExtension);
+
+  /* Delete files if they already exist. */
+  remove(swiftOutputFileName);
+  remove(bruteForceOutputFileName);
+
+  const int half_dim = (dim - 1) / 2;
+
+  /* Test the periodic boundary conditions for each of the 8 corners. Interact
+   * each corner with all of its 26 neighbours.*/
+  test_boundary_conditions(cells, runner, 0, 0, 0, dim, swiftOutputFileName,
+                           bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, dim - 1, 0, 0, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, dim - 1, 0, dim - 1, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, 0, 0, dim - 1, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, 0, dim - 1, 0, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, dim - 1, dim - 1, 0, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, 0, dim - 1, dim - 1, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, dim - 1, dim - 1, dim - 1, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+
+  /* Test the boundary conditions for cells at the centre of each face of the
+   * box. */
+  test_boundary_conditions(cells, runner, half_dim, half_dim, 0, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, dim - 1, half_dim, half_dim, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, half_dim, half_dim, dim - 1, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, 0, half_dim, half_dim, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, half_dim, 0, half_dim, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, half_dim, dim - 1, half_dim, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+
+  /* Test the boundary conditions for cells at the centre of each edge of the
+   * box. */
+  test_boundary_conditions(cells, runner, half_dim, dim - 1, 0, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, dim - 1, dim - 1, half_dim, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, half_dim, dim - 1, dim - 1, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, 0, dim - 1, half_dim, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+
+  test_boundary_conditions(cells, runner, 0, half_dim, 0, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, dim - 1, half_dim, 0, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, dim - 1, half_dim, dim - 1, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, 0, half_dim, dim - 1, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+
+  test_boundary_conditions(cells, runner, half_dim, 0, 0, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, dim - 1, 0, half_dim, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, half_dim, 0, dim - 1, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+  test_boundary_conditions(cells, runner, 0, 0, half_dim, dim,
+                           swiftOutputFileName, bruteForceOutputFileName);
+
+  /* Clean things to make the sanitizer happy ... */
+  for (int i = 0; i < 512; ++i) clean_up(cells[i]);
+
+  return 0;
+}
diff --git a/tests/testPeriodicBC.sh.in b/tests/testPeriodicBC.sh.in
new file mode 100755
index 0000000000000000000000000000000000000000..075acc0b68686bd2f418cf457140b3d6b93093d5
--- /dev/null
+++ b/tests/testPeriodicBC.sh.in
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+for v in {0..3}
+do
+    echo ""
+	
+    rm -f brute_force_periodic_BC_standard.dat swift_periodic_BC_standard.dat
+
+    echo "Running ./testPeriodicBC -n 6 -r 1 -d 0 -f standard -v $v"
+    ./testPeriodicBC -n 6 -r 1 -d 0 -f standard -v $v
+
+    if [ -e brute_force_periodic_BC_standard.dat ]
+    then
+	if python @srcdir@/difffloat.py brute_force_periodic_BC_standard.dat swift_periodic_BC_standard.dat @srcdir@/tolerance_periodic_BC_normal.dat
+  then
+	    echo "Accuracy test passed"
+	else
+	    echo "Accuracy test failed"
+	    exit 1
+	fi
+    else
+	echo "Error: Missing test output file"
+	exit 1
+    fi
+
+    echo "------------"
+    
+done
+
+exit $?
diff --git a/tests/testPeriodicBCPerturbed.sh.in b/tests/testPeriodicBCPerturbed.sh.in
new file mode 100755
index 0000000000000000000000000000000000000000..ac190d5a80654154dcd329e69c1c9cc9fe45833a
--- /dev/null
+++ b/tests/testPeriodicBCPerturbed.sh.in
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+for v in {0..3}
+do
+    echo ""
+	
+    rm -f brute_force_periodic_BC_perturbed.dat swift_periodic_BC_perturbed.dat
+
+    echo "Running ./testPeriodicBC -n 6 -r 1 -d 0.1 -f perturbed -v $v"
+    ./testPeriodicBC -n 6 -r 1 -d 0.1 -f perturbed -v $v
+
+    if [ -e brute_force_periodic_BC_perturbed.dat ]
+    then
+	if python @srcdir@/difffloat.py brute_force_periodic_BC_perturbed.dat swift_periodic_BC_perturbed.dat @srcdir@/tolerance_periodic_BC_perturbed.dat
+	then
+	    echo "Accuracy test passed"
+	else
+	    echo "Accuracy test failed"
+	    exit 1
+	fi
+    else
+	echo "Error: Missing test output file"
+	exit 1
+    fi
+
+    echo "------------"
+    
+done
+
+exit $?
diff --git a/tests/testSPHStep.c b/tests/testSPHStep.c
index 014dacd1eb62040b03e6038b2c23183a24ec4850..e890c7c1a834ec7ca13ed2e8a509b7ea42db28fd 100644
--- a/tests/testSPHStep.c
+++ b/tests/testSPHStep.c
@@ -81,8 +81,7 @@ struct cell *make_cell(size_t N, float cellSize, int offset[3], int id_offset) {
   cell->ti_end_max = 1;
 
   cell->sorted = 0;
-  cell->sort = NULL;
-  cell->sortsize = 0;
+  for (int k = 0; k < 13; k++) cell->sort[k] = NULL;
 
   return cell;
 }
@@ -212,7 +211,8 @@ int main() {
   for (int j = 0; j < 27; ++j) {
     free(cells[j]->parts);
     free(cells[j]->xparts);
-    free(cells[j]->sort);
+    for (int k = 0; k < 13; k++)
+      if (cells[j]->sort[k] != NULL) free(cells[j]->sort[k]);
     free(cells[j]);
   }
 
diff --git a/tests/testThreadpool.c b/tests/testThreadpool.c
index aa65d533a29afbe4e7e8384fb887281822a31e58..6b39991e1620fa90cfea0b7103d6e3e2ce4ed286 100644
--- a/tests/testThreadpool.c
+++ b/tests/testThreadpool.c
@@ -17,6 +17,8 @@
  *
  ******************************************************************************/
 
+#include "../config.h"
+
 // Standard includes.
 #include <stdio.h>
 #include <stdlib.h>
@@ -31,7 +33,7 @@ void map_function_first(void *map_data, int num_elements, void *extra_data) {
   for (int ind = 0; ind < num_elements; ind++) {
     int input = inputs[ind];
     usleep(rand() % 1000000);
-    printf("map_function_first: got input %i.\n", input);
+    printf("   map_function_first: got input %i.\n", input);
     fflush(stdout);
   }
 }
@@ -41,7 +43,7 @@ void map_function_second(void *map_data, int num_elements, void *extra_data) {
   for (int ind = 0; ind < num_elements; ind++) {
     int input = inputs[ind];
     usleep(rand() % 1000000);
-    printf("map_function_second: got input %i.\n", input);
+    printf("   map_function_second: got input %i.\n", input);
     fflush(stdout);
   }
 }
@@ -49,37 +51,49 @@ void map_function_second(void *map_data, int num_elements, void *extra_data) {
 int main(int argc, char *argv[]) {
 
   // Some constants for this test.
-  const int num_threads = 16;
   const int N = 20;
   const int num_runs = 2;
 
-  // Create a threadpool with 8 threads.
-  struct threadpool tp;
-  threadpool_init(&tp, num_threads);
+  // Create threadpools with different numbers of threads.
+  for (int num_thread = 1; num_thread <= 16; num_thread *= 4) {
+    printf("# Creating threadpool with %d threads\n", num_thread);
+    struct threadpool tp;
+    threadpool_init(&tp, num_thread);
 
-  // Main loop.
-  for (int run = 0; run < num_runs; run++) {
+    // Main loop.
+    for (int run = 0; run < num_runs; run++) {
 
-    // Run over a set of integers and print them.
-    int data[N];
-    for (int k = 0; k < N; k++) data[k] = k;
-    printf("processing integers from 0..%i.\n", N);
-    fflush(stdout);
-    threadpool_map(&tp, map_function_first, data, N, sizeof(int), 1, NULL);
+      // Run over a set of integers and print them.
+      int data[N];
+      for (int k = 0; k < N; k++) data[k] = k;
+      printf("1..processing integers from 0..%i.\n", N);
+      fflush(stdout);
+      threadpool_map(&tp, map_function_first, data, N, sizeof(int), 1, NULL);
 
-    // Do the same thing again, with less jobs than threads.
-    printf("processing integers from 0..%i.\n", N / 2);
-    fflush(stdout);
-    threadpool_map(&tp, map_function_second, data, N / 2, sizeof(int), 1, NULL);
+      // Do the same thing again, with less jobs than threads.
+      printf("2..processing integers from 0..%i.\n", N / 2);
+      fflush(stdout);
+      threadpool_map(&tp, map_function_second, data, N / 2, sizeof(int), 1,
+                     NULL);
 
-    // Do the same thing again, with a chunk size of two.
-    printf("processing integers from 0..%i.\n", N);
-    fflush(stdout);
-    threadpool_map(&tp, map_function_first, data, N, sizeof(int), 2, NULL);
-  }
+      // Do the same thing again, with a chunk size of two.
+      printf("3..processing integers from 0..%i.\n", N);
+      fflush(stdout);
+      threadpool_map(&tp, map_function_first, data, N, sizeof(int), 2, NULL);
+    }
+
+/* If logging was enabled, dump the log. */
+#ifdef SWIFT_DEBUG_THREADPOOL
+    char filename[80];
+    sprintf(filename, "threadpool_log-%d.txt", num_thread);
+    printf("# Dumping log\n");
+    threadpool_dump_log(&tp, filename, 1);
+#endif
 
-  /* Be clean */
-  threadpool_clean(&tp);
+    /* Be clean */
+    threadpool_clean(&tp);
+    printf("\n");
+  }
 
   return 0;
 }
diff --git a/tests/tolerance_125_normal.dat b/tests/tolerance_125_normal.dat
index c9ad23d4472c46e64e8418e46c5fe71f813b23b5..0f11d03507b23c76b5703e118eede1359fe2afba 100644
--- a/tests/tolerance_125_normal.dat
+++ b/tests/tolerance_125_normal.dat
@@ -1,3 +1,4 @@
 #   ID    pos_x    pos_y    pos_z      v_x      v_y      v_z        h      rho    div_v        S        u        P        c      a_x      a_y      a_z     h_dt    v_sig    dS/dt    du/dt
     0	  1e-4	   1e-4	    1e-4       1e-4	1e-4	 1e-4	    1e-4   1e-4	  1e-4	       1e-4	1e-4	 1e-4	  1e-4	 1e-4	  1e-4	   1e-4	   1e-4	   1e-4	    1e-4     1e-4
     0	  1e-4	   1e-4	    1e-4       1e-4	1e-4	 1e-4	    1e-4   1e-4	  1e-4	       1e-4	1e-4	 1e-4	  1e-4	 1e-4	  1e-4	   1e-4	   1e-4	   1e-4	    1e-4     1e-4
+    0	  1e-6	   1e-6	    1e-6       1e-6	1e-6	 1e-6	    1e-6   1e-6	  1e-6	       1e-6	1e-6	 1e-6	  1e-6	 1e-5	  1e-5	   1e-5	   1e-5	   1e-5	    1e-5     1e-5
diff --git a/tests/tolerance_125_perturbed.dat b/tests/tolerance_125_perturbed.dat
index 04e642b28cb3729cb81f8183c3e69595ac651876..349f68c1ad6393ba2ffba675126edc3de11a487e 100644
--- a/tests/tolerance_125_perturbed.dat
+++ b/tests/tolerance_125_perturbed.dat
@@ -1,3 +1,4 @@
 #   ID    pos_x    pos_y    pos_z      v_x      v_y      v_z        h      rho    div_v        S        u        P        c      a_x      a_y      a_z     h_dt    v_sig    dS/dt    du/dt
     0	  1e-4	   1e-4	    1e-4       1e-4	1e-4	 1e-4	    1e-4   1e-4	  1e-4	       1e-4	1e-4	 1e-4	  1e-4	 1e-4	  1e-4	   1e-4	   1e-4	   1e-4	    1e-4     1e-4
-    0	  1e-4	   1e-4	    1e-4       1e-4	1e-4	 1e-4	    1e-4   1e-4	  1e-4	       1e-4	1e-4	 1e-4	  1e-4	 5e-3	  5e-3	   5e-3	   1e-4	   1e-4	    1e-4     1e-4
+    0	  1e-4	   1e-4	    1e-4       1e-4	1e-4	 1e-4	    1e-4   1e-4	  1e-4	       1e-4	1e-4	 1e-4	  1e-4	 2e-3	  2e-3	   2e-3	   1e-4	   1e-4	    1e-4     1e-4
+    0	  1e-6	   1e-6	    1e-6       1e-6	1e-6	 1e-6	    1e-6   1e-6	  1e-6	       1e-6	1e-6	 1e-6	  1e-6	 2e-4	  2e-4	   2e-4	   1e-6	   1e-6	    1e-6     1e-6
diff --git a/tests/tolerance_27_normal.dat b/tests/tolerance_27_normal.dat
index 31ee002bb9c73ff8d74cce545aff715476b33507..0fe55e84a42e7541068744e1e554afff1731ed3f 100644
--- a/tests/tolerance_27_normal.dat
+++ b/tests/tolerance_27_normal.dat
@@ -1,3 +1,4 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   2e-6	      4e-5	    2e-4       2e-3		 1e-5	     6e-6	   6e-6		 6e-6
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1.2e-4	    1e-4       1e-4		 2e-4	     1e-4	   1e-4	 	 1e-4
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   2e-6	      4e-5	    4e-4       1e-2		 1e-5	     6e-6	   6e-6		 6e-6
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1.2e-4	    1e-4       2e-4		 2e-4	     1e-4	   1e-4	 	 1e-4
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1e-6	    1e-6       1e-6		 1e-6	     1e-6	   1e-6		 1e-6
diff --git a/tests/tolerance_27_perturbed.dat b/tests/tolerance_27_perturbed.dat
index 9c6ee8c77cc6d53e67db9dbb86be197d49149b10..aa86962b733e2da73211bceeb30b2345af808bb5 100644
--- a/tests/tolerance_27_perturbed.dat
+++ b/tests/tolerance_27_perturbed.dat
@@ -1,3 +1,4 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1.2e-6     1e-4	    5e-5       2e-3		 4e-6	     3e-6	   3e-6		 3e-6
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      2e-3	    1e-5       1e-4		 4e-5	     2e-3	   2e-3	 	 2e-3
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   2e-6       1e-4	    2e-4       1e-2		 1e-5	     3e-6	   3e-6		 7e-6
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      2e-3	    1e-5       2e-3		 6e-5	     2e-3	   2e-3	 	 2e-3
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      4e-4	    1e-6       1e0		 1e-6	     2e-6	   2e-6		 2e-6
diff --git a/tests/tolerance_27_perturbed_h.dat b/tests/tolerance_27_perturbed_h.dat
new file mode 100644
index 0000000000000000000000000000000000000000..5142c2a2090e15381a19b2bc71e253a482973b11
--- /dev/null
+++ b/tests/tolerance_27_perturbed_h.dat
@@ -0,0 +1,4 @@
+#   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         2.4e-6       1e-4	        5e-4       1.2e-2		         1e-5	       3e-6	         3e-6		       8e-6
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1.2e-6	      1.4e-2	      1e-5       2e-3		           2.5e-4	     3e-3	         3e-3	 	       3e-3
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1e-6	      1e-6	        1e-6       1e0		           1e-6	       4e-6	         4e-6		       4e-6
diff --git a/tests/tolerance_27_perturbed_h2.dat b/tests/tolerance_27_perturbed_h2.dat
new file mode 100644
index 0000000000000000000000000000000000000000..23f6a5006124f6233aebd111005760a5dcc5b6a3
--- /dev/null
+++ b/tests/tolerance_27_perturbed_h2.dat
@@ -0,0 +1,4 @@
+#   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         3e-6       1e-4	        5e-4       1.5e-2		         1.4e-5	       3e-6	         3e-6		       9e-6
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1.5e-6	    1.57e-2	      1e-5       4.74e-3		       3.89e-4	     3e-3	         3e-3	 	       3e-3
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1e-6	      1e-6	        1e-6       1e0		           1e-6	         4e-6	         4e-6		       4e-6
diff --git a/tests/tolerance_pair_active.dat b/tests/tolerance_pair_active.dat
new file mode 100644
index 0000000000000000000000000000000000000000..b07697a686eb7801326ceaf77cf93fb3a1491c2e
--- /dev/null
+++ b/tests/tolerance_pair_active.dat
@@ -0,0 +1,4 @@
+#   ID     wcount
+    0	     1e-2
+    0	     1e-2
+    0	     1e-2
diff --git a/tests/tolerance_periodic_BC_normal.dat b/tests/tolerance_periodic_BC_normal.dat
new file mode 100644
index 0000000000000000000000000000000000000000..823e4af488b343f57e3c90e89ee2d4f13d3ca94b
--- /dev/null
+++ b/tests/tolerance_periodic_BC_normal.dat
@@ -0,0 +1,4 @@
+#   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   4e-6	      4e-5	    1e-3       1e-2		 2e-4	     2e-4	   2e-4		 2e-4
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   2e-6	      2e-4	    1e-4       2e-4		 6e-4	     2e-3	   2e-3	 	 2e-3
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1e-4	    1e-6       1e-4		 5e-4	     2e-4	   2e-4	 	 2e-4
diff --git a/tests/tolerance_pair_normal.dat b/tests/tolerance_periodic_BC_perturbed.dat
similarity index 53%
rename from tests/tolerance_pair_normal.dat
rename to tests/tolerance_periodic_BC_perturbed.dat
index f5031c5f47dfa203300ebcc9a47fbac42f854d26..df5ee6458ba05eed08006586514467fcdb715990 100644
--- a/tests/tolerance_pair_normal.dat
+++ b/tests/tolerance_periodic_BC_perturbed.dat
@@ -1,3 +1,4 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      1e-5	    2e-5       3e-2		 1e-5	     1e-5	   1e-5		 1e-5
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      1.2e-5	    1e-5       1e-2		 1e-4	     1e-4	   1e-4		 1e-4
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   3e-6	      4e-5	    1e-3       1e-2		 2e-4	     1e-4	   1e-4		 1e-4
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   2e-6	      6e-3	    1e-4       3e-3		 1e-2	     6e-3	   6e-3	 	 6e-3
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      2e-3	    1e-6       1e0		 5e-4	     3e-3	   3e-3 	 3e-3
diff --git a/tests/tolerance_pair_perturbed.dat b/tests/tolerance_testInteractions.dat
similarity index 52%
rename from tests/tolerance_pair_perturbed.dat
rename to tests/tolerance_testInteractions.dat
index ca58ff45995158e031eca6b60eec498aa6c627ef..ebb376bf26bfdc0fb2107ab720bbf9eca5a35bce 100644
--- a/tests/tolerance_pair_perturbed.dat
+++ b/tests/tolerance_testInteractions.dat
@@ -1,3 +1,4 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      1e-5	    2e-5       3e-2		 1e-5	     1e-5	   1e-5		 1e-5
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-3	      4e-4	    8e-3       2e-2		 1e-4	     1.6e-4	   1.6e-4	 1.6e-4
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      4e-5	    4e-4       1e-2		 1e-5	     6e-6	   6e-6		 6e-6
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1.2e-4	  1e-4       2e-4		 2e-4	     1e-4	   1e-4	 	 1e-4
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1e-6	    1e-6       1e-6		 1e-6	     1e-6	   1e-6		 1e-6
diff --git a/theory/Multipoles/bibliography.bib b/theory/Multipoles/bibliography.bib
index 12e274dd63093ba1e14750249f2538c092e5268a..193db42ea4947e49930b79cbd663562d971ec2d4 100644
--- a/theory/Multipoles/bibliography.bib
+++ b/theory/Multipoles/bibliography.bib
@@ -96,3 +96,69 @@ doi="10.1007/BF02123482",
 url="http://dx.doi.org/10.1007/BF02123482"
 }
 
+
+
+@article{Greengard1987,
+title = "A fast algorithm for particle simulations",
+journal = "Journal of Computational Physics",
+volume = "73",
+number = "2",
+pages = "325 - 348",
+year = "1987",
+note = "",
+issn = "0021-9991",
+doi = "http://dx.doi.org/10.1016/0021-9991(87)90140-9",
+url = "http://www.sciencedirect.com/science/article/pii/0021999187901409",
+author = "L Greengard and V Rokhlin",
+}
+
+@article{Cheng1999,
+title = "A Fast Adaptive Multipole Algorithm in Three Dimensions",
+journal = "Journal of Computational Physics",
+volume = "155",
+number = "2",
+pages = "468 - 498",
+year = "1999",
+note = "",
+issn = "0021-9991",
+doi = "http://dx.doi.org/10.1006/jcph.1999.6355",
+url = "http://www.sciencedirect.com/science/article/pii/S0021999199963556",
+author = "H. Cheng and L. Greengard and V. Rokhlin",
+keywords = "Laplace equation",
+keywords = "translation operators",
+keywords = "fast multipole method",
+keywords = "adaptive algorithms"
+}
+
+
+@ARTICLE{Dehnen2000,
+   author = {{Dehnen}, W.},
+    title = "{A Very Fast and Momentum-conserving Tree Code}",
+  journal = {\apjl},
+   eprint = {astro-ph/0003209},
+ keywords = {Celestial Mechanics, Stellar Dynamics, Methods: n-Body Simulations, Methods: Numerical},
+     year = 2000,
+    month = jun,
+   volume = 536,
+    pages = {L39-L42},
+      doi = {10.1086/312724},
+   adsurl = {http://adsabs.harvard.edu/abs/2000ApJ...536L..39D},
+  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+}
+
+@ARTICLE{Dehnen2002,
+   author = {{Dehnen}, W.},
+    title = "{A Hierarchical $\mathcal{O}(N)$ Force Calculation Algorithm}",
+  journal = {Journal of Computational Physics},
+   eprint = {astro-ph/0202512},
+     year = 2002,
+    month = jun,
+   volume = 179,
+    pages = {27-42},
+      doi = {10.1006/jcph.2002.7026},
+   adsurl = {http://adsabs.harvard.edu/abs/2002JCoPh.179...27D},
+  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+}
+
+
+
diff --git a/theory/Multipoles/cells.odg b/theory/Multipoles/cells.odg
new file mode 100644
index 0000000000000000000000000000000000000000..ada8fd7a1a6e746fca93f2b1ed04b78a6b7f9097
Binary files /dev/null and b/theory/Multipoles/cells.odg differ
diff --git a/theory/Multipoles/cells.pdf b/theory/Multipoles/cells.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..d621f6f1023d71503f698b69694d980ef27814e6
Binary files /dev/null and b/theory/Multipoles/cells.pdf differ
diff --git a/theory/Multipoles/fmm_standalone.tex b/theory/Multipoles/fmm_standalone.tex
index fcd727a89abe95bba69b23c58ce5067c8cc53211..dc4266a23110873ff38ccbec4d71345e2780d6b2 100644
--- a/theory/Multipoles/fmm_standalone.tex
+++ b/theory/Multipoles/fmm_standalone.tex
@@ -4,27 +4,45 @@
 \usepackage{times}
 
 \newcommand{\swift}{{\sc Swift}\xspace}
+\newcommand{\nbody}{$N$-body\xspace}
 
 %opening
 \title{FMM in SWIFT}
 \author{Matthieu Schaller}
-
 \begin{document}
 
+\date{\today}
+
+\pagerange{\pageref{firstpage}--\pageref{lastpage}} \pubyear{2014}
+
 \maketitle
 
-We use the multi-index notation of \cite{Dehnen2014} to simplify expressions.
+\label{firstpage}
+
+\begin{abstract}
+Making gravity great again.
+\end{abstract}
+
+\begin{keywords}
+\end{keywords}
+
+\section{Gravity in \swift}
+\label{sec:gravity}
 
 \input{potential_softening}
+\input{fmm_summary}
 \input{gravity_derivatives}
+\input{mesh_summary}
 
 \bibliographystyle{mnras}
 \bibliography{./bibliography.bib}
 
 \appendix
+\input{vector_notation}
 \onecolumn
 \input{potential_derivatives}
 
 
+\label{lastpage}
 
 \end{document}
diff --git a/theory/Multipoles/fmm_summary.tex b/theory/Multipoles/fmm_summary.tex
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9ab88ada6836d6118c7cfd74e39f4d1c504b3
--- /dev/null
+++ b/theory/Multipoles/fmm_summary.tex
@@ -0,0 +1,182 @@
+\subsection{Evaluating the forces using the Fast Multipole Method}
+\label{ssec:fmm_summary}
+
+The algorithmically challenging aspect of the \nbody problem is to
+evaluate for each particle in a system the potential and associated
+forces generated by all the other particles. Mathematically, this means
+evaluating
+\begin{equation}
+  \phi(\mathbf{x}_a) = \sum_{b \neq a} G m_b\varphi(\mathbf{x}_a -
+  \mathbf{x}_b)\qquad \forall~a\in N
+  \label{eq:fmm:n_body}
+\end{equation}
+efficiently for large numbers of particles $N$. In the case of collisionless
+dynamics, the particles are a mere Monte-Carlo sampling of the
+underlying coarse-grained phase-space distribution which justifies the
+use of approximate methods to evaluate Eq.~\ref{eq:fmm:n_body}. The
+\emph{Fast Multipole Method} (FMM) \citep{Greengard1987, Cheng1999},
+popularized in the field and adapted specifically for gravity solvers
+by \cite{Dehnen2000, Dehnen2002}, is an $\mathcal{O}(N)$ method
+designed to solve Eq.~\ref{eq:fmm:n_body} by expanding the potential both
+around $\mathbf{x}_a$ and $\mathbf{x}_b$ and grouping similar terms
+arising from nearby particles. \\
+
+In what follows, we use the compact multi-index notation of
+\cite{Dehnen2014} (repeated in appendix \ref{sec:multi_index_notation}
+for completeness) to simplify expressions and ease
+comparisons. $\mathbf{k}$, $\mathbf{m}$ and $\mathbf{n}$ are
+multi-indices and $\mathbf{r}$, $\mathbf{R}$, $\mathbf{x}$,
+$\mathbf{y}$ and $\mathbf{z}$ are vectors, whilst $a$ and $b$ are
+particle indices.\\
+
+\begin{figure}
+\includegraphics[width=\columnwidth]{cells.pdf}
+\caption{The basics of the FMM: The potential generated by a particle
+  at position $\mathbf{x}_b$ on a particle at location
+  $\mathbf{x}_a$ is replaced by a Taylor expansion of the potential
+  around the distance vector $\mathbf{R}$ linking the two centres of mass
+  ($\mathbf{z}_A$ and $\mathbf{z}_B$) of cell $A$ and $B$. The
+  expansion converges towards the exact expression provided
+  $|\mathbf{r}_a - \mathbf{r}_b|<|\mathbf{R}|$.}
+\label{fig:fmm:cells}
+\end{figure}
+
+
+For a single pair of particles $a$ and $b$ located in cell $A$ and $B$
+with centres of mass $\mathbf{z}_A$ and  $\mathbf{z}_B$
+respectively, as shown on Fig.~\ref{fig:fmm:cells}, the potential
+generated by $b$ at the location of $a$ can be rewritten as
+\begin{align}
+  \varphi(\mathbf{x}_a - \mathbf{x}_b)
+  &= \varphi\left(\mathbf{x}_a - \mathbf{z}_A - \mathbf{x}_b +
+  \mathbf{z}_B + \mathbf{z}_A - \mathbf{z}_B\right)  \nonumber \\
+  &= \varphi\left(\mathbf{r}_a - \mathbf{r}_b + \mathbf{R}\right)
+  \nonumber \\
+  &= \sum_\mathbf{k} \frac{1}{\mathbf{k}!} \left(\mathbf{r}_a -
+  \mathbf{r}_b\right)^{\mathbf{k}} \nabla^{\mathbf{k}}\varphi(\mathbf{R})
+  \nonumber \\
+  &= \sum_\mathbf{k} \frac{1}{\mathbf{k}!} \sum_{\mathbf{n} \leq
+    \mathbf{k}} \binom{\mathbf{k}}{\mathbf{n}} \mathbf{r}_a^{\mathbf{n}}
+  \left(-\mathbf{r}_b\right)^{\mathbf{k} - \mathbf{n}}
+  \nabla^{\mathbf{k}}\varphi(\mathbf{R})\nonumber \\
+  &= \sum_\mathbf{n} \frac{1}{\mathbf{n}!} \mathbf{r}_a^{\mathbf{n}}
+  \sum_\mathbf{m} \frac{1}{\mathbf{m}!}
+  \left(-\mathbf{r}_b\right)^\mathbf{m} \nabla^{\mathbf{n}+\mathbf{m}} \varphi(\mathbf{R}),
+\end{align}
+where we used the Taylor expansion of $\varphi$ around $\mathbf{R} \equiv
+\mathbf{z}_A - \mathbf{z}_B$ on the third line, used $\mathbf{r}_a
+\equiv \mathbf{x}_a - \mathbf{z}_A$, $\mathbf{r}_b \equiv \mathbf{x}_b
+- \mathbf{z}_B$ throughout and defined $\mathbf{m} \equiv
+\mathbf{k}-\mathbf{n}$ on the last line. Expanding the series only up
+to order $p$, we get
+\begin{equation}
+  \varphi(\mathbf{x}_a - \mathbf{x}_b) \approx \sum_{\mathbf{n}}^{p}
+  \frac{1}{\mathbf{n}!} \mathbf{r}_a^{\mathbf{n}} \sum_{\mathbf{m}}^{p
+    -|\mathbf{n}|} 
+  \frac{1}{\mathbf{m}!} \left(-\mathbf{r}_b\right)^\mathbf{m}
+  \nabla^{\mathbf{n}+\mathbf{m}} \varphi(\mathbf{R}),
+  \label{eq:fmm:fmm_one_part}
+\end{equation}
+with the approximation converging as $p\rightarrow\infty$ towards the
+correct value provided $|\mathbf{r}_a -
+\mathbf{r}_b| < |\mathbf{R}|$. If we now consider all the particles within $B$ and
+combine their contributions to the potential at location
+$\mathbf{x}_a$ in cell $A$, we get
+\begin{align}
+  \phi_{BA}(\mathbf{x}_a) &= \sum_{b\in B}G m_b\varphi(\mathbf{x}_a -
+  \mathbf{x}_b)  \label{eq:fmm:fmm_one_cell}  \\
+  &\approx G\sum_{\mathbf{n}}^{p}
+  \frac{1}{\mathbf{n}!} \mathbf{r}_a^{\mathbf{n}} \sum_{\mathbf{m}}
+    ^{p -|\mathbf{n}|}
+  \frac{1}{\mathbf{m}!} \sum_{b\in B} m_b\left(-\mathbf{r}_b\right)^\mathbf{m}
+  \nabla^{\mathbf{n}+\mathbf{m}} \varphi(\mathbf{R}) \nonumber. 
+\end{align}
+This last equation forms the basis of the FMM. The algorithm
+decomposes the equation into three separated sums evaluated at
+different stages.\\
+
+In a first step, multipoles are constructed from the
+innermost sum. For each cell, we compute all the terms
+\begin{equation}
+  \mathsf{M}_{\mathbf{m}}(\mathbf{z}_B) = \frac{1}{\mathbf{m}!}
+  \sum_{b\in B} m_b\left(-\mathbf{r}_b\right)^\mathbf{m} \label{eq:fmm:P2M} 
+\end{equation}
+up to order $p$. This is the first kernel of the method, commonly
+labelled as \textsc{P2M} (particle to multipole). In a second step, we
+compute the second kernel, \textsc{M2L} (multipole to local
+expansion), which corresponds to the interaction of a cell with
+another one:
+\begin{equation}
+  \mathsf{F}_{\mathbf{n}}(\mathbf{z}_A) = G\sum_{\mathbf{m}}^{p -|\mathbf{n}|}
+  \mathsf{M}_{\mathbf{m}}(\mathbf{z}_B)
+  \mathsf{D}_{\mathbf{n}+\mathbf{m}}(\mathbf{R}), \label{eq:fmm:M2L} 
+\end{equation}
+where $\mathsf{D}_{\mathbf{n}+\mathbf{m}}(\mathbf{R}) \equiv
+\nabla^{\mathbf{n}+\mathbf{m}} \varphi(\mathbf{R})$ is an order $|\mathbf{n}+\mathbf{m}|$
+derivative of the potential. This is the computationally expensive
+step of the FMM algorithm as the number of operations in a naive
+implementation using cartesian coordinates scales as
+$\mathcal{O}(p^6)$. More advanced techniques
+\citep[e.g.][]{Dehnen2014} can bring the cost down to
+$\mathcal{O}(p^3)$, albeit at a considerable algebraic cost. For
+collisionless dynamics, high accuracy is not required and low values
+of $p$ are sufficient, which maintains the computational cost of the
+M2L kernel at a reasonable level.  
+Finally, in the last step, the potential is propagated from the local
+expansion centre to the particles (L2P kernel) using
+\begin{equation}
+  \phi_{BA}(\mathbf{x}_a) = \sum_{\mathbf{n}}^{p}
+  \frac{1}{\mathbf{n}!} \mathbf{r}_a^{\mathbf{n}}
+  \mathsf{F}_{\mathbf{n}}(\mathbf{z}_A). \label{eq:fmm:L2P} 
+\end{equation}
+In summary, the potential generated by a cell $B$ on the particles in
+cell $A$ is obtained by the successive application of the P2M, M2L and
+L2P kernels. The P2M and L2P kernels are applied only once per
+particle, whilst one M2L calculation has to be performed for each pair
+of cells. The forces applied to the particles are obtained by the same
+procedure using an extra order in the Taylor expansion. For instance,
+for the acceleration along $x$, we have:
+\begin{equation}
+  a_x(\mathbf{x}_a) = \sum_{\mathbf{n}}^{p-1}
+  \frac{1}{\mathbf{n}!} \mathbf{r}_a^{\mathbf{n}}
+  \mathsf{F}_{\mathbf{n}+\left(1,0,0\right)}(\mathbf{z}_A). \label{eq:fmm:L2P_force} 
+\end{equation}
+
+In practice, the multipoles can be constructed recursively from the
+leaves of the tree to the root and the local expansions from the root
+to the leaves by shifting the $\mathsf{M}$ and $\mathsf{F}$ tensors
+and adding their contributions to their parent or daughter cell's
+tensors respectively. The shifting formulas (M2M and L2L kernels)
+read:
+
+\begin{align}
+  \mathsf{M}_{\mathbf{m}}(\mathbf{x} + \mathbf{y}) &=
+  \sum_{\mathbf{n}}^{\mathbf{m}}
+  \frac{\mathbf{y}^\mathbf{n}}{\mathbf{n}!}\mathsf{M}_{\mathbf{m} -
+    \mathbf{n}}(\mathbf{x}), \label{eq:fmm:M2M} \\
+  \mathsf{F}_{\mathbf{n}}(\mathbf{x} + \mathbf{y}) &=
+  \sum_{\mathbf{m}}^{p-|\mathbf{n}|}
+  \frac{\mathbf{y}^\mathbf{m}}{\mathbf{m}!}\mathsf{F}_{\mathbf{m} +
+    \mathbf{n}}(\mathbf{x}). \label{eq:fmm:L2L} 
+\end{align}
+
+All the kernels (Eqs.~\ref{eq:fmm:P2M}-\ref{eq:fmm:L2L}) are rather
+straightforward to evaluate as they are only made of additions and
+multiplications (provided $\mathsf{D}$ can be evaluated quickly, see
+Sec.~\ref{ssec:grav_derivatives}), which are extremely efficient
+instructions on modern architectures. However, the fully expanded sums
+can lead to rather large and typo-prone expressions. To avoid any
+mishaps, we use a \texttt{python} script to generate C code in which
+all the sums are unrolled and correct by construction. In \swift, we
+implemented the kernels up to order $p=5$, as it proved to be accurate
+enough for our purpose, but this could be extended to higher order
+easily. This implies storing $56$ numbers per cell for each
+$\textsf{M}$ and $\textsf{F}$ plus three numbers for the location of
+the centre of mass. For leaf-cells with large numbers of particles, as
+in \swift, this is a small memory overhead. One further small
+improvement consists in choosing $\mathbf{z}_A$ to be the centre of
+mass of cell $A$ rather than its geometrical centre. The first order
+multipoles ($\mathsf{M}_{100},\mathsf{M}_{010},\mathsf{M}_{001}$) then
+vanish by construction. This allows us to simplify some of the
+expressions and helps reduce, albeit by a small fraction, the memory
+footprint of the tree structure.
diff --git a/theory/Multipoles/gravity_derivatives.py b/theory/Multipoles/generate_multipoles/gravity_derivatives.py
similarity index 100%
rename from theory/Multipoles/gravity_derivatives.py
rename to theory/Multipoles/generate_multipoles/gravity_derivatives.py
diff --git a/theory/Multipoles/multipoles.py b/theory/Multipoles/generate_multipoles/multipoles.py
similarity index 100%
rename from theory/Multipoles/multipoles.py
rename to theory/Multipoles/generate_multipoles/multipoles.py
diff --git a/theory/Multipoles/vector_powers.py b/theory/Multipoles/generate_multipoles/vector_powers.py
similarity index 100%
rename from theory/Multipoles/vector_powers.py
rename to theory/Multipoles/generate_multipoles/vector_powers.py
diff --git a/theory/Multipoles/gravity_derivatives.tex b/theory/Multipoles/gravity_derivatives.tex
index e4c7b1565ab6c82de5623d5a643c3a8bd1fa513f..e4569ef960fae5e92343f1d99902a5c14fd6ee5c 100644
--- a/theory/Multipoles/gravity_derivatives.tex
+++ b/theory/Multipoles/gravity_derivatives.tex
@@ -1,52 +1,55 @@
-\subsection{Derivatives of the gravitational potential}
+\subsection{Notes on the derivatives of the gravitational potential}
+\label{ssec:grav_derivatives}
 
 The calculation of all the
-$D_\mathbf{n}(x,y,z) \equiv \nabla^{\mathbf{n}}\phi(x,y,z)$ terms up
+$\mathsf{D}_\mathbf{n}(x,y,z) \equiv \nabla^{\mathbf{n}}\varphi(x,y,z)$ terms up
 to the relevent order can be quite tedious and it is beneficial to
 automatize the whole setup. Ideally, one would like to have an
-expression for each of this term that is only made of multiplications
+expression for each of these terms that is only made of multiplications
 and additions of each of the coordinates and the inverse distance. We
-achieve this by writing $\phi$ as a composition of functions
-$\phi(u(x,y,z))$ and apply the \textit{Fa\`a di Bruno}
+achieve this by writing $\varphi$ as a composition of functions
+$\varphi(u(x,y,z))$ and apply the \textit{Fa\`a di Bruno}
 formula \citep[i.e. the ``chain rule'' for higher order derivatives,
-e.g.][]{Hardy2006} to construct our terms:
-
+ see e.g.][]{Hardy2006} to construct our terms:
 \begin{equation}
 \label{eq:faa_di_bruno}
-\frac{\partial^n}{\partial x_1 \cdots \partial x_n} \phi(u)
-= \sum_{A} \phi^{(|A|)}(u) \prod_{B \in
+\frac{\partial^n}{\partial x_1 \cdots \partial x_n} \varphi(u)
+= \sum_{A} \varphi^{(|A|)}(u) \prod_{B \in
 A} \frac{\partial^{|B|}}{\prod_{c\in B}\partial x_c} u(x,y,z),
 \end{equation}
 where $A$ is the set of all partitions of $\lbrace1,\cdots, n\rbrace$,
-$B$ is a block of a partition $A$ and $|\cdot|$ denotes the
-cardinality of a set. For generic functions $\phi$ and $u$ this
+$B$ is a block of a partition in the set $A$ and $|\cdot|$ denotes the
+cardinality of a set. For generic functions $\varphi$ and $u$ this
 formula yields an untracktable number of terms; an 8th-order
 derivative will have $4140$ (!)  terms in the sum\footnote{The number
-of terms in the sum is given by the Bell number of the same order}. \\
-We choose to write
+  of terms in the sum is given by the Bell number of the same
+  order.}. \\ For the un-softened gravitational potential, we choose to write
 \begin{align}
-   \phi(x,y,z) &= 1 / \sqrt{u(x,y,z)}, \\
+   \varphi(x,y,z) &= 1 / \sqrt{u(x,y,z)}, \\
    u(x,y,z) &= x^2 + y^2 + z^2.
 \end{align}
-This choice allows to have derivatives of any order of $\phi(u)$ that
-only depend on powers of $u$:
-
+This choice allows to have derivatives of any order of $\varphi(u)$ that
+can be easily expressed and only depend on powers of $u$:
 \begin{equation}
-f^{(n)}(u) = \frac{\Gamma(\frac{1}{2})}{\Gamma(\frac{1}{2} -
-n)}\frac{1}{u^{n+\frac{1}{2}}}.
+\varphi^{(n)}(u) = (-1)^n\cdot\frac{(2n-1)!!}{2^n}\cdot\frac{1}{u^{n+\frac{1}{2}}},
 \end{equation}
-More importantly, this choice of decomposition allows us to have
-derivatives of $u$ only up to second order in $x$, $y$ or $z$. The
-number of non-zero terms in eq. \ref{eq:faa_di_bruno} is hence
-drastically reduced. For instance, when computing
-$D_{(4,1,3)} \equiv \frac{\partial^8}{\partial x^4 \partial y \partial
-z^3} \phi$, $4100$ of the $4140$ terms will involve at least one
+where $!!$ denotes the semi-factorial. More importantly, this
+choice of decomposition allows us to have non-zero derivatives of $u$
+only up to second order in $x$, $y$ or $z$. The number of non-zero
+terms in eq. \ref{eq:faa_di_bruno} is hence drastically reduced. For
+instance, when computing $\mathsf{D}_{(4,1,3)}(\mathbf{r}) \equiv
+\frac{\partial^8}{\partial x^4 \partial y \partial z^3}
+\varphi(u(x,y,z))$, $4100$ of the $4140$ terms will involve at least one
 zero-valued derivative (e.g. $\partial^3/\partial x^3$ or
 $\partial^2/\partial x\partial y$) of $u$. Furthermore, among the 40
-remaining terms, many will involve the same derivatives and can be
-grouped together, leaving us with a sum of six products of $x$,$y$ and
-$z$. This is generally the case for most of the $D_\mathbf{n}$'s and
-figuring out which terms are identical in a given set of partitions of
-$\lbrace1,\cdots, n\rbrace$ is an interesting exercise in
-combinatorics left for the reader \citep[see also][]{Hardy2006}.
+remaining terms, many will involve the same combination of derivatives
+of $u$ and can be grouped together, leaving us with a sum of six
+products of $x$,$y$ and $z$. This is generally the case for most of
+the $\mathsf{D}_\mathbf{n}$'s and figuring out which terms are identical in a
+given set of partitions of $\lbrace1,\cdots, n\rbrace$ is an
+interesting exercise in combinatorics left for the reader \citep[see
+  also][]{Hardy2006}. We use a \texttt{python} script based on this
+technique to generate the actual C routines used within \swift. Some
+examples of these terms are given in Appendix
+\ref{sec:pot_derivatives}.
 
diff --git a/theory/Multipoles/mesh_summary.tex b/theory/Multipoles/mesh_summary.tex
new file mode 100644
index 0000000000000000000000000000000000000000..3069257c8845804d9a307cc54fffec5e36e4ae8c
--- /dev/null
+++ b/theory/Multipoles/mesh_summary.tex
@@ -0,0 +1,39 @@
+\subsection{Coupling the FMM to a mesh for periodic long-range forces}
+\label{ssec:mesh_summary}
+
+\begin{equation}
+  S(x) = \frac{e^x}{1 + e^x}
+\end{equation}
+
+\begin{align}
+  \varphi_s(r) &= \frac{1}{r}\left[2 - 2S\left(\frac{2r}{r_s}\right)\right] \nonumber\\
+  &= \frac{1}{r}\left[2 - \frac{2e^{\frac{2r}{r_s}}}{1+e^{\frac{2r}{r_s}}}\right] 
+\end{align}
+\begin{align}
+  |\mathbf{f}_s(r)| &= \frac{1}{r^2}\left[\frac{4r}{r_s}S'\left(\frac{2r}{r_s}\right) - 2S\left(\frac{2r}{r_s}\right) + 2\right] \nonumber \\
+  &= \frac{1}{r^2}\left[\frac{4r}{r_s}\frac{e^{\frac{2r}{r_s}}}{(1+e^{\frac{2r}{r_s}})^2} - \frac{2e^{\frac{2r}{r_s}}}{1+e^{\frac{2r}{r_s}}} + 2\right]
+\end{align}
+
+\begin{equation}
+  \tilde\varphi_l(k) = \frac{1}{k^2}\left[\frac{\upi}{2}kr_s\textrm{csch}\left(\frac{\upi}{2}kr_s\right) \right]
+\end{equation}
+
+\begin{figure}
+\includegraphics[width=\columnwidth]{potential_short.pdf}
+\caption{Top panel: the short-range potential $\varphi_s(r)$ used in \swift and in Gadget-2 compared to the Newtonian $1/r$ potential. Middle and bottom panels: the corresponding truncation factors $\varphi_s(r)\times r$ and $1-\varphi_s(r)\times r$ as a function of $r/r_s$.}
+\label{fig:fmm:potential_short}
+\end{figure}
+
+
+\begin{figure}
+\includegraphics[width=\columnwidth]{force_short.pdf}
+\caption{Top panel: the norm of the short-range force $|\mathbf{f}_s(r)|$ used in \swift and in Gadget-2 compared to the Newtonian $1/r^2$ force. Middle and bottom panels: the corresponding truncation factors $|\mathbf{f}_s(r)|\times r^2$ and $1-|\mathbf{f}_s(r)|\times r^2$ as a function of $r/r_s$.}
+\label{fig:fmm:force_short}
+\end{figure}
+
+
+\begin{figure}
+\includegraphics[width=\columnwidth]{potential_long.pdf}
+\caption{Top panel: the Fourier-space long-range potential $\tilde\varphi_l(k)$ used in \swift and in Gadget-2 compared to the Newtonian $1/k^2$ potential. Middle and bottom panels: the corresponding truncation factors $k^2\times\tilde\varphi_l(k)$ and $1-k^2\times\tilde\varphi_l(k)$ as a function of $k\times r_s$.}
+\label{fig:fmm:potential_long}
+\end{figure}
diff --git a/theory/Multipoles/plot_mesh.py b/theory/Multipoles/plot_mesh.py
new file mode 100644
index 0000000000000000000000000000000000000000..6706016f73b4b6251c6d517ec89eacbb7a469417
--- /dev/null
+++ b/theory/Multipoles/plot_mesh.py
@@ -0,0 +1,267 @@
+###############################################################################
+ # This file is part of SWIFT.
+ # Copyright (c) 2016  Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ # 
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ # 
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ # GNU General Public License for more details.
+ # 
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ # 
+ ##############################################################################
+import matplotlib
+matplotlib.use("Agg")
+from pylab import *
+from scipy import integrate
+from scipy import special
+from scipy.optimize import curve_fit
+from scipy.optimize import fsolve
+from matplotlib.font_manager import FontProperties
+import numpy
+import math
+
+params = {'axes.labelsize': 9,
+'axes.titlesize': 10,
+'font.size': 10,
+'legend.fontsize': 10,
+'xtick.labelsize': 8,
+'ytick.labelsize': 8,
+'text.usetex': True,
+'figure.figsize' : (3.15,3.15),
+'figure.subplot.left'    : 0.12,
+'figure.subplot.right'   : 0.99  ,
+'figure.subplot.bottom'  : 0.09  ,
+'figure.subplot.top'     : 0.99  ,
+'figure.subplot.wspace'  : 0.  ,
+'figure.subplot.hspace'  : 0.  ,
+'lines.markersize' : 6,
+'lines.linewidth' : 3.,
+'text.latex.unicode': True
+}
+rcParams.update(params)
+rc('font',**{'family':'sans-serif','sans-serif':['Times']})
+colors=['#4477AA', '#CC6677', '#DDCC77', '#117733']
+
+
+# Parameters
+r_s = 2.
+r_min = 1e-2
+r_max = 1.5e2
+
+# Radius
+r = logspace(log10(r_min), log10(r_max), 401)
+r_rs = r / r_s
+
+k = logspace(log10(r_min/r_s**2), log10(r_max/r_s**2), 401)
+k_rs = k * r_s
+
+# Newtonian solution
+phi_newton = 1. / r
+phit_newton = 1. / k**2
+force_newton = 1. / r**2
+
+def my_exp(x):
+    return 1. + x + (x**2 / 2.) + (x**3 / 6.) + (x**4 / 24.) + (x**5 / 120.)# + (x**6 / 720.)
+    #return exp(x)
+
+def csch(x): # hyperbolic cosecant
+    return 1. / sinh(x)
+
+def sigmoid(x):
+    return my_exp(x) / (my_exp(x) + 1.)
+
+def d_sigmoid(x):
+    return my_exp(x) / ((my_exp(x) + 1)**2)
+
+def swift_corr(x):
+    return 2 * sigmoid( 4 * x ) - 1
+
+#figure()
+#x = linspace(-4, 4, 100)
+#plot(x, special.erf(x), '-', color=colors[0])
+#plot(x, swift_corr(x), '-', color=colors[1])
+#plot(x, x, '-', color=colors[2])
+#ylim(-1.1, 1.1)
+#xlim(-4.1, 4.1)
+#savefig("temp.pdf")
+
+# Correction in real space
+corr_short_gadget2 = special.erf(r / (2.*r_s))
+corr_short_swift = swift_corr(r / (2.*r_s)) 
+eta_short_gadget2 = special.erfc(r / (2.*r_s)) + (r / (r_s * math.sqrt(math.pi))) * exp(-r**2 / (4.*r_s**2))
+eta_short_swift = 4. * (r / r_s) * d_sigmoid(2. * r / r_s) - 2. * sigmoid(2 * r / r_s) + 2.
+
+# Corection in Fourier space
+corr_long_gadget2 = exp(-k**2*r_s**2)
+corr_long_swift = math.pi * k * r_s * csch(0.5 * math.pi * r_s * k) / 2.
+
+# Shortrange term
+phi_short_gadget2 = (1.  / r ) * (1. - corr_short_gadget2)
+phi_short_swift = (1.  / r ) * (1. - corr_short_swift)
+force_short_gadget2 = (1. / r**2) * eta_short_gadget2
+force_short_swift = (1. / r**2) * eta_short_swift
+
+# Long-range term
+phi_long_gadget2 = (1.  / r ) * corr_short_gadget2
+phi_long_swift = (1.  / r ) * corr_short_swift
+phit_long_gadget2 = corr_long_gadget2 / k**2
+phit_long_swift = corr_long_swift / k**2
+
+
+
+
+figure()
+
+# Potential
+subplot(311, xscale="log", yscale="log")
+
+plot(r_rs, phi_newton, '--', lw=1.4, label="${\\rm Newtonian}$", color=colors[0])
+plot(r_rs, phi_short_gadget2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2])
+plot(r_rs, phi_short_swift, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3])
+plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5)
+
+xlim(1.1*r_min/ r_s, 0.9*r_max / r_s)
+ylim(1.1/r_max, 0.9/r_min)
+ylabel("$\\varphi_s(r)$", labelpad=-3)
+
+legend(loc="upper right", frameon=True, handletextpad=0.1, handlelength=3.2, fontsize=8)
+
+# Correction
+subplot(312, xscale="log", yscale="log")
+plot(r_rs, np.ones(np.size(r)), '--', lw=1.4, color=colors[0])
+plot(r_rs, 1. - corr_short_gadget2, '-', lw=1.4, color=colors[2])
+plot(r_rs, 1. - corr_short_swift, '-', lw=1.4, color=colors[3])
+plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5)
+plot([1., 1.], [-1e5, 1e5], 'k-', alpha=0.5, lw=0.5)
+
+yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"])
+xlim(1.1*r_min/r_s, 0.9*r_max/r_s)
+ylim(3e-3, 1.5)
+#ylabel("$\\chi_s(r)$", labelpad=-3)
+ylabel("$\\varphi_s(r) \\times r$", labelpad=-2)
+
+# 1 - Correction
+subplot(313, xscale="log", yscale="log")
+plot(r_rs, corr_short_gadget2, '-', lw=1.4, color=colors[2])
+plot(r_rs, corr_short_swift, '-', lw=1.4, color=colors[3])
+
+plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5)
+plot(r_rs, np.ones(np.size(r)), 'k:', alpha=0.5, lw=0.5)
+plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5)
+
+xlim(1.1*r_min/r_s, 0.9*r_max/r_s)
+ylim(3e-3, 1.5)
+#ylabel("$1 - \\chi_s(r)$", labelpad=-2)
+ylabel("$1 - \\varphi_s(r) \\times r$", labelpad=-2)
+yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"])
+xlabel("$r / r_s$", labelpad=-3)
+
+savefig("potential_short.pdf")
+
+##################################################################################################
+
+
+# Force
+figure()
+subplot(311, xscale="log", yscale="log")
+
+plot(r_rs, force_newton, '--', lw=1.4, label="${\\rm Newtonian}$", color=colors[0])
+plot(r_rs, force_short_gadget2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2])
+plot(r_rs, force_short_swift, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3])
+plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5)
+
+xlim(1.1*r_min/ r_s, 0.9*r_max / r_s)
+ylim(1.1/r_max**2, 0.9/r_min**2)
+ylabel("$|\\mathbf{f}_s(r)|$", labelpad=-3)
+yticks([1e-4, 1e-2, 1e0, 1e2], ["$10^{-4}$", "$10^{-2}$", "$10^{0}$", "$10^{2}$"])
+
+legend(loc="upper right", frameon=True, handletextpad=0.1, handlelength=3.2, fontsize=8)
+
+# Correction
+subplot(312, xscale="log", yscale="log")
+plot(r_rs, np.ones(np.size(r)), '--', lw=1.4, color=colors[0])
+plot(r_rs, eta_short_gadget2, '-', lw=1.4, color=colors[2])
+plot(r_rs, eta_short_swift, '-', lw=1.4, color=colors[3])
+plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5)
+plot([1., 1.], [-1e5, 1e5], 'k-', alpha=0.5, lw=0.5)
+
+yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"])
+xlim(1.1*r_min/r_s, 0.9*r_max/r_s)
+ylim(3e-3, 1.5)
+#ylabel("$\\eta_s(r)$", labelpad=-3)
+ylabel("$|\\mathbf{f}_s(r)|\\times r^2$", labelpad=-2)
+
+# 1 - Correction
+subplot(313, xscale="log", yscale="log")
+plot(r_rs, 1. - eta_short_gadget2, '-', lw=1.4, color=colors[2])
+plot(r_rs, 1. - eta_short_swift, '-', lw=1.4, color=colors[3])
+
+plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5)
+plot(r_rs, np.ones(np.size(r)), 'k:', alpha=0.5, lw=0.5)
+plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5)
+
+xlim(1.1*r_min/r_s, 0.9*r_max/r_s)
+ylim(3e-3, 1.5)
+#ylabel("$1 - \\eta_s(r)$", labelpad=-2)
+ylabel("$1 - |\\mathbf{f}_s(r)|\\times r^2$", labelpad=-3)
+yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"])
+xlabel("$r / r_s$", labelpad=-3)
+
+savefig("force_short.pdf")
+
+##################################################################################################
+
+figure()
+subplot(311, xscale="log", yscale="log")
+
+# Potential
+plot(k_rs, phit_newton, '--', lw=1.4, label="${\\rm Newtonian}$", color=colors[0])
+plot(k_rs, phit_long_gadget2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2])
+plot(k_rs, phit_long_swift, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3])
+plot(k_rs, -phit_long_swift, ':', lw=1.4, color=colors[3])
+plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5)
+
+legend(loc="lower left", frameon=True, handletextpad=0.1, handlelength=3.2, fontsize=8)
+
+xlim(1.1*r_min/ r_s, 0.9*r_max / r_s)
+ylim(1.1/r_max**2, 0.9/r_min**2)
+ylabel("$\\tilde{\\varphi_l}(k)$", labelpad=-3)
+yticks([1e-4, 1e-2, 1e0, 1e2], ["$10^{-4}$", "$10^{-2}$", "$10^{0}$", "$10^{2}$"])
+
+subplot(312, xscale="log", yscale="log")
+
+# Potential normalized
+plot(k_rs, phit_newton * k**2, '--', lw=1.4, label="${\\rm Newtonian}$", color=colors[0])
+plot(k_rs, phit_long_gadget2 * k**2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2])
+plot(k_rs, phit_long_swift * k**2, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3])
+plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5)
+plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5)
+
+xlim(1.1*r_min/ r_s, 0.9*r_max / r_s)
+ylim(3e-3, 1.5)
+ylabel("$k^2 \\times \\tilde{\\varphi_l}(k)$", labelpad=-3)
+yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"])
+
+subplot(313, xscale="log", yscale="log")
+
+plot(k_rs, 1. - phit_long_gadget2 * k**2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2])
+plot(k_rs, 1. - phit_long_swift * k**2, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3])
+plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5)
+plot(r_rs, np.ones(np.size(r)), 'k:', alpha=0.5, lw=0.5)
+plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5)
+
+xlim(1.1*r_min/ r_s, 0.9*r_max / r_s)
+ylim(3e-3, 1.5)
+ylabel("$1 - k^2 \\times \\tilde{\\varphi_l}(k)$", labelpad=-3)
+yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"])
+
+xlabel("$k \\times r_s$", labelpad=0)
+
+savefig("potential_long.pdf")
diff --git a/theory/Multipoles/potential.py b/theory/Multipoles/plot_potential.py
similarity index 92%
rename from theory/Multipoles/potential.py
rename to theory/Multipoles/plot_potential.py
index 559f590762a3cbef171c5dd584cbc517879a2cec..8761314572cdbda1304cdf882f920651b58be08e 100644
--- a/theory/Multipoles/potential.py
+++ b/theory/Multipoles/plot_potential.py
@@ -141,7 +141,7 @@ plot([epsilon, epsilon], [-10, 10], 'k-', alpha=0.5, lw=0.5)
 plot([epsilon/plummer_equivalent_factor, epsilon/plummer_equivalent_factor], [0, 10], 'k-', alpha=0.5, lw=0.5)
 
 ylim(0, 2.3)
-ylabel("$|\\phi(r)|$", labelpad=1)
+ylabel("$\\varphi(r)$", labelpad=1)
 #yticks([0., 0.5, 1., 1.5, 2., 2.5], ["$%.1f$"%(0.*epsilon), "$%.1f$"%(0.5*epsilon), "$%.1f$"%(1.*epsilon), "$%.1f$"%(1.5*epsilon), "$%.1f$"%(2.*epsilon)])
 
 xlim(0,r_max_plot)
@@ -163,19 +163,6 @@ xticks([0., 0.5, 1., 1.5, 2., 2.5], ["$%.1f$"%(0./epsilon), "", "$%.1f$"%(1./eps
 xlabel("$r/H$", labelpad=-7)
 
 ylim(0, 0.95)
-ylabel("$|\\overrightarrow{\\nabla}\\phi(r)|$", labelpad=0)
+ylabel("$|\\overrightarrow{\\nabla}\\varphi(r)|$", labelpad=0)
 
 savefig("potential.pdf")
-
-
-
-
-#Construct potential
-# phi = np.zeros(np.size(r))
-# for i in range(np.size(r)):
-#     if r[i] > 2*epsilon:
-#         phi[i] = 1./ r[i]
-#     elif r[i] > epsilon:
-#         phi[i] = -(1./epsilon) * ((32./3.)*u[i]**2 - (48./3.)*u[i]**3 + (38.4/4.)*u[i]**4 - (32./15.)*u[i]**5 + (2./30.)*u[i]**(-1) - (9/5.))
-#     else:
-#         phi[i] = -(1./epsilon) * ((32./6.)*u[i]**2 - (38.4/4.)*u[i]**4 + (32./5.)*u[i]**4 - (7./5.))
diff --git a/theory/Multipoles/potential_derivatives.tex b/theory/Multipoles/potential_derivatives.tex
index 56184ce98902d76ad53ce1d49e3d6d67dfc33ac4..5c7b1e6566d7d51b5d27ea3c24d785571e1ad692 100644
--- a/theory/Multipoles/potential_derivatives.tex
+++ b/theory/Multipoles/potential_derivatives.tex
@@ -1,4 +1,5 @@
-\subsection{Derivatives of the potential}
+\section{Derivatives of the potential}
+\label{sec:pot_derivatives}
 
 For completeness, we give here the full expression for the first few
 derivatives of the potential that are used in our FMM scheme. We use
@@ -6,7 +7,7 @@ the notation $\mathbf{r}=(r_x, r_y, r_z)$, $r = |\mathbf{r}|$ and
 $u=r/H$. Starting from the potential (Eq. \ref{eq:fmm:potential},
 reproduced here for clarity), 
 \begin{align}
-D_{000}(\mathbf{r}) = \phi (\mathbf{r},H) = 
+\mathsf{D}_{000}(\mathbf{r}) = \varphi (\mathbf{r},H) = 
 \left\lbrace\begin{array}{rcl}
 \frac{1}{H} \left(-3u^7 + 15u^6 - 28u^5 + 21u^4 - 7u^2 + 3\right) & \mbox{if} & u < 1,\\
 \frac{1}{r} & \mbox{if} & u \geq 1, 
@@ -14,10 +15,11 @@ D_{000}(\mathbf{r}) = \phi (\mathbf{r},H) =
 \right.\nonumber
 \end{align}
 we can construct the higher order terms by successively applying the
-"chain rule". We show examples of the first few relevant ones here.
+"chain rule". We show representative examples of the first few
+relevant ones here split by order.
 
 \begin{align}
-D_{100}(\mathbf{r}) = \frac{\partial}{\partial r_x} \phi (\mathbf{r},H) = 
+\mathsf{D}_{100}(\mathbf{r}) = \frac{\partial}{\partial r_x} \varphi (\mathbf{r},H) = 
 \left\lbrace\begin{array}{rcl}
 -\frac{r_x}{H^3} \left(21u^5 - 90u^4 + 140u^3 - 84u^2 + 14\right) & \mbox{if} & u < 1,\\
 -\frac{r_x}{r^3} & \mbox{if} & u \geq 1, 
@@ -25,8 +27,10 @@ D_{100}(\mathbf{r}) = \frac{\partial}{\partial r_x} \phi (\mathbf{r},H) =
 \right.\nonumber
 \end{align}
 
+\noindent\rule{6cm}{0.4pt}
+
 \begin{align}
-D_{200}(\mathbf{r}) = \frac{\partial^2}{\partial r_x^2} \phi (\mathbf{r},H) = 
+\mathsf{D}_{200}(\mathbf{r}) = \frac{\partial^2}{\partial r_x^2} \varphi (\mathbf{r},H) = 
 \left\lbrace\begin{array}{rcl}
 \frac{r_x^2}{H^5}\left(-105u^3+360u^2-420u+168\right) -
 \frac{1}{H^3} \left(21u^5 - 90u^4 + 140u^3 - 84u^2 + 14\right) & \mbox{if} & u < 1,\\
@@ -36,7 +40,7 @@ D_{200}(\mathbf{r}) = \frac{\partial^2}{\partial r_x^2} \phi (\mathbf{r},H) =
 \end{align}
 
 \begin{align}
-D_{110}(\mathbf{r}) = \frac{\partial^2}{\partial r_x\partial r_y} \phi (\mathbf{r},H) = 
+\mathsf{D}_{110}(\mathbf{r}) = \frac{\partial^2}{\partial r_x\partial r_y} \varphi (\mathbf{r},H) = 
 \left\lbrace\begin{array}{rcl}
 \frac{r_xr_y}{H^5}\left(-105u^3+360u^2-420u+168\right) & \mbox{if} & u < 1,\\
 3\frac{r_xr_y}{r^5} & \mbox{if} & u \geq 1, 
@@ -44,8 +48,10 @@ D_{110}(\mathbf{r}) = \frac{\partial^2}{\partial r_x\partial r_y} \phi (\mathbf{
 \right.\nonumber
 \end{align}
 
+\noindent\rule{6cm}{0.4pt}
+
 \begin{align}
-D_{300}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} \phi (\mathbf{r},H) = 
+\mathsf{D}_{300}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} \varphi (\mathbf{r},H) = 
 \left\lbrace\begin{array}{rcl}
 -\frac{r_x^3}{H^7} \left(315u - 720 + 420u^{-1}\right) +
 \frac{3r_x}{H^5}\left(-105u^3+360u^2-420u+168\right) & \mbox{if} & u < 1,\\
@@ -55,7 +61,7 @@ D_{300}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} \phi (\mathbf{r},H) =
 \end{align}
 
 \begin{align}
-D_{210}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} \phi (\mathbf{r},H) = 
+\mathsf{D}_{210}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^2\partial r_y} \varphi (\mathbf{r},H) = 
 \left\lbrace\begin{array}{rcl}
 -\frac{r_x^2r_y}{H^7} \left(315u - 720 + 420u^{-1}\right) +
 \frac{r_y}{H^5}\left(-105u^3+360u^2-420u+168\right) & \mbox{if} & u < 1,\\
@@ -66,10 +72,32 @@ D_{210}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} \phi (\mathbf{r},H) =
 
 
 \begin{align}
-D_{111}(\mathbf{r}) = \frac{\partial^3}{\partial r_x\partial r_y\partial r_z} \phi (\mathbf{r},H) = 
+\mathsf{D}_{111}(\mathbf{r}) = \frac{\partial^3}{\partial r_x\partial r_y\partial r_z} \varphi (\mathbf{r},H) = 
 \left\lbrace\begin{array}{rcl}
 -\frac{r_xr_yr_z}{H^7} \left(315u - 720 + 420u^{-1}\right) & \mbox{if} & u < 1,\\
 -15\frac{r_xr_yr_z}{r^7} & \mbox{if} & u \geq 1, 
 \end{array}
 \right.\nonumber
 \end{align}
+
+\noindent\rule{6cm}{0.4pt}
+
+\begin{align}
+  \mathsf{D}_{400}(\mathbf{r}) &=
+  \nonumber
+\end{align}
+
+\begin{align}
+  \mathsf{D}_{310}(\mathbf{r}) &=
+  \nonumber
+\end{align}
+
+\begin{align}
+  \mathsf{D}_{220}(\mathbf{r}) &=
+  \nonumber
+\end{align}
+
+\begin{align}
+  \mathsf{D}_{211}(\mathbf{r}) &=
+  \nonumber
+\end{align}
diff --git a/theory/Multipoles/potential_softening.tex b/theory/Multipoles/potential_softening.tex
index 1186a9cec377fd8daa94e14d024115f95ecfdc99..aa9ee12340a3492a19dcf9048548952ef7e141e1 100644
--- a/theory/Multipoles/potential_softening.tex
+++ b/theory/Multipoles/potential_softening.tex
@@ -1,4 +1,5 @@
 \subsection{Gravitational softening}
+\label{ssec:potential_softening}
 
 To avoid artificial two-body relaxation, the Dirac
 $\delta$-distribution of particles is convolved with a softening
@@ -6,9 +7,13 @@ kernel of a given fixed, but time-variable, scale-length
 $\epsilon$. Instead of the commonly used spline kernel of
 \cite{Monaghan1985} (e.g. in \textsc{Gadget}), we use a C2 kernel
 \citep{Wendland1995} which leads to an expression for the force that
-is cheaper to compute and has a very similar overall shape. We set
-$\tilde\delta(\mathbf{x}) = \rho(|\mathbf{x}|) = W(|\mathbf{x}|,
-3\epsilon_{\rm Plummer})$, with $W(r, H)$ given by
+is cheaper to compute and has a very similar overall shape. The C2
+kernel has the advantage of being branch-free leading to an expression
+which is faster to evaluate using vector units available on modern
+architectures; it also does not require any divisions to evaluate the
+softened forces. We set $\tilde\delta(\mathbf{x}) =
+\rho(|\mathbf{x}|) = W(|\mathbf{x}|, 3\epsilon_{\rm Plummer})$, with
+$W(r, H)$ given by
 
 \begin{align}
 W(r,H) &= \frac{21}{2\pi H^3} \times \nonumber \\
@@ -18,9 +23,9 @@ W(r,H) &= \frac{21}{2\pi H^3} \times \nonumber \\
 \end{array}
 \right.
 \end{align}
-and $u = r/H$. The potential $\phi(r,H)$ corresponding to this density distribution reads
+and $u = r/H$. The potential $\varphi(r,H)$ corresponding to this density distribution reads
 \begin{align}
-\phi = 
+\varphi = 
 \left\lbrace\begin{array}{rcl}
 \frac{1}{H} (-3u^7 + 15u^6 - 28u^5 + 21u^4 - 7u^2 + 3) & \mbox{if} & u < 1,\\
 \frac{1}{r} & \mbox{if} & u \geq 1.
@@ -41,12 +46,13 @@ details see Sec. 2 of~\cite{Price2007}).
 
 \begin{figure}
 \includegraphics[width=\columnwidth]{potential.pdf}
-\caption{The density (top), potential (middle) and forces (bottom) of
-generated py a point mass in our softened gravitational scheme (for
-completeness, we chose $\epsilon=2$). A
-Plummer-equivalent sphere is shown for comparison. The spline kernel
-of \citet{Monaghan1985}, used in \textsc{Gadget}, is shown for
-comparison but note that it has not been re-scaled to match the
-Plummer-sphere potential at $r=0$.}
+\caption{The density (top), potential (middle) and forces (bottom)
+  generated by a point mass in our softened gravitational scheme.
+  A Plummer-equivalent sphere is shown for comparison. The spline
+  kernel of \citet{Monaghan1985}, used in \textsc{Gadget}, is shown
+  for comparison but note that it has not been re-scaled to match the
+  Plummer-sphere potential at $r=0$.  %(for completeness, we chose
+  %$\epsilon=2$).
+  }
 \label{fig:fmm:softening}
 \end{figure}
diff --git a/theory/Multipoles/run.sh b/theory/Multipoles/run.sh
index f25d407cd4ffe679a272f352798817f7c0c4e55a..fc376188ad2e69d2879ce963ddc7069c736fc8b7 100755
--- a/theory/Multipoles/run.sh
+++ b/theory/Multipoles/run.sh
@@ -1,5 +1,15 @@
 #!/bin/bash
-python potential.py
+if [ ! -e potential.pdf ]
+then
+    echo "Generating 1st figure..."
+    python plot_potential.py
+fi
+if [ ! -e potential_short.pdf ]
+then
+    echo "Generating 2nd figures..."
+    python plot_mesh.py
+fi
+echo "Generating PDF..."
 pdflatex -jobname=fmm fmm_standalone.tex
 bibtex fmm.aux
 pdflatex -jobname=fmm fmm_standalone.tex
diff --git a/theory/Multipoles/vector_notation.tex b/theory/Multipoles/vector_notation.tex
new file mode 100644
index 0000000000000000000000000000000000000000..4c17a1b92ad7576ac3aaa02b8d02993acfcd795a
--- /dev/null
+++ b/theory/Multipoles/vector_notation.tex
@@ -0,0 +1,34 @@
+\section{Multi-index notation}
+\label{sec:multi_index_notation}
+
+We define a multi-index $\mathbf{n}$ as a triplet of
+non-negative integers:
+\begin{equation}
+  \mathbf{n} \equiv \left(n_x, n_y, n_z\right), \qquad n_i \in \mathbb{N},
+\end{equation}
+with a norm $n$ given by
+\begin{equation}
+  n = |\mathbf{n}| \equiv n_x + n_y + n_z. 
+\end{equation}
+We also define the exponentiation of a vector
+$\mathbf{r}=(r_x,r_y,r_z)$ by a multi-index $\mathbf{n}$ as
+\begin{equation}
+  \mathbf{r}^\mathbf{n} \equiv r_x^{n_x} \cdot r_y^{n_y} \cdot r_z^{n_z},
+\end{equation}
+which for a scalar $\alpha$ reduces to
+\begin{equation}
+  \alpha^\mathbf{n} = \alpha^{n}.
+\end{equation}
+Finally, the factorial of a multi-index is defined to be
+\begin{equation}
+  \mathbf{n}! \equiv n_x! \cdot n_y! \cdot n_z!,
+\end{equation}
+which leads to a simple expression for the binomial coefficients of
+two multi-indices entering Taylor expansions:
+\begin{equation}
+  \binom{\mathbf{n}}{\mathbf{k}} = \binom{n_x}{k_x}\binom{n_y}{k_y}\binom{n_z}{k_z}.
+\end{equation}
+When appearing as the index in a sum, a multi-index represents all
+values that the triplet can take up to a given norm. For instance,
+$\sum_{\mathbf{n}}^{p}$ indicates that the sum runs over all possible
+multi-indices whose norm is $\leq p$.