diff --git a/README b/README
index b0a21c610f7ab4c380e523b51c6f74b81f98a2f2..cd2a397a18e872e7914b24fd58cc588ec1d6c8c0 100644
--- a/README
+++ b/README
@@ -26,6 +26,8 @@ Valid options are:
   -f    {int} Overwrite the CPU frequency (Hz) to be used for time measurements
   -g          Run with an external gravitational potential
   -G          Run with self-gravity
+  -n    {int} Execute a fixed number of time steps. When unset use the time_end
+              parameter to stop. 
   -s          Run with SPH
   -t    {int} The number of threads to use on each MPI rank. Defaults to 1 if not specified.
   -v     [12] Increase the level of verbosity 1: MPI-rank 0 writes 
diff --git a/configure.ac b/configure.ac
index 497107121753f212dd8b07f5a8e8eed7acdf82b5..23fbf5c6e4c4211379986a4a9c951892d044713c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -72,9 +72,6 @@ if test "$enable_ipo" = "yes"; then
    fi
 fi
 
-# Add libtool support.
-LT_INIT
-
 # Check for MPI. Need to do this before characterising the compiler (C99 mode),
 # as this changes the compiler.
 # We should consider using AX_PROG_CC_MPI to replace AC_PROG_CC when compiling
@@ -151,6 +148,9 @@ AM_CONDITIONAL([HAVEMPI],[test $enable_mpi = "yes"])
 # Indicate that MPIRUN can be modified by an environement variable
 AC_ARG_VAR(MPIRUN, Path to the mpirun command if non-standard)
 
+# Add libtool support (now that CC is defined).
+LT_INIT
+
 # Need C99 and inline support.
 AC_PROG_CC_C99
 AC_C_INLINE
@@ -179,7 +179,7 @@ if test "$enable_opt" = "yes" ; then
    ac_test_CFLAGS="yes"
    CFLAGS="$old_CFLAGS $CFLAGS"
 
-   # Check SSE & AVX support (some overlap with AX_CC_MAXOPT). 
+   # Check SSE & AVX support (some overlap with AX_CC_MAXOPT).
    # Don't use the SIMD_FLAGS result with Intel compilers. The -x<code>
    # value from AX_CC_MAXOPT should be sufficient.
    AX_EXT
@@ -287,12 +287,75 @@ AC_SUBST([METIS_LIBS])
 AC_SUBST([METIS_INCS])
 AM_CONDITIONAL([HAVEMETIS],[test -n "$METIS_LIBS"])
 
-# # Check for zlib.
-# AC_CHECK_LIB([z],[gzopen],[
-#     AC_DEFINE([HAVE_LIBZ],[1],[Set to 1 if zlib is installed.])
-#     LDFLAGS="$LDFLAGS -lz"
-#     ],[])
+#  Check for tcmalloc a fast malloc that is part of the gperftools.
+have_tcmalloc="no"
+AC_ARG_WITH([tcmalloc],
+   [AS_HELP_STRING([--with-tcmalloc],
+      [use tcmalloc library or specify the directory with lib @<:@yes/no@:>@]
+   )],
+   [with_tcmalloc="$withval"],
+   [with_tcmalloc="no"]
+)
+if test "x$with_tcmalloc" != "xno"; then
+   if test "x$with_tcmalloc" != "xyes" && test "x$with_tcmalloc" != "x"; then
+      tclibs="-L$with_tcmalloc -ltcmalloc"
+   else
+      tclibs="-ltcmalloc"
+   fi
+   AC_CHECK_LIB([tcmalloc],[tc_cfree],[have_tcmalloc="yes"],[have_tcmalloc="no"],
+                $tclibs)
+
+   #  Could just have the minimal version.
+   if test "$have_tcmalloc" = "no"; then
+      if test "x$with_tcmalloc" != "xyes" && test "x$with_tcmalloc" != "x"; then
+         tclibs="-L$with_tcmalloc -ltcmalloc_minimal"
+      else
+         tclibs="-ltcmalloc_minimal"
+      fi
+      AC_CHECK_LIB([tcmalloc],[tc_cfree],[have_tcmalloc="yes"],[have_tcmalloc="no"],
+                   $tclibs)
+   fi
 
+   if test "$have_tcmalloc" = "yes"; then
+      TCMALLOC_LIBS="$tclibs"
+
+      # These are recommended for GCC.
+      if test "$ax_cv_c_compiler_vendor" = "gnu"; then
+         CFLAGS="$CFLAGS -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free"
+      fi
+   else
+      TCMALLOC_LIBS=""
+   fi
+fi
+AC_SUBST([TCMALLOC_LIBS])
+AM_CONDITIONAL([HAVETCMALLOC],[test -n "$TCMALLOC_LIBS"])
+
+#  Check for -lprofiler usually part of the gpreftools along with tcmalloc.
+have_profiler="no"
+AC_ARG_WITH([profiler],
+   [AS_HELP_STRING([--with-profiler],
+      [use cpu profiler library or specify the directory with lib @<:@yes/no@:>@]
+   )],
+   [with_profiler="$withval"],
+   [with_profiler="yes"]
+)
+if test "x$with_profiler" != "xno"; then
+   if test "x$with_profiler" != "xyes" && test "x$with_profiler" != "x"; then
+      proflibs="-L$with_profiler -lprofiler"
+   else
+      proflibs="-lprofiler"
+   fi
+   AC_CHECK_LIB([profiler],[ProfilerFlush],[have_profiler="yes"],[have_profiler="no"],
+                $proflibs)
+
+   if test "$have_profiler" = "yes"; then
+      PROFILER_LIBS="$proflibs"
+   else
+      PROFILER_LIBS=""
+   fi
+fi
+AC_SUBST([PROFILER_LIBS])
+AM_CONDITIONAL([HAVEPROFILER],[test -n "$PROFILER_LIBS"])
 
 # Check for HDF5. This is required.
 AX_LIB_HDF5
@@ -410,6 +473,8 @@ AC_MSG_RESULT([
     - parallel     : $have_parallel_hdf5
    Metis enabled   : $have_metis
    libNUMA enabled : $have_numa
+   Using tcmalloc  : $have_tcmalloc
+   CPU profiler    : $have_profiler
 ])
 
 # Generate output.
diff --git a/doc/Makefile.am b/doc/Makefile.am
index 1736a4ad81a674e68932a45e2f8bea8ef82305f3..dc88489ca4ac4ef2fe856d910420fee6a7e87c8a 100644
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -1,3 +1,19 @@
+# This file is part of SWIFT.
+# Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+#                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 doxyfile.stamp: 
 if HAVE_DOXYGEN
diff --git a/examples/CosmoVolume/cosmoVolume.yml b/examples/CosmoVolume/cosmoVolume.yml
index d33de8170284b3251a00a43c1a8acaaf10e35cc8..acfd449cbf57408718a7c2a50d8a16bf5ccd5b4e 100644
--- a/examples/CosmoVolume/cosmoVolume.yml
+++ b/examples/CosmoVolume/cosmoVolume.yml
@@ -37,10 +37,8 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  0.705    # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of volume over one time-step
 
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/ExternalPointMass/externalPointMass.yml b/examples/ExternalPointMass/externalPointMass.yml
index e4c3abc28362801ee53320f9e547bf6d01670ec7..52330163caa13609fd7674a5cdf2921743fbe227 100644
--- a/examples/ExternalPointMass/externalPointMass.yml
+++ b/examples/ExternalPointMass/externalPointMass.yml
@@ -32,10 +32,8 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  10.      # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of volume over one time-step
 
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/Makefile.am b/examples/Makefile.am
index fcb6d3ac4158e140ed5bf227f9215d488f608dd7..9ad4d06f8764f8f57f68d0e501f4a726941c40e6 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -1,4 +1,3 @@
-
 # This file is part of SWIFT.
 # Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
 #                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
@@ -22,13 +21,17 @@ MYFLAGS = -DTIMER
 # Add the source directory and debug to CFLAGS
 AM_CFLAGS = -I../src $(HDF5_CPPFLAGS)
 
-AM_LDFLAGS =
+AM_LDFLAGS = $(HDF5_LDFLAGS)
+
+# Extra libraries.
+EXTRA_LIBS = $(HDF5_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS)
 
-MPI_THREAD_LIBS = @MPI_THREAD_LIBS@
+# MPI libraries.
 MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS)
 MPI_FLAGS = -DWITH_MPI $(METIS_INCS)
 
-# Set-up the library
+
+# Programs.
 bin_PROGRAMS = swift swift_fixdt
 
 # Build MPI versions as well?
@@ -46,20 +49,20 @@ ENGINE_POLICY_SETAFFINITY=
 # Sources for swift
 swift_SOURCES = main.c
 swift_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)"
-swift_LDADD =  ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS)
+swift_LDADD =  ../src/.libs/libswiftsim.a $(EXTRA_LIBS)
 
 swift_fixdt_SOURCES = main.c
 swift_fixdt_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -DENGINE_POLICY="engine_policy_fixdt | engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)"
-swift_fixdt_LDADD =  ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS)
+swift_fixdt_LDADD =  ../src/.libs/libswiftsim.a $(EXTRA_LIBS)
 
 # Sources for swift_mpi, do we need an affinity policy for MPI?
 swift_mpi_SOURCES = main.c
 swift_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)"
-swift_mpi_LDADD =  ../src/.libs/libswiftsim_mpi.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(MPI_LIBS)
+swift_mpi_LDADD =  ../src/.libs/libswiftsim_mpi.a $(MPI_LIBS) $(EXTRA_LIBS)
 
 swift_fixdt_mpi_SOURCES = main.c
 swift_fixdt_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="engine_policy_fixdt | engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)"
-swift_fixdt_mpi_LDADD =  ../src/.libs/libswiftsim_mpi.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(MPI_LIBS)
+swift_fixdt_mpi_LDADD =  ../src/.libs/libswiftsim_mpi.a $(MPI_LIBS) $(EXTRA_LIBS)
 
 # Scripts to generate ICs
 EXTRA_DIST = UniformBox/makeIC.py UniformBox/run.sh UniformBox/uniformBox.yml \
diff --git a/examples/SedovBlast/sedov.yml b/examples/SedovBlast/sedov.yml
index 96b16c5b85b2e534e5ac8b7680f61ed9912da3d6..9fbabb4969b6accdb7323d8270b735951ac0693a 100644
--- a/examples/SedovBlast/sedov.yml
+++ b/examples/SedovBlast/sedov.yml
@@ -32,10 +32,8 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  1.       # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of volume over one time-step
 
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/SodShock/sodShock.yml b/examples/SodShock/sodShock.yml
index 62ed0c68cad8e063f49ab0655aee0e10f57e6314..003b7286777230ceb3b84ee01c6a1f335aeb9476 100644
--- a/examples/SodShock/sodShock.yml
+++ b/examples/SodShock/sodShock.yml
@@ -32,10 +32,8 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  0.01     # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of volume over one time-step
 
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/UniformBox/uniformBox.yml b/examples/UniformBox/uniformBox.yml
index 64f9a135fff6ecb2e4500ade95d1500a0712f2c8..50afbee02ddc8a801ca77ed4900b7d2e0e3b50b5 100644
--- a/examples/UniformBox/uniformBox.yml
+++ b/examples/UniformBox/uniformBox.yml
@@ -32,10 +32,8 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  0.1      # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of volume over one time-step
   
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/main.c b/examples/main.c
index 6422ccceef0a5f5c5f3b5acce9ba60b957986aac..016f948bb1b031ba756a344d020d4b26c8a42bed 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -74,6 +74,8 @@ void print_help_message() {
   printf("  %2s %8s %s\n", "-g", "",
          "Run with an external gravitational potential");
   printf("  %2s %8s %s\n", "-G", "", "Run with self-gravity");
+  printf("  %2s %8s %s\n", "-n", "{int}",
+         "Execute a fixed number of time steps");
   printf("  %2s %8s %s\n", "-s", "", "Run with SPH");
   printf("  %2s %8s %s\n", "-t", "{int}",
          "The number of threads to use on each MPI rank. Defaults to 1 if not "
@@ -93,7 +95,6 @@ void print_help_message() {
  * @brief Main routine that loads a few particles and generates some output.
  *
  */
-
 int main(int argc, char *argv[]) {
 
   struct clocks_time tic, toc;
@@ -139,6 +140,7 @@ int main(int argc, char *argv[]) {
   int with_aff = 0;
   int dry_run = 0;
   int dump_tasks = 0;
+  int nsteps = -2;
   int with_cosmology = 0;
   int with_external_gravity = 0;
   int with_self_gravity = 0;
@@ -151,7 +153,7 @@ int main(int argc, char *argv[]) {
 
   /* Parse the parameters */
   int c;
-  while ((c = getopt(argc, argv, "acdef:gGhst:v:y:")) != -1) switch (c) {
+  while ((c = getopt(argc, argv, "acdef:gGhn:st:v:y:")) != -1) switch (c) {
       case 'a':
         with_aff = 1;
         break;
@@ -180,6 +182,13 @@ int main(int argc, char *argv[]) {
       case 'h':
         if (myrank == 0) print_help_message();
         return 0;
+      case 'n':
+        if (sscanf(optarg, "%d", &nsteps) != 1) {
+          if (myrank == 0) printf("Error parsing fixed number of steps.\n");
+          if (myrank == 0) print_help_message();
+          return 1;
+        }
+        break;
       case 's':
         with_hydro = 1;
         break;
@@ -323,18 +332,21 @@ int main(int argc, char *argv[]) {
   size_t Ngas = 0, Ngpart = 0;
   double dim[3] = {0., 0., 0.};
   int periodic = 0;
+  int flag_entropy_ICs = 0;
   if (myrank == 0) clocks_gettime(&tic);
 #if defined(WITH_MPI)
 #if defined(HAVE_PARALLEL_HDF5)
   read_ic_parallel(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic,
-                   myrank, nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL, dry_run);
+                   &flag_entropy_ICs, myrank, nr_nodes, MPI_COMM_WORLD,
+                   MPI_INFO_NULL, dry_run);
 #else
   read_ic_serial(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic,
-                 myrank, nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL, dry_run);
+                 &flag_entropy_ICs, myrank, nr_nodes, MPI_COMM_WORLD,
+                 MPI_INFO_NULL, dry_run);
 #endif
 #else
   read_ic_single(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic,
-                 dry_run);
+                 &flag_entropy_ICs, dry_run);
 #endif
   if (myrank == 0) {
     clocks_gettime(&toc);
@@ -355,7 +367,7 @@ int main(int argc, char *argv[]) {
     free(parts);
     parts = NULL;
     for (size_t k = 0; k < Ngpart; ++k)
-      if (gparts[k].id > 0) error("Linking problem");
+      if (gparts[k].id_or_neg_offset < 0) error("Linking problem");
     Ngas = 0;
   }
 
@@ -467,7 +479,7 @@ int main(int argc, char *argv[]) {
 #endif
 
   /* Initialise the particles */
-  engine_init_particles(&e);
+  engine_init_particles(&e, flag_entropy_ICs);
 
   /* Legend */
   if (myrank == 0)
@@ -475,17 +487,15 @@ int main(int argc, char *argv[]) {
            "Updates", "g-Updates", "Wall-clock time", clocks_getunit());
 
   /* Main simulation loop */
-  for (int j = 0; !engine_is_done(&e); j++) {
+  for (int j = 0; !engine_is_done(&e) && e.step != nsteps; j++) {
 
 /* Repartition the space amongst the nodes? */
 #ifdef WITH_MPI
     if (j % 100 == 2) e.forcerepart = reparttype;
 #endif
 
+    /* Reset timers */
     timers_reset(timers_mask_all);
-#ifdef COUNTER
-    for (k = 0; k < runner_counter_count; k++) runner_counter[k] = 0;
-#endif
 
     /* Take a step. */
     engine_step(&e);
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 9b2ebd028a3506e231aa36ed2078bb44faf4eb49..fb1e6ff6931b2fdee00792eed7f178872ee6d950 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -39,10 +39,10 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
+  max_ghost_iterations:  30       # (Optional) Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  0.1      # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of kernel volume over one time-step
+  max_volume_change:     2.       # (Optional) Maximal allowed change of kernel volume over one time-step
 
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/plot_tasks.py b/examples/plot_tasks.py
index e070bdb565d6d8305b7974137292910dd720afe2..f2d0aa95d1f35f30476e1989349a07be8d9e5b0a 100755
--- a/examples/plot_tasks.py
+++ b/examples/plot_tasks.py
@@ -55,8 +55,8 @@ PLOT_PARAMS = {"axes.labelsize": 10,
 pl.rcParams.update(PLOT_PARAMS)
 
 #  Tasks and subtypes. Indexed as in tasks.h.
-TASKTYPES = ["none", "sort", "self", "pair", "sub", "init", "ghost", "drift",
-             "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm",
+TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair", "init", "ghost",
+             "drift", "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm",
              "grav_up", "grav_down", "grav_external", "part_sort", "gpart_sort",
              "split_cell", "rewait", "count"]
 
@@ -64,7 +64,8 @@ TASKCOLOURS = {"none": "black",
                "sort": "lightblue",
                "self": "greenyellow",
                "pair": "navy",
-               "sub": "hotpink",
+               "sub_self": "greenyellow",
+               "sub_pair": "navy",
                "init": "indigo",
                "ghost": "cyan",
                "drift": "maroon",
diff --git a/examples/plot_tasks_MPI.py b/examples/plot_tasks_MPI.py
index 44705a0164034f8a22f1f3aa860a64d3e7656d2e..9a92faf9417c9a302831eb8cb2f4471eb672d59c 100755
--- a/examples/plot_tasks_MPI.py
+++ b/examples/plot_tasks_MPI.py
@@ -61,8 +61,8 @@ PLOT_PARAMS = {"axes.labelsize": 10,
 pl.rcParams.update(PLOT_PARAMS)
 
 #  Tasks and subtypes. Indexed as in tasks.h.
-TASKTYPES = ["none", "sort", "self", "pair", "sub", "init", "ghost", "drift",
-             "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm",
+TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair", "init", "ghost",
+             "drift", "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm",
              "grav_up", "grav_down", "grav_external", "part_sort", "gpart_sort",
              "split_cell", "rewait", "count"]
 
@@ -70,7 +70,8 @@ TASKCOLOURS = {"none": "black",
                "sort": "lightblue",
                "self": "greenyellow",
                "pair": "navy",
-               "sub": "hotpink",
+               "sub_self": "greenyellow",
+               "sub_pair": "navy",
                "init": "indigo",
                "ghost": "cyan",
                "drift": "maroon",
diff --git a/src/Makefile.am b/src/Makefile.am
index 9ed6f9f19d4f659119271338e6673d7a14b8f9ee..ba9e2373a72ddd21a747337edca9a5c22a2cd26d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,4 +1,3 @@
-
 # This file is part of SWIFT.
 # Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
 #                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
@@ -17,14 +16,21 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 # Add the debug flag to the whole thing
-AM_CFLAGS = -DTIMER -DCOUNTER $(HDF5_CPPFLAGS)
+AM_CFLAGS = -DTIMER $(HDF5_CPPFLAGS)
 
 # Assign a "safe" version number
-AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0 # -fsanitize=address
+AM_LDFLAGS = $(HDF5_LDFLAGS) -version-info 0:0:0
 
 # The git command, if available.
 GIT_CMD = @GIT_CMD@
 
+# Additional dependencies for shared libraries.
+EXTRA_LIBS = $(HDF5_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS)
+
+# MPI libraries.
+MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS)
+MPI_FLAGS = -DWITH_MPI $(METIS_INCS)
+
 # Build the libswiftsim library
 lib_LTLIBRARIES = libswiftsim.la
 # Build a MPI-enabled version too?
@@ -67,10 +73,13 @@ nobase_noinst_HEADERS = approx_math.h atomic.h cycle.h error.h inline.h kernel_h
 
 # Sources and flags for regular library
 libswiftsim_la_SOURCES = $(AM_SOURCES)
+libswiftsim_la_CFLAGS = $(AM_CFLAGS)
+libswiftsim_la_LDFLAGS = $(AM_LDFLAGS) $(EXTRA_LIBS)
 
 # Sources and flags for MPI library
 libswiftsim_mpi_la_SOURCES = $(AM_SOURCES)
-libswiftsim_mpi_la_CFLAGS = $(AM_CFLAGS) -DWITH_MPI $(METIS_INCS)
+libswiftsim_mpi_la_CFLAGS = $(AM_CFLAGS) $(MPI_FLAGS)
+libswiftsim_mpi_la_LDFLAGS = $(AM_LDFLAGS) $(MPI_LIBS) $(EXTRA_LIBS)
 libswiftsim_mpi_la_SHORTNAME = mpi
 
 
diff --git a/src/approx_math.h b/src/approx_math.h
index ef93ea63c383c74caa3eaff65446962872389a35..cbca602b3fafcc5044b0939b2207b8f9d50a7446 100644
--- a/src/approx_math.h
+++ b/src/approx_math.h
@@ -19,6 +19,8 @@
 #ifndef SWIFT_APPROX_MATH_H
 #define SWIFT_APPROX_MATH_H
 
+#include "inline.h"
+
 /**
  * @brief Approximate version of expf(x) using a 4th order Taylor expansion
  *
diff --git a/src/cell.c b/src/cell.c
index 97a67583a7872999d7b7e7c5a0eb9787c83b4bd3..cd83ddab570a3c848e0502629534743e8f383a4f 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -128,6 +128,7 @@ int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) {
     }
 
   /* Return the total number of unpacked cells. */
+  c->pcell_size = count;
   return count;
 }
 
@@ -213,6 +214,39 @@ int cell_pack(struct cell *c, struct pcell *pc) {
       pc->progeny[k] = -1;
 
   /* Return the number of packed cells used. */
+  c->pcell_size = count;
+  return count;
+}
+
+int cell_pack_ti_ends(struct cell *c, int *ti_ends) {
+
+  /* Pack this cell's data. */
+  ti_ends[0] = c->ti_end_min;
+
+  /* Fill in the progeny, depth-first recursion. */
+  int count = 1;
+  for (int k = 0; k < 8; k++)
+    if (c->progeny[k] != NULL) {
+      count += cell_pack_ti_ends(c->progeny[k], &ti_ends[count]);
+    }
+
+  /* Return the number of packed values. */
+  return count;
+}
+
+int cell_unpack_ti_ends(struct cell *c, int *ti_ends) {
+
+  /* Unpack this cell's data. */
+  c->ti_end_min = ti_ends[0];
+
+  /* Fill in the progeny, depth-first recursion. */
+  int count = 1;
+  for (int k = 0; k < 8; k++)
+    if (c->progeny[k] != NULL) {
+      count += cell_unpack_ti_ends(c->progeny[k], &ti_ends[count]);
+    }
+
+  /* Return the number of packed values. */
   return count;
 }
 
@@ -269,7 +303,7 @@ int cell_locktree(struct cell *c) {
     /* Undo the holds up to finger. */
     for (struct cell *finger2 = c->parent; finger2 != finger;
          finger2 = finger2->parent)
-      __sync_fetch_and_sub(&finger2->hold, 1);
+      atomic_dec(&finger2->hold);
 
     /* Unlock this cell. */
     if (lock_unlock(&c->lock) != 0) error("Failed to unlock cell.");
@@ -309,7 +343,7 @@ int cell_glocktree(struct cell *c) {
     if (lock_trylock(&finger->glock) != 0) break;
 
     /* Increment the hold. */
-    __sync_fetch_and_add(&finger->ghold, 1);
+    atomic_inc(&finger->ghold);
 
     /* Unlock the cell. */
     if (lock_unlock(&finger->glock) != 0) error("Failed to unlock cell.");
@@ -327,7 +361,7 @@ int cell_glocktree(struct cell *c) {
     /* Undo the holds up to finger. */
     for (struct cell *finger2 = c->parent; finger2 != finger;
          finger2 = finger2->parent)
-      __sync_fetch_and_sub(&finger2->ghold, 1);
+      atomic_dec(&finger2->ghold);
 
     /* Unlock this cell. */
     if (lock_unlock(&c->glock) != 0) error("Failed to unlock cell.");
@@ -353,7 +387,7 @@ void cell_unlocktree(struct cell *c) {
 
   /* Climb up the tree and unhold the parents. */
   for (struct cell *finger = c->parent; finger != NULL; finger = finger->parent)
-    __sync_fetch_and_sub(&finger->hold, 1);
+    atomic_dec(&finger->hold);
 
   TIMER_TOC(timer_locktree);
 }
@@ -367,7 +401,7 @@ void cell_gunlocktree(struct cell *c) {
 
   /* Climb up the tree and unhold the parents. */
   for (struct cell *finger = c->parent; finger != NULL; finger = finger->parent)
-    __sync_fetch_and_sub(&finger->ghold, 1);
+    atomic_dec(&finger->ghold);
 
   TIMER_TOC(timer_locktree);
 }
@@ -376,9 +410,11 @@ void cell_gunlocktree(struct cell *c) {
  * @brief Sort the parts into eight bins along the given pivots.
  *
  * @param c The #cell array to be sorted.
+ * @param parts_offset Offset of the cell parts array relative to the
+ *        space's parts array, i.e. c->parts - s->parts.
  */
 
-void cell_split(struct cell *c) {
+void cell_split(struct cell *c, ptrdiff_t parts_offset) {
 
   int i, j;
   const int count = c->count, gcount = c->gcount;
@@ -406,12 +442,14 @@ void cell_split(struct cell *c) {
       xparts[j] = xtemp;
     }
   }
-  /* for ( k = 0 ; k <= j ; k++ )
-      if ( parts[k].x[0] > pivot[0] )
-          error( "cell_split: sorting failed." );
-  for ( k = i ; k < count ; k++ )
-      if ( parts[k].x[0] < pivot[0] )
-          error( "cell_split: sorting failed." ); */
+
+#ifdef SWIFT_DEBUG_CHECKS
+  for (int k = 0; k <= j; k++)
+    if (parts[k].x[0] > pivot[0]) error("cell_split: sorting failed.");
+  for (int k = i; k < count; k++)
+    if (parts[k].x[0] < pivot[0]) error("cell_split: sorting failed.");
+#endif
+
   left[1] = i;
   right[1] = count - 1;
   left[0] = 0;
@@ -433,14 +471,17 @@ void cell_split(struct cell *c) {
         xparts[j] = xtemp;
       }
     }
-    /* for ( int kk = left[k] ; kk <= j ; kk++ )
-        if ( parts[kk].x[1] > pivot[1] ) {
-            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-            error( "sorting failed (left)." );
-            }
-    for ( int kk = i ; kk <= right[k] ; kk++ )
-        if ( parts[kk].x[1] < pivot[1] )
-            error( "sorting failed (right)." ); */
+
+#ifdef SWIFT_DEBUG_CHECKS
+    for (int kk = left[k]; kk <= j; kk++)
+      if (parts[kk].x[1] > pivot[1]) {
+        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
+        error("sorting failed (left).");
+      }
+    for (int kk = i; kk <= right[k]; kk++)
+      if (parts[kk].x[1] < pivot[1]) error("sorting failed (right).");
+#endif
+
     left[2 * k + 1] = i;
     right[2 * k + 1] = right[k];
     left[2 * k] = left[k];
@@ -463,16 +504,20 @@ void cell_split(struct cell *c) {
         xparts[j] = xtemp;
       }
     }
-    /* for ( int kk = left[k] ; kk <= j ; kk++ )
-        if ( parts[kk].x[2] > pivot[2] ) {
-            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-            error( "sorting failed (left)." );
-            }
-    for ( int kk = i ; kk <= right[k] ; kk++ )
-        if ( parts[kk].x[2] < pivot[2] ) {
-            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-            error( "sorting failed (right)." );
-            } */
+
+#ifdef SWIFT_DEBUG_CHECKS
+    for (int kk = left[k]; kk <= j; kk++)
+      if (parts[kk].x[2] > pivot[2]) {
+        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
+        error("sorting failed (left).");
+      }
+    for (int kk = i; kk <= right[k]; kk++)
+      if (parts[kk].x[2] < pivot[2]) {
+        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
+        error("sorting failed (right).");
+      }
+#endif
+
     left[2 * k + 1] = i;
     right[2 * k + 1] = right[k];
     left[2 * k] = left[k];
@@ -487,35 +532,36 @@ void cell_split(struct cell *c) {
   }
 
   /* Re-link the gparts. */
-  for (int k = 0; k < count; k++)
-    if (parts[k].gpart != NULL) parts[k].gpart->part = &parts[k];
+  part_relink_gparts(parts, count, parts_offset);
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that _all_ the parts have been assigned to a cell. */
-  /* for ( k = 1 ; k < 8 ; k++ )
-      if ( &c->progeny[k-1]->parts[ c->progeny[k-1]->count ] !=
-  c->progeny[k]->parts )
-          error( "Particle sorting failed (internal consistency)." );
-  if ( c->progeny[0]->parts != c->parts )
-      error( "Particle sorting failed (left edge)." );
-  if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ count ] )
-      error( "Particle sorting failed (right edge)." ); */
+  for (int k = 1; k < 8; k++)
+    if (&c->progeny[k - 1]->parts[c->progeny[k - 1]->count] !=
+        c->progeny[k]->parts)
+      error("Particle sorting failed (internal consistency).");
+  if (c->progeny[0]->parts != c->parts)
+    error("Particle sorting failed (left edge).");
+  if (&c->progeny[7]->parts[c->progeny[7]->count] != &c->parts[count])
+    error("Particle sorting failed (right edge).");
 
   /* Verify a few sub-cells. */
-  /* for (int k = 0 ; k < c->progeny[0]->count ; k++ )
-      if ( c->progeny[0]->parts[k].x[0] > pivot[0] ||
-           c->progeny[0]->parts[k].x[1] > pivot[1] ||
-           c->progeny[0]->parts[k].x[2] > pivot[2] )
-          error( "Sorting failed (progeny=0)." );
-  for (int k = 0 ; k < c->progeny[1]->count ; k++ )
-      if ( c->progeny[1]->parts[k].x[0] > pivot[0] ||
-           c->progeny[1]->parts[k].x[1] > pivot[1] ||
-           c->progeny[1]->parts[k].x[2] <= pivot[2] )
-          error( "Sorting failed (progeny=1)." );
-  for (int k = 0 ; k < c->progeny[2]->count ; k++ )
-      if ( c->progeny[2]->parts[k].x[0] > pivot[0] ||
-           c->progeny[2]->parts[k].x[1] <= pivot[1] ||
-           c->progeny[2]->parts[k].x[2] > pivot[2] )
-          error( "Sorting failed (progeny=2)." ); */
+  for (int k = 0; k < c->progeny[0]->count; k++)
+    if (c->progeny[0]->parts[k].x[0] > pivot[0] ||
+        c->progeny[0]->parts[k].x[1] > pivot[1] ||
+        c->progeny[0]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=0).");
+  for (int k = 0; k < c->progeny[1]->count; k++)
+    if (c->progeny[1]->parts[k].x[0] > pivot[0] ||
+        c->progeny[1]->parts[k].x[1] > pivot[1] ||
+        c->progeny[1]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=1).");
+  for (int k = 0; k < c->progeny[2]->count; k++)
+    if (c->progeny[2]->parts[k].x[0] > pivot[0] ||
+        c->progeny[2]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[2]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=2).");
+#endif
 
   /* Now do the same song and dance for the gparts. */
 
@@ -581,8 +627,7 @@ void cell_split(struct cell *c) {
   }
 
   /* Re-link the parts. */
-  for (int k = 0; k < gcount; k++)
-    if (gparts[k].id > 0) gparts[k].part->gpart = &gparts[k];
+  part_relink_parts(gparts, gcount, parts - parts_offset);
 }
 
 /**
diff --git a/src/cell.h b/src/cell.h
index 65665c9dcfcd652cc688f652008044d618b10790..b5cc49d05c1cee6e0a884cdb708abbaff4281829 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -24,11 +24,15 @@
 #define SWIFT_CELL_H
 
 /* Includes. */
+#include <stddef.h>
+
+/* Local includes. */
 #include "lock.h"
 #include "multipole.h"
 #include "part.h"
+#include "task.h"
 
-/* Forward declaration of space, needed for cell_unpack. */
+/* Avoid cyclic inclusions */
 struct space;
 
 /* Max tag size set to 2^29 to take into account some MPI implementations
@@ -121,7 +125,10 @@ struct cell {
   struct task *ghost, *init, *drift, *kick;
 
   /* Task receiving data. */
-  struct task *recv_xv, *recv_rho;
+  struct task *recv_xv, *recv_rho, *recv_ti;
+
+  /* Task send data. */
+  struct link *send_xv, *send_rho, *send_ti;
 
   /* Tasks for gravity tree. */
   struct task *grav_up, *grav_down;
@@ -174,13 +181,15 @@ struct cell {
   ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
 
 /* Function prototypes. */
-void cell_split(struct cell *c);
+void cell_split(struct cell *c, ptrdiff_t parts_offset);
 int cell_locktree(struct cell *c);
 void cell_unlocktree(struct cell *c);
 int cell_glocktree(struct cell *c);
 void cell_gunlocktree(struct cell *c);
 int cell_pack(struct cell *c, struct pcell *pc);
 int cell_unpack(struct pcell *pc, struct cell *c, struct space *s);
+int cell_pack_ti_ends(struct cell *c, int *ti_ends);
+int cell_unpack_ti_ends(struct cell *c, int *ti_ends);
 int cell_getsize(struct cell *c);
 int cell_link_parts(struct cell *c, struct part *parts);
 int cell_link_gparts(struct cell *c, struct gpart *gparts);
diff --git a/src/common_io.c b/src/common_io.c
index 4399c83f7ebd2bb5355b7df9cc1fdecd301e36aa..971fe6b01c682c489d3444ec1d47d6a902250bb8 100644
--- a/src/common_io.c
+++ b/src/common_io.c
@@ -43,6 +43,8 @@
 #include "const.h"
 #include "error.h"
 #include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 #include "version.h"
 
 const char* particle_type_names[NUM_PARTICLE_TYPES] = {
@@ -514,12 +516,10 @@ void prepare_dm_gparts(struct gpart* const gparts, size_t Ndm) {
 
   /* Let's give all these gparts a negative id */
   for (size_t i = 0; i < Ndm; ++i) {
-
     /* 0 or negative ids are not allowed */
-    if (gparts[i].id <= 0)
-      error("0 or negative ID for DM particle %zd: ID=%lld", i, gparts[i].id);
-
-    gparts[i].id = -gparts[i].id;
+    if (gparts[i].id_or_neg_offset <= 0)
+      error("0 or negative ID for DM particle %zd: ID=%lld", i,
+            gparts[i].id_or_neg_offset);
   }
 }
 
@@ -553,7 +553,7 @@ void duplicate_hydro_gparts(struct part* const parts,
     gparts[i + Ndm].mass = parts[i].mass;
 
     /* Link the particles */
-    gparts[i + Ndm].part = &parts[i];
+    gparts[i + Ndm].id_or_neg_offset = -i;
     parts[i].gpart = &gparts[i + Ndm];
   }
 }
@@ -578,9 +578,8 @@ void collect_dm_gparts(const struct gpart* const gparts, size_t Ntot,
      * gparts[i].part); */
 
     /* And collect the DM ones */
-    if (gparts[i].id < 0LL) {
-      memcpy(&dmparts[count], &gparts[i], sizeof(struct gpart));
-      dmparts[count].id = -dmparts[count].id;
+    if (gparts[i].id_or_neg_offset > 0) {
+      dmparts[count] = gparts[i];
       count++;
     }
   }
diff --git a/src/common_io.h b/src/common_io.h
index ed1c96801c904d9952c7b3ca995b847afdc3fb43..fa6811a26671a2ed85c12ada7bb380094f55795d 100644
--- a/src/common_io.h
+++ b/src/common_io.h
@@ -23,13 +23,11 @@
 /* Config parameters. */
 #include "../config.h"
 
-/* Includes. */
-#include "kernel_hydro.h"
+#if defined(HAVE_HDF5)
+
 #include "part.h"
 #include "units.h"
 
-#if defined(HAVE_HDF5)
-
 /**
  * @brief The different types of data used in the GADGET IC files.
  *
@@ -107,6 +105,6 @@ void writeXMFline(FILE* xmfFile, char* fileName, char* partTypeGroupName,
 void writeCodeDescription(hid_t h_file);
 void writeUnitSystem(hid_t h_file, struct UnitSystem* us);
 
-#endif
+#endif /* defined HDF5 */
 
 #endif /* SWIFT_COMMON_IO_H */
diff --git a/src/const.h b/src/const.h
index 6432ef6a9e8107d94c93a32967e291d7e1e4d24d..673aa3bf0093c1c426938a384fd017c1c1d18310 100644
--- a/src/const.h
+++ b/src/const.h
@@ -60,4 +60,7 @@
 /* Gravity properties */
 #define EXTERNAL_POTENTIAL_POINTMASS
 
+/* Are we debugging ? */
+//#define SWIFT_DEBUG_CHECKS
+
 #endif /* SWIFT_CONST_H */
diff --git a/src/debug.c b/src/debug.c
index a3647915d96a9456cb6d8177d144dab56fd0b97e..e7b60c52f92fdb64fdefc8ac5c5e353f1ab57e6e 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -20,11 +20,19 @@
  *
  ******************************************************************************/
 
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <stdio.h>
 
+/* This object's header. */
+#include "debug.h"
+
+/* Local includes. */
 #include "config.h"
 #include "const.h"
-#include "debug.h"
+#include "inline.h"
 #include "part.h"
 
 /* Import the right hydro definition */
@@ -51,9 +59,8 @@
  *
  * (Should be used for debugging only as it runs in O(N).)
  */
-
-void printParticle(struct part *parts, struct xpart *xparts, long long int id,
-                   size_t N) {
+void printParticle(const struct part *parts, struct xpart *xparts,
+                   long long int id, size_t N) {
 
   int found = 0;
 
@@ -69,19 +76,33 @@ void printParticle(struct part *parts, struct xpart *xparts, long long int id,
   if (!found) printf("## Particles[???] id=%lld not found\n", id);
 }
 
-void printgParticle(struct gpart *gparts, long long int id, size_t N) {
+/**
+ * @brief Looks for the g-particle with the given id and prints its information
+ * to
+ * the standard output.
+ *
+ * @param gparts The array of g-particles.
+ * @param parts The array of particles.
+ * @param id The id too look for.
+ * @param N The size of the array of g-particles.
+ *
+ * (Should be used for debugging only as it runs in O(N).)
+ */
+void printgParticle(const struct gpart *gparts, const struct part *parts,
+                    long long int id, size_t N) {
 
   int found = 0;
 
   /* Look for the particle. */
   for (size_t i = 0; i < N; i++)
-    if (gparts[i].id == -id) {
-      printf("## gParticle[%zd] (DM) :\n id=%lld ", i, -gparts[i].id);
+    if (gparts[i].id_or_neg_offset == id) {
+      printf("## gParticle[%zd] (DM) :\n id=%lld", i, id);
       gravity_debug_particle(&gparts[i]);
       found = 1;
       break;
-    } else if (gparts[i].id > 0 && gparts[i].part->id == id) {
-      printf("## gParticle[%zd] (hydro) :\n id=%lld ", i, gparts[i].id);
+    } else if (gparts[i].id_or_neg_offset < 0 &&
+               parts[-gparts[i].id_or_neg_offset].id == id) {
+      printf("## gParticle[%zd] (hydro) :\n id=%lld", i, id);
       gravity_debug_particle(&gparts[i]);
       found = 1;
       break;
@@ -95,16 +116,26 @@ void printgParticle(struct gpart *gparts, long long int id, size_t N) {
  *
  * @param p The particle to print
  * @param xp The extended data ot the particle to print
- *
  */
+void printParticle_single(const struct part *p, const struct xpart *xp) {
 
-void printParticle_single(struct part *p, struct xpart *xp) {
-
-  printf("## Particle: id=%lld ", p->id);
+  printf("## Particle: id=%lld", p->id);
   hydro_debug_particle(p, xp);
   printf("\n");
 }
 
+/**
+ * @brief Prints the details of a given particle to stdout
+ *
+ * @param gp The g-particle to print
+ */
+void printgParticle_single(struct gpart *gp) {
+
+  printf("## g-Particle: id=%lld ", gp->id_or_neg_offset);
+  gravity_debug_particle(gp);
+  printf("\n");
+}
+
 #ifdef HAVE_METIS
 
 /**
diff --git a/src/debug.h b/src/debug.h
index fba5cb68680472715025f9fee39a5bf06abacf81..367241201977d9b79a8c2913dbae5d08f1148529 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -23,10 +23,12 @@
 #include "cell.h"
 #include "part.h"
 
-void printParticle(struct part *parts, struct xpart *xparts, long long int id,
-                   size_t N);
-void printgParticle(struct gpart *parts, long long int id, size_t N);
-void printParticle_single(struct part *p, struct xpart *xp);
+void printParticle(const struct part *parts, struct xpart *xparts,
+                   long long int id, size_t N);
+void printgParticle(const struct gpart *gparts, const struct part *parts,
+                    long long int id, size_t N);
+void printParticle_single(const struct part *p, const struct xpart *xp);
+void printgParticle_single(struct gpart *gp);
 
 #ifdef HAVE_METIS
 #include "metis.h"
diff --git a/src/engine.c b/src/engine.c
index 56d5e1cdd97285e35bb76d7c96c873885d3febc2..8267b52adb2013263c54103892956e5eb8aee01a 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -59,15 +59,26 @@
 #include "parallel_io.h"
 #include "part.h"
 #include "partition.h"
+#include "proxy.h"
+#include "runner.h"
 #include "serial_io.h"
 #include "single_io.h"
 #include "timers.h"
-
-const char *engine_policy_names[13] = {
-    "none",                 "rand",   "steal",        "keep",
-    "block",                "fix_dt", "cpu_tight",    "mpi",
-    "numa_affinity",        "hydro",  "self_gravity", "external_gravity",
-    "cosmology_integration"};
+#include "units.h"
+
+const char *engine_policy_names[13] = {"none",
+                                       "rand",
+                                       "steal",
+                                       "keep",
+                                       "block",
+                                       "fix_dt",
+                                       "cpu_tight",
+                                       "mpi",
+                                       "numa_affinity",
+                                       "hydro",
+                                       "self_gravity",
+                                       "external_gravity",
+                                       "cosmology_integration"};
 
 /** The rank of the engine as a global variable (for messages). */
 int engine_rank;
@@ -87,7 +98,7 @@ static cpu_set_t entry_affinity;
  * @return The new #link pointer.
  */
 
-void engine_addlink(struct engine *e, struct link **l, struct task *t) {
+struct link *engine_addlink(struct engine *e, struct link *l, struct task *t) {
 
   /* Get the next free link. */
   const int ind = atomic_inc(&e->nr_links);
@@ -102,8 +113,8 @@ void engine_addlink(struct engine *e, struct link **l, struct task *t) {
 }
 
 /**
- * @brief Generate the hierarchical tasks for a hierarchy of cells - i.e. all
- * the O(Npart) tasks.
+ * @brief Generate the gravity hierarchical tasks for a hierarchy of cells -
+ * i.e. all the O(Npart) tasks.
  *
  * Tasks are only created here. The dependencies will be added later on.
  *
@@ -111,9 +122,8 @@ void engine_addlink(struct engine *e, struct link **l, struct task *t) {
  * @param c The #cell.
  * @param super The super #cell.
  */
-
-void engine_make_hierarchical_tasks(struct engine *e, struct cell *c,
-                                    struct cell *super) {
+void engine_make_gravity_hierarchical_tasks(struct engine *e, struct cell *c,
+                                            struct cell *super) {
 
   struct scheduler *s = &e->sched;
   const int is_with_external_gravity =
@@ -122,45 +132,102 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c,
   const int is_fixdt = (e->policy & engine_policy_fixdt) == engine_policy_fixdt;
 
   /* Am I the super-cell? */
-  if (super == NULL && (c->count > 0 || c->gcount > 0)) {
+  /* TODO(pedro): Add a condition for gravity tasks as well. */
+  if (super == NULL &&
+      (c->density != NULL || (!c->split && (c->count > 0 || c->gcount > 0)))) {
 
-    /* Remember me. */
+    /* This is the super cell, i.e. the first with density tasks attached. */
     super = c;
 
     /* Local tasks only... */
     if (c->nodeID == e->nodeID) {
 
       /* Add the init task. */
-      c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0, c,
-                                  NULL, 0);
+      if (c->init == NULL)
+        c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0,
+                                    c, NULL, 0);
 
       /* Add the drift task. */
-      c->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0, 0,
-                                   c, NULL, 0);
+      if (c->drift == NULL)
+        c->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0,
+                                     0, c, NULL, 0);
 
       /* Add the kick task that matches the policy. */
       if (is_fixdt) {
-        c->kick = scheduler_addtask(s, task_type_kick_fixdt, task_subtype_none,
-                                    0, 0, c, NULL, 0);
+        if (c->kick == NULL)
+          c->kick = scheduler_addtask(s, task_type_kick_fixdt,
+                                      task_subtype_none, 0, 0, c, NULL, 0);
       } else {
-        c->kick = scheduler_addtask(s, task_type_kick, task_subtype_none, 0, 0,
-                                    c, NULL, 0);
+        if (c->kick == NULL)
+          c->kick = scheduler_addtask(s, task_type_kick, task_subtype_none, 0,
+                                      0, c, NULL, 0);
       }
 
-      if (c->count > 0) {
+      if (is_with_external_gravity)
+        c->grav_external = scheduler_addtask(
+            s, task_type_grav_external, task_subtype_none, 0, 0, c, NULL, 0);
+    }
+  }
+
+  /* Set the super-cell. */
+  c->super = super;
 
-        /* Generate the ghost task. */
-        c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
-                                     0, c, NULL, 0);
-      }
+  /* Recurse. */
+  if (c->split)
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL)
+        engine_make_gravity_hierarchical_tasks(e, c->progeny[k], super);
+}
+
+/**
+ * @brief Generate the hydro hierarchical tasks for a hierarchy of cells -
+ * i.e. all the O(Npart) tasks.
+ *
+ * Tasks are only created here. The dependencies will be added later on.
+ *
+ * @param e The #engine.
+ * @param c The #cell.
+ * @param super The super #cell.
+ */
+void engine_make_hydro_hierarchical_tasks(struct engine *e, struct cell *c,
+                                          struct cell *super) {
+
+  struct scheduler *s = &e->sched;
+  const int is_fixdt = (e->policy & engine_policy_fixdt) == engine_policy_fixdt;
 
-      if (c->gcount > 0) {
+  /* Is this the super-cell? */
+  if (super == NULL && (c->density != NULL || (c->count > 0 && !c->split))) {
+
+    /* This is the super cell, i.e. the first with density tasks attached. */
+    super = c;
 
-        /* Add the external gravity tasks */
-        if (is_with_external_gravity)
-          c->grav_external = scheduler_addtask(
-              s, task_type_grav_external, task_subtype_none, 0, 0, c, NULL, 0);
+    /* Local tasks only... */
+    if (c->nodeID == e->nodeID) {
+
+      /* Add the init task. */
+      if (c->init == NULL)
+        c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0,
+                                    c, NULL, 0);
+
+      /* Add the drift task. */
+      if (c->drift == NULL)
+        c->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0,
+                                     0, c, NULL, 0);
+
+      /* Add the kick task that matches the policy. */
+      if (is_fixdt) {
+        if (c->kick == NULL)
+          c->kick = scheduler_addtask(s, task_type_kick_fixdt,
+                                      task_subtype_none, 0, 0, c, NULL, 0);
+      } else {
+        if (c->kick == NULL)
+          c->kick = scheduler_addtask(s, task_type_kick, task_subtype_none, 0,
+                                      0, c, NULL, 0);
       }
+
+      /* Generate the ghost task. */
+      c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, 0,
+                                   c, NULL, 0);
     }
   }
 
@@ -171,7 +238,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c,
   if (c->split)
     for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL)
-        engine_make_hierarchical_tasks(e, c->progeny[k], super);
+        engine_make_hydro_hierarchical_tasks(e, c->progeny[k], super);
 }
 
 /**
@@ -236,9 +303,12 @@ void engine_redistribute(struct engine *e) {
     }
     const int cid = cell_getid(cdim, parts[k].x[0] * ih[0],
                                parts[k].x[1] * ih[1], parts[k].x[2] * ih[2]);
-    /* if (cid < 0 || cid >= s->nr_cells)
-       error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].",
-             cid, k, parts[k].x[0], parts[k].x[1], parts[k].x[2]); */
+#ifdef SWIFT_DEBUG_CHECKS
+    if (cid < 0 || cid >= s->nr_cells)
+      error("Bad cell id %i for part %zi at [%.3e,%.3e,%.3e].", cid, k,
+            parts[k].x[0], parts[k].x[1], parts[k].x[2]);
+#endif
+
     dest[k] = cells[cid].nodeID;
 
     /* The counts array is indexed as count[from * nr_nodes + to]. */
@@ -265,11 +335,12 @@ void engine_redistribute(struct engine *e) {
           count_this_dest = 0;
         }
 
-        /* Debug */
-        /* if(s->parts[k].gpart->id < 0) */
-        /*   error("Trying to link a partnerless gpart !"); */
+#ifdef SWIFT_DEBUG_CHECKS
+        if (s->parts[k].gpart->id_or_neg_offset >= 0)
+          error("Trying to link a partnerless gpart !");
+#endif
 
-        s->parts[k].gpart->id = count_this_dest;
+        s->parts[k].gpart->id_or_neg_offset = -count_this_dest;
         count_this_dest++;
       }
     }
@@ -287,9 +358,12 @@ void engine_redistribute(struct engine *e) {
     }
     const int cid = cell_getid(cdim, gparts[k].x[0] * ih[0],
                                gparts[k].x[1] * ih[1], gparts[k].x[2] * ih[2]);
-    /* if (cid < 0 || cid >= s->nr_cells)
-       error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].",
-             cid, k, g_parts[k].x[0], g_parts[k].x[1], g_parts[k].x[2]); */
+#ifdef SWIFT_DEBUG_CHECKS
+    if (cid < 0 || cid >= s->nr_cells)
+      error("Bad cell id %i for part %zi at [%.3e,%.3e,%.3e].", cid, k,
+            gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]);
+#endif
+
     g_dest[k] = cells[cid].nodeID;
 
     /* The counts array is indexed as count[from * nr_nodes + to]. */
@@ -450,13 +524,14 @@ void engine_redistribute(struct engine *e) {
     for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) {
 
       /* Does this gpart have a partner ? */
-      if (gparts_new[k].id >= 0) {
+      if (gparts_new[k].id_or_neg_offset <= 0) {
 
-        const size_t partner_index = offset_parts + gparts_new[k].id;
+        const ptrdiff_t partner_index =
+            offset_parts - gparts_new[k].id_or_neg_offset;
 
         /* Re-link */
-        gparts_new[k].part = &parts_new[partner_index];
-        gparts_new[k].part->gpart = &gparts_new[k];
+        gparts_new[k].id_or_neg_offset = -partner_index;
+        parts_new[partner_index].gpart = &gparts_new[k];
       }
     }
 
@@ -464,37 +539,39 @@ void engine_redistribute(struct engine *e) {
     offset_gparts += count_gparts;
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that all parts are in the right place. */
-  /* for ( int k = 0 ; k < nr_parts ; k++ ) {
-      int cid = cell_getid( cdim , parts_new[k].x[0]*ih[0],
-    parts_new[k].x[1]*ih[1], parts_new[k].x[2]*ih[2] );
-      if ( cells[ cid ].nodeID != nodeID )
-          error( "Received particle (%i) that does not belong here
-    (nodeID=%i).", k , cells[ cid ].nodeID );
-    } */
+  for (int k = 0; k < nr_parts; k++) {
+    int cid = cell_getid(cdim, parts_new[k].x[0] * ih[0],
+                         parts_new[k].x[1] * ih[1], parts_new[k].x[2] * ih[2]);
+    if (cells[cid].nodeID != nodeID)
+      error("Received particle (%i) that does not belong here (nodeID=%i).", k,
+            cells[cid].nodeID);
+  }
 
   /* Verify that the links are correct */
   /* MATTHIEU: To be commented out once we are happy */
   for (size_t k = 0; k < nr_gparts; ++k) {
 
-    if (gparts_new[k].id > 0) {
+    if (gparts_new[k].id_or_neg_offset <= 0) {
 
-      if (gparts_new[k].part->gpart != &gparts_new[k])
-        error("Linking problem !");
+      struct part *part = &parts_new[-gparts_new[k].id_or_neg_offset];
 
-      if (gparts_new[k].x[0] != gparts_new[k].part->x[0] ||
-          gparts_new[k].x[1] != gparts_new[k].part->x[1] ||
-          gparts_new[k].x[2] != gparts_new[k].part->x[2])
+      if (part->gpart != &gparts_new[k]) error("Linking problem !");
+
+      if (gparts_new[k].x[0] != part->x[0] ||
+          gparts_new[k].x[1] != part->x[1] || gparts_new[k].x[2] != part->x[2])
         error("Linked particles are not at the same position !");
     }
   }
   for (size_t k = 0; k < nr_parts; ++k) {
 
-    if (parts_new[k].gpart != NULL) {
-
-      if (parts_new[k].gpart->part != &parts_new[k]) error("Linking problem !");
+    if (parts_new[k].gpart != NULL &&
+        parts_new[k].gpart->id_or_neg_offset != -k) {
+      error("Linking problem !");
     }
   }
+#endif
 
   /* Set the new part data, free the old. */
   free(parts);
@@ -605,48 +682,63 @@ void engine_addtasks_grav(struct engine *e, struct cell *c, struct task *up,
  *
  * @param e The #engine.
  * @param ci The sending #cell.
- * @param cj The receiving #cell
+ * @param cj Dummy cell containing the nodeID of the receiving node.
+ * @param t_xv The send_xv #task, if it has already been created.
+ * @param t_rho The send_rho #task, if it has already been created.
+ * @param t_ti The send_ti #task, if required and has already been created.
  */
-
-void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj) {
+void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj,
+                          struct task *t_xv, struct task *t_rho,
+                          struct task *t_ti) {
 
 #ifdef WITH_MPI
   struct link *l = NULL;
   struct scheduler *s = &e->sched;
+  const int nodeID = cj->nodeID;
 
   /* Check if any of the density tasks are for the target node. */
   for (l = ci->density; l != NULL; l = l->next)
-    if (l->t->ci->nodeID == cj->nodeID ||
-        (l->t->cj != NULL && l->t->cj->nodeID == cj->nodeID))
+    if (l->t->ci->nodeID == nodeID ||
+        (l->t->cj != NULL && l->t->cj->nodeID == nodeID))
       break;
 
   /* If so, attach send tasks. */
   if (l != NULL) {
 
-    /* Create the tasks. */
-    struct task *t_xv = scheduler_addtask(s, task_type_send, task_subtype_none,
-                                          2 * ci->tag, 0, ci, cj, 0);
-    struct task *t_rho = scheduler_addtask(s, task_type_send, task_subtype_none,
-                                           2 * ci->tag + 1, 0, ci, cj, 0);
+    /* Create the tasks and their dependencies? */
+    if (t_xv == NULL) {
+      t_xv = scheduler_addtask(s, task_type_send, task_subtype_none,
+                               3 * ci->tag, 0, ci, cj, 0);
+      t_rho = scheduler_addtask(s, task_type_send, task_subtype_none,
+                                3 * ci->tag + 1, 0, ci, cj, 0);
+      if (!(e->policy & engine_policy_fixdt))
+        t_ti = scheduler_addtask(s, task_type_send, task_subtype_tend,
+                                 3 * ci->tag + 2, 0, ci, cj, 0);
 
-    /* The first send should depend on the engine's send_root. */
-    scheduler_addunlock(s, e->send_root, t_xv);
+      /* The send_rho task depends on the cell's ghost task. */
+      scheduler_addunlock(s, ci->super->ghost, t_rho);
 
-    /* The send_rho task depends on the cell's ghost task. */
-    scheduler_addunlock(s, ci->super->ghost, t_rho);
+      /* The send_rho task should unlock the super-cell's kick task. */
+      scheduler_addunlock(s, t_rho, ci->super->kick);
 
-    /* The send_rho task should unlock the super-cell's kick task. */
-    scheduler_addunlock(s, t_rho, ci->super->kick);
+      /* The send_xv task should unlock the super-cell's ghost task. */
+      scheduler_addunlock(s, t_xv, ci->super->ghost);
 
-    /* The send_xv task should unlock the super-cell's ghost task. */
-    scheduler_addunlock(s, t_xv, ci->super->ghost);
+      /* The super-cell's kick task should unlock the send_ti task. */
+      if (t_ti != NULL) scheduler_addunlock(s, ci->super->kick, t_ti);
+    }
 
+    /* Add them to the local cell. */
+    ci->send_xv = engine_addlink(e, ci->send_xv, t_xv);
+    ci->send_rho = engine_addlink(e, ci->send_rho, t_rho);
+    if (t_ti != NULL) ci->send_ti = engine_addlink(e, ci->send_ti, t_ti);
   }
 
   /* Recurse? */
-  else if (ci->split)
+  if (ci->split)
     for (int k = 0; k < 8; k++)
-      if (ci->progeny[k] != NULL) engine_addtasks_send(e, ci->progeny[k], cj);
+      if (ci->progeny[k] != NULL)
+        engine_addtasks_send(e, ci->progeny[k], cj, t_xv, t_rho, t_ti);
 
 #else
   error("SWIFT was not compiled with MPI support.");
@@ -657,44 +749,52 @@ void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj) {
  * @brief Add recv tasks to a hierarchy of cells.
  *
  * @param e The #engine.
- * @param c The #cell.
+ * @param c The foreign #cell.
  * @param t_xv The recv_xv #task, if it has already been created.
  * @param t_rho The recv_rho #task, if it has already been created.
+ * @param t_ti The recv_ti #task, if required and has already been created.
  */
 
 void engine_addtasks_recv(struct engine *e, struct cell *c, struct task *t_xv,
-                          struct task *t_rho) {
+                          struct task *t_rho, struct task *t_ti) {
 
 #ifdef WITH_MPI
   struct scheduler *s = &e->sched;
 
-  /* Do we need to construct a recv task? */
-  if (t_xv == NULL && c->nr_density > 0) {
+  /* Do we need to construct a recv task?
+     Note that since c is a foreign cell, all its density tasks will involve
+     only the current rank, and thus we don't have to check them.*/
+  if (t_xv == NULL && c->density != NULL) {
 
     /* Create the tasks. */
-    t_xv = c->recv_xv = scheduler_addtask(s, task_type_recv, task_subtype_none,
-                                          2 * c->tag, 0, c, NULL, 0);
-    t_rho = c->recv_rho = scheduler_addtask(
-        s, task_type_recv, task_subtype_none, 2 * c->tag + 1, 0, c, NULL, 0);
-
-    /* The first recv should depend on the engine's recv_root. */
-    scheduler_addunlock(s, e->recv_root, t_xv);
-  }
+    t_xv = scheduler_addtask(s, task_type_recv, task_subtype_none, 3 * c->tag,
+                             0, c, NULL, 0);
+    t_rho = scheduler_addtask(s, task_type_recv, task_subtype_none,
+                              3 * c->tag + 1, 0, c, NULL, 0);
+    if (!(e->policy & engine_policy_fixdt))
+      t_ti = scheduler_addtask(s, task_type_recv, task_subtype_tend,
+                               3 * c->tag + 2, 0, c, NULL, 0);
+  }
+  c->recv_xv = t_xv;
+  c->recv_rho = t_rho;
+  c->recv_ti = t_ti;
 
   /* Add dependencies. */
   for (struct link *l = c->density; l != NULL; l = l->next) {
     scheduler_addunlock(s, t_xv, l->t);
     scheduler_addunlock(s, l->t, t_rho);
   }
-  for (struct link *l = c->force; l != NULL; l = l->next)
+  for (struct link *l = c->force; l != NULL; l = l->next) {
     scheduler_addunlock(s, t_rho, l->t);
+    if (t_ti != NULL) scheduler_addunlock(s, l->t, t_ti);
+  }
   if (c->sorts != NULL) scheduler_addunlock(s, t_xv, c->sorts);
 
   /* Recurse? */
   if (c->split)
     for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL)
-        engine_addtasks_recv(e, c->progeny[k], t_xv, t_rho);
+        engine_addtasks_recv(e, c->progeny[k], t_xv, t_rho, t_ti);
 
 #else
   error("SWIFT was not compiled with MPI support.");
@@ -888,7 +988,8 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
 
     /* Re-link the associated gpart with the buffer offset of the part. */
     if (s->parts[offset_parts + k].gpart != NULL) {
-      s->parts[offset_parts + k].gpart->id = e->proxies[pid].nr_parts_out;
+      s->parts[offset_parts + k].gpart->id_or_neg_offset =
+          -e->proxies[pid].nr_parts_out;
     }
 
     /* Load the part and xpart into the proxy. */
@@ -904,7 +1005,7 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
       error(
           "Do not have a proxy for the requested nodeID %i for part with "
           "id=%lli, x=[%e,%e,%e].",
-          node_id, s->gparts[offset_parts + k].id,
+          node_id, s->gparts[offset_parts + k].id_or_neg_offset,
           s->gparts[offset_gparts + k].x[0], s->gparts[offset_parts + k].x[1],
           s->gparts[offset_gparts + k].x[2]);
     proxy_gparts_load(&e->proxies[pid], &s->gparts[offset_gparts + k], 1);
@@ -964,7 +1065,7 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
     s->xparts = xparts_new;
     for (size_t k = 0; k < offset_parts; k++) {
       if (s->parts[k].gpart != NULL) {
-        s->parts[k].gpart->part = &s->parts[k];
+        s->parts[k].gpart->id_or_neg_offset = -k;
       }
     }
   }
@@ -979,8 +1080,8 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
     free(s->gparts);
     s->gparts = gparts_new;
     for (size_t k = 0; k < offset_gparts; k++) {
-      if (s->gparts[k].id > 0) {
-        s->gparts[k].part->gpart = &s->gparts[k];
+      if (s->gparts[k].id_or_neg_offset < 0) {
+        s->parts[-s->gparts[k].id_or_neg_offset].gpart = &s->gparts[k];
       }
     }
   }
@@ -1053,9 +1154,10 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
       /* Re-link the gparts. */
       for (int k = 0; k < p->nr_gparts_in; k++) {
         struct gpart *gp = &s->gparts[offset_gparts + count_gparts + k];
-        if (gp->id >= 0) {
-          struct part *p = &s->parts[offset_parts + count_parts + gp->id];
-          gp->part = p;
+        if (gp->id_or_neg_offset <= 0) {
+          struct part *p =
+              &s->parts[offset_gparts + count_parts - gp->id_or_neg_offset];
+          gp->id_or_neg_offset = s->parts - p;
           p->gpart = gp;
         }
       }
@@ -1172,6 +1274,7 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
   struct engine *e = (struct engine *)extra_data;
   struct task *tasks = (struct task *)map_data;
   struct scheduler *sched = &e->sched;
+  const int nr_tasks = sched->nr_tasks;
 
   for (int ind = 0; ind < num_elements; ind++) {
 
@@ -1203,20 +1306,23 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
         engine_addlink(e, &t->cj->density, t);
         atomic_inc(&t->cj->nr_density);
       }
-    } else if (t->type == task_type_sub) {
+    } else if (t->type == task_type_sub_self) {
       atomic_inc(&t->ci->nr_tasks);
       if (t->cj != NULL) atomic_inc(&t->cj->nr_tasks);
       if (t->subtype == task_subtype_density) {
         engine_addlink(e, &t->ci->density, t);
         atomic_inc(&t->ci->nr_density);
-        if (t->cj != NULL) {
-          engine_addlink(e, &t->cj->density, t);
-          atomic_inc(&t->cj->nr_density);
-        }
+      }
+    } else if (t->type == task_type_sub_pair) {
+      atomic_inc(&t->ci->nr_tasks);
+      atomic_inc(&t->cj->nr_tasks);
+      if (t->subtype == task_subtype_density) {
+        t->ci->density = engine_addlink(e, t->ci->density, t);
+        atomic_inc(&t->ci->nr_density);
+        t->cj->density = engine_addlink(e, t->cj->density, t);
+        atomic_inc(&t->cj->nr_density);
       }
     }
-  }
-}
 
 void engine_count_and_link_tasks_serial(struct engine *e) {
 
@@ -1252,16 +1358,21 @@ void engine_count_and_link_tasks_serial(struct engine *e) {
         engine_addlink(e, &t->cj->density, t);
         atomic_inc(&t->cj->nr_density);
       }
-    } else if (t->type == task_type_sub) {
+    } else if (t->type == task_type_sub_self) {
       atomic_inc(&t->ci->nr_tasks);
       if (t->cj != NULL) atomic_inc(&t->cj->nr_tasks);
       if (t->subtype == task_subtype_density) {
         engine_addlink(e, &t->ci->density, t);
         atomic_inc(&t->ci->nr_density);
-        if (t->cj != NULL) {
-          engine_addlink(e, &t->cj->density, t);
-          atomic_inc(&t->cj->nr_density);
-        }
+      }
+    } else if (t->type == task_type_sub_pair) {
+      atomic_inc(&t->ci->nr_tasks);
+      atomic_inc(&t->cj->nr_tasks);
+      if (t->subtype == task_subtype_density) {
+        t->ci->density = engine_addlink(e, t->ci->density, t);
+        atomic_inc(&t->ci->nr_density);
+        t->cj->density = engine_addlink(e, t->cj->density, t);
+        atomic_inc(&t->cj->nr_density);
       }
     }
   }
@@ -1443,29 +1554,47 @@ void engine_make_extra_hydroloop_tasks_serial(struct engine *e) {
       }
     }
 
-    /* Otherwise, sub interaction? */
-    else if (t->type == task_type_sub && t->subtype == task_subtype_density) {
+    /* Otherwise, sub-self interaction? */
+    else if (t->type == task_type_sub_self &&
+             t->subtype == task_subtype_density) {
 
       /* Start by constructing the task for the second hydro loop */
       struct task *t2 =
-          scheduler_addtask(sched, task_type_sub, task_subtype_force, t->flags,
-                            0, t->ci, t->cj, 0);
+          scheduler_addtask(sched, task_type_sub_self, task_subtype_force,
+                            t->flags, 0, t->ci, t->cj, 0);
 
-      /* Add the link between the new loop and both cells */
-      engine_addlink(e, &t->ci->force, t2);
+      /* Add the link between the new loop and the cell */
+      t->ci->force = engine_addlink(e, t->ci->force, t2);
       atomic_inc(&t->ci->nr_force);
-      if (t->cj != NULL) {
-        engine_addlink(e, &t->cj->force, t2);
-        atomic_inc(&t->cj->nr_force);
+
+      /* Now, build all the dependencies for the hydro for the cells */
+      /* that are local and are not descendant of the same super-cells */
+      if (t->ci->nodeID == nodeID) {
+        engine_make_hydro_loops_dependencies(sched, t, t2, t->ci);
       }
+    }
+
+    /* Otherwise, sub-pair interaction? */
+    else if (t->type == task_type_sub_pair &&
+             t->subtype == task_subtype_density) {
+
+      /* Start by constructing the task for the second hydro loop */
+      struct task *t2 =
+          scheduler_addtask(sched, task_type_sub_pair, task_subtype_force,
+                            t->flags, 0, t->ci, t->cj, 0);
+
+      /* Add the link between the new loop and both cells */
+      t->ci->force = engine_addlink(e, t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
+      t->cj->force = engine_addlink(e, t->cj->force, t2);
+      atomic_inc(&t->cj->nr_force);
 
       /* Now, build all the dependencies for the hydro for the cells */
       /* that are local and are not descendant of the same super-cells */
       if (t->ci->nodeID == nodeID) {
         engine_make_hydro_loops_dependencies(sched, t, t2, t->ci);
       }
-      if (t->cj != NULL && t->cj->nodeID == nodeID &&
-          t->ci->super != t->cj->super) {
+      if (t->cj->nodeID == nodeID && t->ci->super != t->cj->super) {
         engine_make_hydro_loops_dependencies(sched, t, t2, t->cj);
       }
     }
@@ -1599,8 +1728,14 @@ void engine_maketasks(struct engine *e) {
   engine_count_and_link_tasks_serial(e);
 
   /* Append hierarchical tasks to each cells */
-  for (int k = 0; k < nr_cells; k++)
-    engine_make_hierarchical_tasks(e, &cells[k], NULL);
+  if (e->policy & engine_policy_hydro)
+    for (int k = 0; k < nr_cells; k++)
+      engine_make_hydro_hierarchical_tasks(e, &cells[k], NULL);
+
+  if ((e->policy & engine_policy_self_gravity) ||
+      (e->policy & engine_policy_external_gravity))
+    for (int k = 0; k < nr_cells; k++)
+      engine_make_gravity_hierarchical_tasks(e, &cells[k], NULL);
 
   /* Run through the tasks and make force tasks for each density task.
      Each force task depends on the cell ghosts and unlocks the kick task
@@ -1627,12 +1762,13 @@ void engine_maketasks(struct engine *e) {
       /* Loop through the proxy's incoming cells and add the
          recv tasks. */
       for (int k = 0; k < p->nr_cells_in; k++)
-        engine_addtasks_recv(e, p->cells_in[k], NULL, NULL);
+        engine_addtasks_recv(e, p->cells_in[k], NULL, NULL, NULL);
 
       /* Loop through the proxy's outgoing cells and add the
          send tasks. */
       for (int k = 0; k < p->nr_cells_out; k++)
-        engine_addtasks_send(e, p->cells_out[k], p->cells_in[0]);
+        engine_addtasks_send(e, p->cells_out[k], p->cells_in[0], NULL, NULL,
+                             NULL);
     }
   }
 
@@ -1670,7 +1806,7 @@ void engine_marktasks_fixdt_mapper(void *map_data, int num_elements,
 
     /* Pair? */
     if (t->type == task_type_pair ||
-        (t->type == task_type_sub && t->cj != NULL)) {
+        (t->type == task_type_sub_pair)) {
 
       /* Local pointers. */
       const struct cell *ci = t->ci;
@@ -1727,7 +1863,7 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
 
     /* Pair? */
     else if (t->type == task_type_pair ||
-             (t->type == task_type_sub && t->cj != NULL)) {
+             (t->type == task_type_sub_pair)) {
 
       /* Local pointers. */
       const struct cell *ci = t->ci;
@@ -2114,7 +2250,11 @@ void engine_collect_drift(struct cell *c) {
   c->ang_mom[1] = ang_mom[1];
   c->ang_mom[2] = ang_mom[2];
 }
-
+/**
+ * @brief Print the conserved quantities statistics to a log file
+ *
+ * @param e The #engine.
+ */
 void engine_print_stats(struct engine *e) {
 
   const struct space *s = e->s;
@@ -2228,8 +2368,10 @@ void engine_launch(struct engine *e, int nr_runners, unsigned int mask,
  *forward in time.
  *
  * @param e The #engine
+ * @param flag_entropy_ICs Did the 'Internal Energy' of the particles actually
+ *contain entropy ?
  */
-void engine_init_particles(struct engine *e) {
+void engine_init_particles(struct engine *e, int flag_entropy_ICs) {
 
   struct space *s = e->s;
 
@@ -2288,6 +2430,7 @@ void engine_init_particles(struct engine *e) {
 
     mask |= 1 << task_type_send;
     mask |= 1 << task_type_recv;
+    submask |= 1 << task_subtype_tend;
   }
 
   /* Now, launch the calculation */
@@ -2296,7 +2439,7 @@ void engine_init_particles(struct engine *e) {
   TIMER_TOC(timer_runners);
 
   /* Apply some conversions (e.g. internal energy -> entropy) */
-  space_map_cells_pre(s, 0, cell_convert_hydro, NULL);
+  if (!flag_entropy_ICs) space_map_cells_pre(s, 0, cell_convert_hydro, NULL);
 
   clocks_gettime(&time2);
 
@@ -2394,7 +2537,8 @@ void engine_step(struct engine *e) {
 
     mask |= 1 << task_type_self;
     mask |= 1 << task_type_pair;
-    mask |= 1 << task_type_sub;
+    mask |= 1 << task_type_sub_self;
+    mask |= 1 << task_type_sub_pair;
     mask |= 1 << task_type_ghost;
 
     submask |= 1 << task_subtype_density;
@@ -2417,6 +2561,7 @@ void engine_step(struct engine *e) {
 
     mask |= 1 << task_type_send;
     mask |= 1 << task_type_recv;
+    submask |= 1 << task_subtype_tend;
   }
 
   /* Send off the runners. */
@@ -2582,8 +2727,7 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   s->xparts = xparts_new;
 
   /* Re-link the gparts. */
-  for (size_t k = 0; k < s->nr_parts; k++)
-    if (s->parts[k].gpart != NULL) s->parts[k].gpart->part = &s->parts[k];
+  part_relink_gparts(s->parts, s->nr_parts, 0);
 
   /* Re-allocate the local gparts. */
   if (e->verbose)
@@ -2599,30 +2743,29 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   s->gparts = gparts_new;
 
   /* Re-link the parts. */
-  for (size_t k = 0; k < s->nr_gparts; k++)
-    if (s->gparts[k].id > 0) s->gparts[k].part->gpart = &s->gparts[k];
+  part_relink_parts(s->gparts, s->nr_gparts, s->parts);
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the links are correct */
-  /* MATTHIEU: To be commented out once we are happy */
   for (size_t k = 0; k < s->nr_gparts; ++k) {
 
-    if (s->gparts[k].id > 0) {
+    if (s->gparts[k].id_or_neg_offset <= 0) {
+
+      struct part *part = &s->parts[-s->gparts[k].id_or_neg_offset];
 
-      if (s->gparts[k].part->gpart != &s->gparts[k]) error("Linking problem !");
+      if (part->gpart != &s->gparts[k]) error("Linking problem !");
 
-      if (s->gparts[k].x[0] != s->gparts[k].part->x[0] ||
-          s->gparts[k].x[1] != s->gparts[k].part->x[1] ||
-          s->gparts[k].x[2] != s->gparts[k].part->x[2])
+      if (s->gparts[k].x[0] != part->x[0] || s->gparts[k].x[1] != part->x[1] ||
+          s->gparts[k].x[2] != part->x[2])
         error("Linked particles are not at the same position !");
     }
   }
   for (size_t k = 0; k < s->nr_parts; ++k) {
 
-    if (s->parts[k].gpart != NULL) {
-
-      if (s->parts[k].gpart->part != &s->parts[k]) error("Linking problem !");
-    }
+    if (s->parts[k].gpart != NULL && s->parts[k].gpart->id_or_neg_offset != -k)
+      error("Linking problem !");
   }
+#endif
 
 #else
   error("SWIFT was not compiled with MPI support.");
@@ -3015,9 +3158,6 @@ void engine_init(struct engine *e, struct space *s,
   part_create_mpi_types();
 #endif
 
-  /* Initialize the threadpool. */
-  threadpool_init(&e->threadpool, e->nr_threads);
-
   /* First of all, init the barrier and lock it. */
   if (pthread_mutex_init(&e->barrier_mutex, NULL) != 0)
     error("Failed to initialize barrier mutex.");
@@ -3032,7 +3172,16 @@ void engine_init(struct engine *e, struct space *s,
   /* Init the scheduler with enough tasks for the initial sorting tasks. */
   const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads;
   scheduler_init(&e->sched, e->s, nr_tasks, nr_queues, scheduler_flag_steal,
-                 e->nodeID, &e->threadpool);
+                 e->nodeID);
+
+  /* Create the sorting tasks. */
+  for (int i = 0; i < e->nr_threads; i++) {
+    scheduler_addtask(&e->sched, task_type_part_sort, task_subtype_none, i, 0,
+                      NULL, NULL, 0);
+
+    scheduler_addtask(&e->sched, task_type_gpart_sort, task_subtype_none, i, 0,
+                      NULL, NULL, 0);
+  }
 
   scheduler_ranktasks(&e->sched);
 
diff --git a/src/engine.h b/src/engine.h
index f1fea143e6252aa218ad69210a7e105aefdcc13c..6b00d7a4b8be8305bd8d0377db37a6dba2c7c7a7 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -37,13 +37,10 @@
 #include <stdio.h>
 
 /* Includes. */
-#include "hydro_properties.h"
-#include "lock.h"
+#include "clocks.h"
 #include "parser.h"
 #include "partition.h"
-#include "physical_constants.h"
 #include "potentials.h"
-#include "proxy.h"
 #include "runner.h"
 #include "scheduler.h"
 #include "space.h"
@@ -223,7 +220,7 @@ void engine_launch(struct engine *e, int nr_runners, unsigned int mask,
                    unsigned int submask);
 void engine_prepare(struct engine *e);
 void engine_print(struct engine *e);
-void engine_init_particles(struct engine *e);
+void engine_init_particles(struct engine *e, int flag_entropy_ICs);
 void engine_step(struct engine *e);
 void engine_maketasks(struct engine *e);
 void engine_split(struct engine *e, struct partition *initial_partition);
diff --git a/src/gravity/Default/gravity_debug.h b/src/gravity/Default/gravity_debug.h
index 21d775d703c8bd7000ad03386fe818124642a8f1..7cf375a1fdf7bccc4131dc415ab2d4acbbf2d3bc 100644
--- a/src/gravity/Default/gravity_debug.h
+++ b/src/gravity/Default/gravity_debug.h
@@ -18,7 +18,7 @@
  ******************************************************************************/
 
 __attribute__((always_inline)) INLINE static void gravity_debug_particle(
-    struct gpart* p) {
+    const struct gpart* p) {
   printf(
       "x=[%.3e,%.3e,%.3e], "
       "v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e],\n "
diff --git a/src/gravity/Default/gravity_io.h b/src/gravity/Default/gravity_io.h
index 129c4b39828ca73d2d80d79edbdaa8ec4d5a9e01..74f364dd97361f0513755bedec83fe7cb277c36b 100644
--- a/src/gravity/Default/gravity_io.h
+++ b/src/gravity/Default/gravity_io.h
@@ -39,8 +39,8 @@ __attribute__((always_inline)) INLINE static void darkmatter_read_particles(
             COMPULSORY);
   readArray(h_grp, "Velocities", FLOAT, N, 3, gparts, N_total, offset, v_full,
             COMPULSORY);
-  readArray(h_grp, "ParticleIDs", ULONGLONG, N, 1, gparts, N_total, offset, id,
-            COMPULSORY);
+  readArray(h_grp, "ParticleIDs", ULONGLONG, N, 1, gparts, N_total, offset,
+            id_or_neg_offset, COMPULSORY);
 }
 
 /**
@@ -75,6 +75,6 @@ __attribute__((always_inline)) INLINE static void darkmatter_write_particles(
              Ndm, 3, gparts, Ndm_total, mpi_rank, offset, v_full, us,
              UNIT_CONV_SPEED);
   writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
-             ULONGLONG, Ndm, 1, gparts, Ndm_total, mpi_rank, offset, id, us,
-             UNIT_CONV_NO_UNITS);
+             ULONGLONG, Ndm, 1, gparts, Ndm_total, mpi_rank, offset,
+             id_or_neg_offset, us, UNIT_CONV_NO_UNITS);
 }
diff --git a/src/gravity/Default/gravity_part.h b/src/gravity/Default/gravity_part.h
index a54fc1d18e7e84a1ca7a7e7913117ed207549271..b6c9e62559207cb25323c8a108e4bffc87ea0fcf 100644
--- a/src/gravity/Default/gravity_part.h
+++ b/src/gravity/Default/gravity_part.h
@@ -47,14 +47,8 @@ struct gpart {
   /* float tx; */
   /* float tv; */
 
-  /* Anonymous union for id/part. */
-  union {
-
-    /* Particle ID. */
-    long long id;
-
-    /* Pointer to corresponding SPH part. */
-    struct part* part;
-  };
+  /* Particle ID. If negative, it is the negative offset of the #part with
+     which this gpart is linked. */
+  long long id_or_neg_offset;
 
 } __attribute__((aligned(gpart_align)));
diff --git a/src/hydro/Default/hydro_debug.h b/src/hydro/Default/hydro_debug.h
index 79ee392d46ca75a2c097bf045b2d82c9f3dc96c0..9b8136e8349398e4e9e804459a5de23d21043564 100644
--- a/src/hydro/Default/hydro_debug.h
+++ b/src/hydro/Default/hydro_debug.h
@@ -18,7 +18,7 @@
  ******************************************************************************/
 
 __attribute__((always_inline)) INLINE static void hydro_debug_particle(
-    struct part* p, struct xpart* xp) {
+    const struct part* p, const struct xpart* xp) {
   printf(
       "x=[%.3e,%.3e,%.3e], "
       "v=[%.3e,%.3e,%.3e],v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e],\n "
diff --git a/src/hydro/Default/hydro_iact.h b/src/hydro/Default/hydro_iact.h
index 4d651b61bd934388414d54fa813820111d26e682..0a577931b5e0ca67ab07dfe414a548da66e82cdd 100644
--- a/src/hydro/Default/hydro_iact.h
+++ b/src/hydro/Default/hydro_iact.h
@@ -20,12 +20,6 @@
 #ifndef SWIFT_RUNNER_IACT_H
 #define SWIFT_RUNNER_IACT_H
 
-/* Includes. */
-#include "const.h"
-#include "kernel_hydro.h"
-#include "part.h"
-#include "vector.h"
-
 /**
  * @brief SPH interaction functions following the Gadget-2 version of SPH.
  *
@@ -44,7 +38,6 @@
 /**
  * @brief Density loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -218,7 +211,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density(
 /**
  * @brief Density loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -267,7 +259,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 /**
  * @brief Density loop (non-symmetric vectorized version)
  */
-
 __attribute__((always_inline)) INLINE static void
 runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
                                struct part **pi, struct part **pj) {
@@ -360,7 +351,6 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
 /**
  * @brief Force loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -456,7 +446,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
 /**
  * @brief Force loop (Vectorized version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
@@ -675,7 +664,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
 /**
  * @brief Force loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -766,7 +754,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
 /**
  * @brief Force loop (Vectorized non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
diff --git a/src/hydro/Default/hydro_io.h b/src/hydro/Default/hydro_io.h
index de71963b05149fa0b2df6222424478a6ad9b1f44..fb1f7af922b2272779c7251b15e304880a2af520 100644
--- a/src/hydro/Default/hydro_io.h
+++ b/src/hydro/Default/hydro_io.h
@@ -122,3 +122,10 @@ void writeSPHflavour(hid_t h_grpsph) {
   writeAttribute_f(h_grpsph, "Maximal Delta u change over dt",
                    const_max_u_change);
 }
+
+/**
+ * @brief Are we writing entropy in the internal energy field ?
+ *
+ * @return 1 if entropy is in 'internal energy', 0 otherwise.
+ */
+int writeEntropyFlag() { return 0; }
diff --git a/src/hydro/Gadget2/hydro_debug.h b/src/hydro/Gadget2/hydro_debug.h
index 31e89d438bc96c0f0f2ba56249664d28036cb607..b67e79182ccaaee7c0421c57a91ec9fa2adae65c 100644
--- a/src/hydro/Gadget2/hydro_debug.h
+++ b/src/hydro/Gadget2/hydro_debug.h
@@ -18,7 +18,7 @@
  ******************************************************************************/
 
 __attribute__((always_inline)) INLINE static void hydro_debug_particle(
-    struct part* p, struct xpart* xp) {
+    const struct part* p, const struct xpart* xp) {
   printf(
       "x=[%.3e,%.3e,%.3e], "
       "v=[%.3e,%.3e,%.3e],v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e],\n "
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index 8fcae293280e55bb838edf3c243f4e322914216f..8738b4be09931df4c938f1dff3adeed11468dcfc 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -20,12 +20,6 @@
 #ifndef SWIFT_RUNNER_IACT_LEGACY_H
 #define SWIFT_RUNNER_IACT_LEGACY_H
 
-/* Includes. */
-#include "const.h"
-#include "kernel_hydro.h"
-#include "part.h"
-#include "vector.h"
-
 /**
  * @brief SPH interaction functions following the Gadget-2 version of SPH.
  *
@@ -42,7 +36,6 @@
 /**
  * @brief Density loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -113,7 +106,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 /**
  * @brief Density loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -162,7 +154,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 /**
  * @brief Force loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -260,7 +251,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
 /**
  * @brief Force loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
diff --git a/src/hydro/Gadget2/hydro_io.h b/src/hydro/Gadget2/hydro_io.h
index b977f25386fab0787c925635297a48fa85a8df24..f7ee04cbbad07ebb06d00ee39a614b9fed5c4dfd 100644
--- a/src/hydro/Gadget2/hydro_io.h
+++ b/src/hydro/Gadget2/hydro_io.h
@@ -83,8 +83,8 @@ __attribute__((always_inline)) INLINE static void hydro_write_particles(
   writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength",
              FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us,
              UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Entropy", FLOAT, N,
-             1, parts, N_total, mpi_rank, offset, entropy, us,
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, entropy, us,
              UNIT_CONV_ENTROPY_PER_UNIT_MASS);
   writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
              ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us,
@@ -113,3 +113,10 @@ void writeSPHflavour(hid_t h_grpsph) {
   writeAttribute_f(h_grpsph, "Viscosity alpha", const_viscosity_alpha);
   writeAttribute_f(h_grpsph, "Viscosity beta", 3.f);
 }
+
+/**
+ * @brief Are we writing entropy in the internal energy field ?
+ *
+ * @return 1 if entropy is in 'internal energy', 0 otherwise.
+ */
+int writeEntropyFlag() { return 1; }
diff --git a/src/hydro/Minimal/hydro_debug.h b/src/hydro/Minimal/hydro_debug.h
index 127ba75e99418b6a5dc197a44ccdc77de3cdef15..1e0e3f1f111149a3babeef431893dff2cbe6bb3c 100644
--- a/src/hydro/Minimal/hydro_debug.h
+++ b/src/hydro/Minimal/hydro_debug.h
@@ -18,7 +18,7 @@
  ******************************************************************************/
 
 __attribute__((always_inline)) INLINE static void hydro_debug_particle(
-    struct part* p, struct xpart* xp) {
+    const struct part* p, const struct xpart* xp) {
   printf(
       "x=[%.3e,%.3e,%.3e], "
       "v=[%.3e,%.3e,%.3e],v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e], "
diff --git a/src/hydro/Minimal/hydro_iact.h b/src/hydro/Minimal/hydro_iact.h
index f453d8fa1c495472af27ccf98356a3dab0894e98..c9da185b8a29eafe2a58420ae5de3a05ff043225 100644
--- a/src/hydro/Minimal/hydro_iact.h
+++ b/src/hydro/Minimal/hydro_iact.h
@@ -19,12 +19,6 @@
 #ifndef SWIFT_RUNNER_IACT_MINIMAL_H
 #define SWIFT_RUNNER_IACT_MINIMAL_H
 
-/* Includes. */
-#include "const.h"
-#include "kernel_hydro.h"
-#include "part.h"
-#include "vector.h"
-
 /**
  * @brief Minimal conservative implementation of SPH
  *
@@ -70,7 +64,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 /**
  * @brief Density loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -95,7 +88,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 /**
  * @brief Force loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -168,7 +160,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
 /**
  * @brief Force loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
diff --git a/src/hydro/Minimal/hydro_io.h b/src/hydro/Minimal/hydro_io.h
index 1bbfe1065358017e0b27e2131ff717848221fe9c..bed6b8f8221dc3a6e94c65d101c68aa9b3bdea91 100644
--- a/src/hydro/Minimal/hydro_io.h
+++ b/src/hydro/Minimal/hydro_io.h
@@ -115,3 +115,10 @@ void writeSPHflavour(hid_t h_grpsph) {
   writeAttribute_f(h_grpsph, "Maximal Delta u change over dt",
                    const_max_u_change);
 }
+
+/**
+ * @brief Are we writing entropy in the internal energy field ?
+ *
+ * @return 1 if entropy is in 'internal energy', 0 otherwise.
+ */
+int writeEntropyFlag() { return 0; }
diff --git a/src/hydro_properties.c b/src/hydro_properties.c
index b4e37d672408ad3c0adf1f37fcd4fcbd50095d92..16216f81a5b505fc3a887e86ca4898bc4179e4d5 100644
--- a/src/hydro_properties.c
+++ b/src/hydro_properties.c
@@ -29,6 +29,9 @@
 #include "hydro.h"
 #include "kernel_hydro.h"
 
+#define hydro_props_default_max_iterations 30
+#define hydro_props_default_volume_change 2.0f
+
 void hydro_props_init(struct hydro_props *p,
                       const struct swift_params *params) {
 
@@ -39,13 +42,13 @@ void hydro_props_init(struct hydro_props *p,
   p->delta_neighbours = parser_get_param_float(params, "SPH:delta_neighbours");
 
   /* Ghost stuff */
-  p->max_smoothing_iterations =
-      parser_get_param_int(params, "SPH:max_ghost_iterations");
+  p->max_smoothing_iterations = parser_get_opt_param_int(
+      params, "SPH:max_ghost_iterations", hydro_props_default_max_iterations);
 
   /* Time integration properties */
   p->CFL_condition = parser_get_param_float(params, "SPH:CFL_condition");
-  const float max_volume_change =
-      parser_get_param_float(params, "SPH:max_volume_change");
+  const float max_volume_change = parser_get_opt_param_float(
+      params, "SPH:max_volume_change", hydro_props_default_volume_change);
   p->log_max_h_change = logf(powf(max_volume_change, 0.33333333333f));
 }
 
@@ -60,4 +63,8 @@ void hydro_props_print(const struct hydro_props *p) {
       "Hydrodynamic integration: Max change of volume: %.2f "
       "(max|dlog(h)/dt|=%f).",
       powf(expf(p->log_max_h_change), 3.f), p->log_max_h_change);
+
+  if (p->max_smoothing_iterations != hydro_props_default_max_iterations)
+    message("Maximal iterations in ghost task set to %d (default is %d)",
+            p->max_smoothing_iterations, hydro_props_default_max_iterations);
 }
diff --git a/src/lock.h b/src/lock.h
index 31f80605efb29bfaaadc65e25ba4f31679f8fbe8..b2dd2eac9d0ca5d7807907e31cf3fa31894f9aed 100644
--- a/src/lock.h
+++ b/src/lock.h
@@ -23,7 +23,7 @@
 #include <pthread.h>
 
 /* Includes. */
-#include "inline.h"
+#include "atomic.h"
 
 #ifdef PTHREAD_SPINLOCK
 #include <pthread.h>
@@ -50,14 +50,14 @@
 #define lock_init(l) (*(l) = 0)
 #define lock_destroy(l) 0
 INLINE static int lock_lock(volatile int *l) {
-  while (__sync_val_compare_and_swap(l, 0, 1) != 0)
+  while (atomic_cas(l, 0, 1) != 0)
     ;
   // while( *l );
   return 0;
 }
-#define lock_trylock(l) ((*(l)) ? 1 : __sync_val_compare_and_swap(l, 0, 1))
-#define lock_unlock(l) (__sync_val_compare_and_swap(l, 1, 0) != 1)
-#define lock_unlock_blind(l) __sync_val_compare_and_swap(l, 1, 0)
+#define lock_trylock(l) ((*(l)) ? 1 : atomic_cas(l, 0, 1))
+#define lock_unlock(l) (atomic_cas(l, 1, 0) != 1)
+#define lock_unlock_blind(l) atomic_cas(l, 1, 0)
 #endif
 
 #endif /* SWIFT_LOCK_H */
diff --git a/src/map.c b/src/map.c
index fbe57fde7b1e29c49b0f27d86d177245dd9a27e2..f4f9ac7cfa7141606b578739517b66e951e65eab 100644
--- a/src/map.c
+++ b/src/map.c
@@ -18,15 +18,23 @@
  *
  ******************************************************************************/
 
-#include "map.h"
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <stdio.h>
 #include <stdlib.h>
+
+/* This object's header. */
+#include "map.h"
+
+/* Local headers. */
+#include "atomic.h"
 #include "error.h"
 
 /**
  * @brief Mapping function to draw a specific cell (gnuplot).
  */
-
 void map_cells_plot(struct cell *c, void *data) {
 
   int depth = *(int *)data;
@@ -80,24 +88,21 @@ void map_cells_plot(struct cell *c, void *data) {
 /**
  * @brief Mapping function for checking if each part is in its box.
  */
+void map_check(struct part *p, struct cell *c, void *data) {
 
-/* void map_check ( struct part *p , struct cell *c , void *data ) {
-
-    if ( p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] ||
-         p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] ||
-         p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] )
-        printf( "map_check: particle %i is outside of its box.\n" , p->id );
-
-    } */
+  if (p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->h[0] ||
+      p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->h[0] ||
+      p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->h[0])
+    printf("map_check: particle %lld is outside of its box.\n", p->id);
+}
 
 /**
  * @brief Mapping function for neighbour count.
  */
-
 void map_cellcheck(struct cell *c, void *data) {
 
   int *count = (int *)data;
-  __sync_fetch_and_add(count, c->count);
+  atomic_add(count, c->count);
 
   /* Loop over all parts and check if they are in the cell. */
   for (int k = 0; k < c->count; k++) {
@@ -133,7 +138,6 @@ void map_cellcheck(struct cell *c, void *data) {
 /**
  * @brief Mapping function for maxdepth cell count.
  */
-
 void map_maxdepth(struct cell *c, void *data) {
 
   int maxdepth = ((int *)data)[0];
@@ -147,7 +151,6 @@ void map_maxdepth(struct cell *c, void *data) {
 /**
  * @brief Mapping function for neighbour count.
  */
-
 void map_count(struct part *p, struct cell *c, void *data) {
 
   double *wcount = (double *)data;
@@ -156,7 +159,6 @@ void map_count(struct part *p, struct cell *c, void *data) {
 
   *wcount += p->density.wcount;
 }
-
 void map_wcount_min(struct part *p, struct cell *c, void *data) {
 
   struct part **p2 = (struct part **)data;
@@ -188,7 +190,6 @@ void map_h_max(struct part *p, struct cell *c, void *data) {
 /**
  * @brief Mapping function for neighbour count.
  */
-
 void map_icount(struct part *p, struct cell *c, void *data) {
 
   // int *count = (int *)data;
@@ -201,7 +202,6 @@ void map_icount(struct part *p, struct cell *c, void *data) {
 /**
  * @brief Mapping function to print the particle position.
  */
-
 void map_dump(struct part *p, struct cell *c, void *data) {
 
   double *shift = (double *)data;
diff --git a/src/parallel_io.c b/src/parallel_io.c
index e779b56a85da3db72ea860fb336fa42037fbcd0e..66b02d2bff2bc0d6f2dd69d994a93576e8b96583 100644
--- a/src/parallel_io.c
+++ b/src/parallel_io.c
@@ -37,7 +37,11 @@
 
 /* Local includes. */
 #include "common_io.h"
+#include "engine.h"
 #include "error.h"
+#include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 
 /**
  * @brief Reads a data array from a given HDF5 group.
@@ -363,8 +367,8 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
  */
 void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
                       struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
-                      int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
-                      MPI_Info info, int dry_run) {
+                      int* periodic, int* flag_entropy, int mpi_rank,
+                      int mpi_size, MPI_Comm comm, MPI_Info info, int dry_run) {
   hid_t h_file = 0, h_grp = 0;
   /* GADGET has only cubic boxes (in cosmological mode) */
   double boxSize[3] = {0.0, -1.0, -1.0};
@@ -400,6 +404,7 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
   if (h_grp < 0) error("Error while opening file header\n");
 
   /* Read the relevant information and print status */
+  readAttribute(h_grp, "Flag_Entropy_ICs", INT, flag_entropy);
   readAttribute(h_grp, "BoxSize", DOUBLE, boxSize);
   readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
   readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
@@ -622,6 +627,7 @@ void write_output_parallel(struct engine* e, const char* baseName,
   double MassTable[6] = {0., 0., 0., 0., 0., 0.};
   writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES);
   unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0};
+  flagEntropy[0] = writeEntropyFlag();
   writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy,
                  NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1);
diff --git a/src/parallel_io.h b/src/parallel_io.h
index 970ad8c41dcbc2df3a85b178f836e16926147788..709479fab029e862caddb36c4216b85077e96cec 100644
--- a/src/parallel_io.h
+++ b/src/parallel_io.h
@@ -19,6 +19,9 @@
 #ifndef SWIFT_PARALLEL_IO_H
 #define SWIFT_PARALLEL_IO_H
 
+/* Config parameters. */
+#include "../config.h"
+
 /* MPI headers. */
 #ifdef WITH_MPI
 #include <mpi.h>
@@ -33,8 +36,8 @@
 
 void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
                       struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
-                      int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
-                      MPI_Info info, int dry_run);
+                      int* periodic, int* flag_entropy, int mpi_rank,
+                      int mpi_size, MPI_Comm comm, MPI_Info info, int dry_run);
 
 void write_output_parallel(struct engine* e, const char* baseName,
                            struct UnitSystem* us, int mpi_rank, int mpi_size,
diff --git a/src/part.c b/src/part.c
index b89abdde40fe8c7a57d1e9ac9e18fece83ba1f21..b00eaccaae0e86f7c4e8019a307f0bf455687b7c 100644
--- a/src/part.c
+++ b/src/part.c
@@ -29,6 +29,36 @@
 #include "error.h"
 #include "part.h"
 
+/**
+ * @brief Re-link the #gpart%s associated with the list of #part%s.
+ *
+ * @param parts The list of #part.
+ * @param N The number of particles to re-link;
+ * @param offset The offset of #part%s relative to the global parts list.
+ */
+void part_relink_gparts(struct part *parts, size_t N, ptrdiff_t offset) {
+  for (size_t k = 0; k < N; k++) {
+    if (parts[k].gpart) {
+      parts[k].gpart->id_or_neg_offset = -(k + offset);
+    }
+  }
+}
+
+/**
+ * @brief Re-link the #gpart%s associated with the list of #part%s.
+ *
+ * @param gparts The list of #gpart.
+ * @param N The number of particles to re-link;
+ * @param parts The global part array in which to find the #gpart offsets.
+ */
+void part_relink_parts(struct gpart *gparts, size_t N, struct part *parts) {
+  for (size_t k = 0; k < N; k++) {
+    if (gparts[k].id_or_neg_offset <= 0) {
+      parts[-gparts[k].id_or_neg_offset].gpart = &gparts[k];
+    }
+  }
+}
+
 #ifdef WITH_MPI
 /* MPI data type for the particle transfers */
 MPI_Datatype part_mpi_type;
diff --git a/src/part.h b/src/part.h
index 5d4c9c88a1acadea3d23a3df618c04da389fb61d..efca7b6b5bef49f20df1e2c45b30f65ecbbf4960 100644
--- a/src/part.h
+++ b/src/part.h
@@ -22,8 +22,8 @@
 /* Config parameters. */
 #include "../config.h"
 
-/* Some standard headers. */
-#include <stdlib.h>
+/* Standard headers. */
+#include <stddef.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
@@ -38,7 +38,7 @@
 #define xpart_align 32
 #define gpart_align 32
 
-/* Import the right particle definition */
+/* Import the right hydro particle definition */
 #if defined(MINIMAL_SPH)
 #include "./hydro/Minimal/hydro_part.h"
 #elif defined(GADGET2_SPH)
@@ -49,8 +49,11 @@
 #error "Invalid choice of SPH variant"
 #endif
 
+/* Import the right gravity particle definition */
 #include "./gravity/Default/gravity_part.h"
 
+void part_relink_gparts(struct part *parts, size_t N, ptrdiff_t offset);
+void part_relink_parts(struct gpart *gparts, size_t N, struct part *parts);
 #ifdef WITH_MPI
 /* MPI data type for the particle transfers */
 extern MPI_Datatype part_mpi_type;
diff --git a/src/partition.c b/src/partition.c
index b79dfd6b81d727b6e53f9e0906c7c59eee6607f1..6df437826de796c05143b2003dbdacb971d9b7be 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -453,9 +453,9 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
 
     /* Skip un-interesting tasks. */
     if (t->type != task_type_self && t->type != task_type_pair &&
-        t->type != task_type_sub && t->type != task_type_ghost &&
-        t->type != task_type_drift && t->type != task_type_kick &&
-        t->type != task_type_init)
+        t->type != task_type_sub_self && t->type != task_type_sub_self &&
+        t->type != task_type_ghost && t->type != task_type_drift &&
+        t->type != task_type_kick && t->type != task_type_init)
       continue;
 
     /* Get the task weight. */
@@ -496,15 +496,15 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
 
     /* Self interaction? */
     else if ((t->type == task_type_self && ci->nodeID == nodeID) ||
-             (t->type == task_type_sub && cj == NULL && ci->nodeID == nodeID)) {
+             (t->type == task_type_sub_self && cj == NULL &&
+              ci->nodeID == nodeID)) {
       /* Self interactions add only to vertex weight. */
       if (taskvweights) weights_v[cid] += w;
 
     }
 
     /* Pair? */
-    else if (t->type == task_type_pair ||
-             (t->type == task_type_sub && cj != NULL)) {
+    else if (t->type == task_type_pair || (t->type == task_type_sub_pair)) {
       /* In-cell pair? */
       if (ci == cj) {
         /* Add weight to vertex for ci. */
diff --git a/src/queue.c b/src/queue.c
index a62ded2561fc683bf2f44abcf1110b0fb7eebd0c..9883d77e66421eda6093331e0c9a8f6ac0155ded 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -38,22 +38,11 @@
 #include "const.h"
 #include "error.h"
 
-/* Counter macros. */
-#ifdef COUNTER
-#define COUNT(c) (__sync_add_and_fetch(&queue_counter[c], 1))
-#else
-#define COUNT(c)
-#endif
-
-/* The counters. */
-int queue_counter[queue_counter_count];
-
 /**
  * @brief Enqueue all tasks in the incoming DEQ.
  *
  * @param q The #queue, assumed to be locked.
  */
-
 void queue_get_incoming(struct queue *q) {
 
   int *tid = q->tid;
@@ -108,7 +97,6 @@ void queue_get_incoming(struct queue *q) {
  * @param q The #queue.
  * @param t The #task.
  */
-
 void queue_insert(struct queue *q, struct task *t) {
   /* Get an index in the DEQ. */
   const int ind = atomic_inc(&q->last_incoming) % queue_incoming_size;
@@ -140,7 +128,6 @@ void queue_insert(struct queue *q, struct task *t) {
  * @param q The #queue.
  * @param tasks List of tasks to which the queue indices refer to.
  */
-
 void queue_init(struct queue *q, struct task *tasks) {
 
   /* Allocate the task list if needed. */
@@ -176,7 +163,6 @@ void queue_init(struct queue *q, struct task *tasks) {
  * @param prev The previous #task extracted from this #queue.
  * @param blocking Block until access to the queue is granted.
  */
-
 struct task *queue_gettask(struct queue *q, const struct task *prev,
                            int blocking) {
 
diff --git a/src/runner.c b/src/runner.c
index b51cb0898b8a21d185eca9bc7ff99b25cfbce700..78c183de8922235cf5729b3e16e7e7b37ed9a826 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -40,6 +40,7 @@
 /* Local headers. */
 #include "approx_math.h"
 #include "atomic.h"
+#include "cell.h"
 #include "const.h"
 #include "debug.h"
 #include "drift.h"
@@ -86,9 +87,6 @@ const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
 #define FUNCTION force
 #include "runner_doiact.h"
 
-/* Import the gravity loop functions. */
-#include "runner_doiact_grav.h"
-
 /**
  * @brief Calculate gravity acceleration from external potential
  *
@@ -98,8 +96,8 @@ const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  */
 void runner_do_grav_external(struct runner *r, struct cell *c, int timer) {
 
-  struct gpart *g, *gparts = c->gparts;
-  int i, k, gcount = c->gcount;
+  struct gpart *restrict gparts = c->gparts;
+  const int gcount = c->gcount;
   const int ti_current = r->e->ti_current;
   const struct external_potential *potential = r->e->external_potential;
   const struct phys_const *constants = r->e->physical_constants;
@@ -108,7 +106,7 @@ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) {
 
   /* Recurse? */
   if (c->split) {
-    for (k = 0; k < 8; k++)
+    for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL) runner_do_grav_external(r, c->progeny[k], 0);
     return;
   }
@@ -118,10 +116,10 @@ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) {
 #endif
 
   /* Loop over the parts in this cell. */
-  for (i = 0; i < gcount; i++) {
+  for (int i = 0; i < gcount; i++) {
 
     /* Get a direct pointer on the part. */
-    g = &gparts[i];
+    struct gpart *const g = &gparts[i];
 
     /* Is this part within the time step? */
     if (g->ti_end <= ti_current) {
@@ -138,7 +136,6 @@ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) {
  * @param sort The entries
  * @param N The number of entries.
  */
-
 void runner_do_sort_ascending(struct entry *sort, int N) {
 
   struct {
@@ -220,7 +217,6 @@ void runner_do_sort_ascending(struct entry *sort, int N) {
  * @param clock Flag indicating whether to record the timing or not, needed
  *      for recursive calls.
  */
-
 void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
 
   struct entry *finger;
@@ -350,18 +346,18 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
       }
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify the sorting. */
-  /* for ( j = 0 ; j < 13 ; j++ ) {
-      if ( !( flags & (1 << j) ) )
-          continue;
-      finger = &sort[ j*(count + 1) ];
-      for ( k = 1 ; k < count ; k++ ) {
-          if ( finger[k].d < finger[k-1].d )
-              error( "Sorting failed, ascending array." );
-          if ( finger[k].i >= count )
-              error( "Sorting failed, indices borked." );
-          }
-      } */
+  for (j = 0; j < 13; j++) {
+    if (!(flags & (1 << j))) continue;
+    finger = &sort[j * (count + 1)];
+    for (k = 1; k < count; k++) {
+      if (finger[k].d < finger[k - 1].d)
+        error("Sorting failed, ascending array.");
+      if (finger[k].i >= count) error("Sorting failed, indices borked.");
+    }
+  }
+#endif
 
   if (clock) TIMER_TOC(timer_dosort);
 }
@@ -426,7 +422,6 @@ void runner_do_init(struct runner *r, struct cell *c, int timer) {
  * @param r The runner thread.
  * @param c The cell.
  */
-
 void runner_do_ghost(struct runner *r, struct cell *c) {
 
   struct part *p, *parts = c->parts;
@@ -553,8 +548,13 @@ void runner_do_ghost(struct runner *r, struct cell *c) {
 
           }
 
-          /* Otherwise, sub interaction? */
-          else if (l->t->type == task_type_sub) {
+          /* Otherwise, sub-self interaction? */
+          else if (l->t->type == task_type_sub_self)
+            runner_dosub_subset_density(r, finger, parts, pid, count, NULL, -1,
+                                        1);
+
+          /* Otherwise, sub-pair interaction? */
+          else if (l->t->type == task_type_sub_pair) {
 
             /* Left or right? */
             if (l->t->ci == finger)
@@ -760,7 +760,7 @@ void runner_do_kick_fixdt(struct runner *r, struct cell *c, int timer) {
       struct gpart *const gp = &gparts[k];
 
       /* If the g-particle has no counterpart */
-      if (gp->id < 0) {
+      if (gp->id_or_neg_offset > 0) {
 
         /* First, finish the force calculation */
         gravity_end_force(gp);
@@ -872,7 +872,7 @@ void runner_do_kick(struct runner *r, struct cell *c, int timer) {
       struct gpart *const gp = &gparts[k];
 
       /* If the g-particle has no counterpart and needs to be kicked */
-      if (gp->id < 0) {
+      if (gp->id_or_neg_offset > 0) {
 
         if (gp->ti_end <= ti_current) {
 
@@ -980,19 +980,36 @@ void runner_do_recv_cell(struct runner *r, struct cell *c, int timer) {
   int ti_end_max = 0;
   float h_max = 0.f;
 
-  /* Collect everything... */
-  for (size_t k = 0; k < nr_parts; k++) {
-    const int ti_end = parts[k].ti_end;
-    // if(ti_end < ti_current) error("Received invalid particle !");
-    ti_end_min = min(ti_end_min, ti_end);
-    ti_end_max = max(ti_end_max, ti_end);
-    h_max = fmaxf(h_max, parts[k].h);
+  /* If this cell is a leaf, collect the particle data. */
+  if (!c->split) {
+
+    /* Collect everything... */
+    for (size_t k = 0; k < nr_parts; k++) {
+      const int ti_end = parts[k].ti_end;
+      // if(ti_end < ti_current) error("Received invalid particle !");
+      ti_end_min = min(ti_end_min, ti_end);
+      ti_end_max = max(ti_end_max, ti_end);
+      h_max = fmaxf(h_max, parts[k].h);
+    }
+    for (size_t k = 0; k < nr_gparts; k++) {
+      const int ti_end = gparts[k].ti_end;
+      // if(ti_end < ti_current) error("Received invalid particle !");
+      ti_end_min = min(ti_end_min, ti_end);
+      ti_end_max = max(ti_end_max, ti_end);
+    }
+
   }
-  for (size_t k = 0; k < nr_gparts; k++) {
-    const int ti_end = gparts[k].ti_end;
-    // if(ti_end < ti_current) error("Received invalid particle !");
-    ti_end_min = min(ti_end_min, ti_end);
-    ti_end_max = max(ti_end_max, ti_end);
+
+  /* Otherwise, recurse and collect. */
+  else {
+    for (int k = 0; k < 8; k++) {
+      if (c->progeny[k] != NULL) {
+        runner_do_recv_cell(r, c->progeny[k], 0);
+        ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
+        ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
+        h_max = fmaxf(h_max, c->progeny[k]->h_max);
+      }
+    }
   }
 
   /* ... and store. */
@@ -1066,13 +1083,19 @@ void *runner_main(void *data) {
         case task_type_sort:
           runner_do_sort(r, ci, t->flags, 1);
           break;
-        case task_type_sub:
+        case task_type_sub_self:
+          if (t->subtype == task_subtype_density)
+            runner_dosub_self1_density(r, ci, 1);
+          else if (t->subtype == task_subtype_force)
+            runner_dosub_self2_force(r, ci, 1);
+          else
+            error("Unknown task subtype.");
+          break;
+        case task_type_sub_pair:
           if (t->subtype == task_subtype_density)
-            runner_dosub1_density(r, ci, cj, t->flags, 1);
+            runner_dosub_pair1_density(r, ci, cj, t->flags, 1);
           else if (t->subtype == task_subtype_force)
-            runner_dosub2_force(r, ci, cj, t->flags, 1);
-          else if (t->subtype == task_subtype_grav)
-            runner_dosub_grav(r, ci, cj, 1);
+            runner_dosub_pair2_force(r, ci, cj, t->flags, 1);
           else
             error("Unknown task subtype.");
           break;
@@ -1092,24 +1115,17 @@ void *runner_main(void *data) {
           runner_do_kick_fixdt(r, ci, 1);
           break;
         case task_type_send:
+          if (t->subtype == task_subtype_tend) {
+            free(t->buff);
+          }
           break;
         case task_type_recv:
-          runner_do_recv_cell(r, ci, 1);
-          break;
-        case task_type_grav_pp:
-          if (t->cj == NULL)
-            runner_doself_grav(r, t->ci);
-          else
-            runner_dopair_grav(r, t->ci, t->cj);
-          break;
-        case task_type_grav_mm:
-          runner_dograv_mm(r, t->ci, t->cj);
-          break;
-        case task_type_grav_up:
-          runner_dograv_up(r, t->ci);
-          break;
-        case task_type_grav_down:
-          runner_dograv_down(r, t->ci);
+          if (t->subtype == task_subtype_tend) {
+            cell_unpack_ti_ends(ci, t->buff);
+            free(t->buff);
+          } else {
+            runner_do_recv_cell(r, ci, 1);
+          }
           break;
         case task_type_grav_external:
           runner_do_grav_external(r, t->ci, 1);
diff --git a/src/runner.h b/src/runner.h
index 758b6cf57dbc1a46b6fc068a23615f3b28c8707e..6838b959955c4e54e208b8d2d16339e7fdb1740f 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -23,13 +23,12 @@
 #ifndef SWIFT_RUNNER_H
 #define SWIFT_RUNNER_H
 
-/* Includes. */
-#include "cell.h"
-#include "inline.h"
-
 extern const double runner_shift[13][3];
 extern const char runner_flip[27];
 
+struct cell;
+struct engine;
+
 /* A struct representing a runner's thread and its data. */
 struct runner {
 
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index e0c13e4e0adb101d9a8975651f24225551277f43..4da83b940d55460c4bf8d2f650534aedf94dbd5d 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -1,7 +1,7 @@
-
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *               2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -18,10 +18,6 @@
  *
  ******************************************************************************/
 
-/* Includes. */
-#include "cell.h"
-#include "part.h"
-
 /* Before including this file, define FUNCTION, which is the
    name of the interaction function. This creates the interaction functions
    runner_dopair_FUNCTION, runner_dopair_FUNCTION_naive, runner_doself_FUNCTION,
@@ -57,11 +53,17 @@
 #define _DOSELF_SUBSET(f) PASTE(runner_doself_subset, f)
 #define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION)
 
-#define _DOSUB1(f) PASTE(runner_dosub1, f)
-#define DOSUB1 _DOSUB1(FUNCTION)
+#define _DOSUB_SELF1(f) PASTE(runner_dosub_self1, f)
+#define DOSUB_SELF1 _DOSUB_SELF1(FUNCTION)
+
+#define _DOSUB_PAIR1(f) PASTE(runner_dosub_pair1, f)
+#define DOSUB_PAIR1 _DOSUB_PAIR1(FUNCTION)
+
+#define _DOSUB_SELF2(f) PASTE(runner_dosub_self2, f)
+#define DOSUB_SELF2 _DOSUB_SELF2(FUNCTION)
 
-#define _DOSUB2(f) PASTE(runner_dosub2, f)
-#define DOSUB2 _DOSUB2(FUNCTION)
+#define _DOSUB_PAIR2(f) PASTE(runner_dosub_pair2, f)
+#define DOSUB_PAIR2 _DOSUB_PAIR2(FUNCTION)
 
 #define _DOSUB_SUBSET(f) PASTE(runner_dosub_subset, f)
 #define DOSUB_SUBSET _DOSUB_SUBSET(FUNCTION)
@@ -72,14 +74,23 @@
 #define _IACT(f) PASTE(runner_iact, f)
 #define IACT _IACT(FUNCTION)
 
+#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f)
+#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION)
+
+#define _IACT_VEC(f) PASTE(runner_iact_vec, f)
+#define IACT_VEC _IACT_VEC(FUNCTION)
+
 #define _TIMER_DOSELF(f) PASTE(timer_doself, f)
 #define TIMER_DOSELF _TIMER_DOSELF(FUNCTION)
 
 #define _TIMER_DOPAIR(f) PASTE(timer_dopair, f)
 #define TIMER_DOPAIR _TIMER_DOPAIR(FUNCTION)
 
-#define _TIMER_DOSUB(f) PASTE(timer_dosub, f)
-#define TIMER_DOSUB _TIMER_DOSUB(FUNCTION)
+#define _TIMER_DOSUB_SELF(f) PASTE(timer_dosub_self, f)
+#define TIMER_DOSUB_SELF _TIMER_DOSUB_SELF(FUNCTION)
+
+#define _TIMER_DOSUB_PAIR(f) PASTE(timer_dosub_pair, f)
+#define TIMER_DOSUB_PAIR _TIMER_DOSUB_PAIR(FUNCTION)
 
 #define _TIMER_DOSELF_SUBSET(f) PASTE(timer_doself_subset, f)
 #define TIMER_DOSELF_SUBSET _TIMER_DOSELF_SUBSET(FUNCTION)
@@ -87,12 +98,6 @@
 #define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset, f)
 #define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION)
 
-#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f)
-#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION)
-
-#define _IACT_VEC(f) PASTE(runner_iact_vec, f)
-#define IACT_VEC _IACT_VEC(FUNCTION)
-
 /**
  * @brief Compute the interactions between a cell pair.
  *
@@ -100,18 +105,14 @@
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
-
 void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
                   struct cell *restrict cj) {
 
-  struct engine *e = r->e;
-  int pid, pjd, k, count_i = ci->count, count_j = cj->count;
-  double shift[3] = {0.0, 0.0, 0.0};
-  struct part *restrict parts_i = ci->parts, *restrict parts_j = cj->parts;
-  struct part *restrict pi, *restrict pj;
-  double pix[3];
-  float dx[3], hi, hig2, r2;
+  const struct engine *e = r->e;
   const int ti_current = e->ti_current;
+
+  error("Don't use in actual runs ! Slow code !");
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -120,44 +121,47 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+
   /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
     if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
       shift[k] = e->s->dim[k];
     else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
       shift[k] = -e->s->dim[k];
   }
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
-
   /* Loop over the parts in ci. */
-  for (pid = 0; pid < count_i; pid++) {
+  for (int pid = 0; pid < count_i; pid++) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts_i[pid];
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    struct part *restrict pi = &parts_i[pid];
+    const float hi = pi->h;
+
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_j; pjd++) {
+    for (int pjd = 0; pjd < count_j; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts_j[pjd];
+      struct part *restrict pj = &parts_j[pjd];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -198,7 +202,7 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -207,12 +211,10 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
 
 void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
 
-  int pid, pjd, k, count = c->count;
-  struct part *restrict parts = c->parts;
-  struct part *restrict pi, *restrict pj;
-  double pix[3] = {0.0, 0.0, 0.0};
-  float dx[3], hi, hig2, r2;
   const int ti_current = r->e->ti_current;
+
+  error("Don't use in actual runs ! Slow code !");
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -221,38 +223,34 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (c->ti_end_min > ti_current) return;
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
+  const int count = c->count;
+  struct part *restrict parts = c->parts;
 
   /* Loop over the parts in ci. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts[pid];
-    pix[0] = pi->x[0];
-    pix[1] = pi->x[1];
-    pix[2] = pi->x[2];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    struct part *restrict pi = &parts[pid];
+    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = pid + 1; pjd < count; pjd++) {
+    for (int pjd = pid + 1; pjd < count; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts[pjd];
+      struct part *restrict pj = &parts[pjd];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -293,7 +291,7 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -311,18 +309,121 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
  * @param count The number of particles in @c ind.
  * @param cj The second #cell.
  */
+void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
+                         struct part *restrict parts_i, int *restrict ind,
+                         int count, struct cell *restrict cj) {
+
+  struct engine *e = r->e;
+
+  error("Don't use in actual runs ! Slow code !");
+
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+
+  TIMER_TIC
+
+  const int count_j = cj->count;
+  struct part *restrict parts_j = cj->parts;
+
+  /* Get the relative distance between the pairs, wrapping. */
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
+    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
+      shift[k] = e->s->dim[k];
+    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
+      shift[k] = -e->s->dim[k];
+  }
+
+  /* Loop over the parts_i. */
+  for (int pid = 0; pid < count; pid++) {
+
+    /* Get a hold of the ith part in ci. */
+    struct part *restrict pi = &parts_i[ind[pid]];
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
+
+    /* Loop over the parts in cj. */
+    for (int pjd = 0; pjd < count_j; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      struct part *restrict pj = &parts_j[pjd];
+
+      /* Compute the pairwise distance. */
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 < hig2) {
+
+#ifndef VECTORIZE
+
+        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = dx[0];
+        dxq[3 * icount + 1] = dx[1];
+        dxq[3 * icount + 2] = dx[2];
+        hiq[icount] = hi;
+        hjq[icount] = pj->h;
+        piq[icount] = pi;
+        pjq[icount] = pj;
+        icount += 1;
+
+        /* Flush? */
+        if (icount == VEC_SIZE) {
+          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
+
+#endif
+      }
+
+    } /* loop over the parts in cj. */
+
+  } /* loop over the parts in ci. */
+
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (int k = 0; k < icount; k++)
+      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+#endif
+
+  TIMER_TOC(timer_dopair_subset);
+}
 
+/**
+ * @brief Compute the interactions between a cell pair, but only for the
+ *      given indices in ci.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param parts_i The #part to interact with @c cj.
+ * @param ind The list of indices of particles in @c ci to interact with.
+ * @param count The number of particles in @c ind.
+ * @param cj The second #cell.
+ */
 void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
                    struct part *restrict parts_i, int *restrict ind, int count,
                    struct cell *restrict cj) {
 
   struct engine *e = r->e;
-  int pid, pjd, sid, k, count_j = cj->count, flipped;
-  double shift[3] = {0.0, 0.0, 0.0};
-  struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
-  double pix[3];
-  float dx[3], hi, hig2, r2, di, dxj;
-  struct entry *sort_j;
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -331,10 +432,15 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
+  const int count_j = cj->count;
+  struct part *restrict parts_j = cj->parts;
+
   /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
     if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
       shift[k] = e->s->dim[k];
     else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
@@ -342,52 +448,50 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   }
 
   /* Get the sorting index. */
-  for (sid = 0, k = 0; k < 3; k++)
+  int sid = 0;
+  for (int k = 0; k < 3; k++)
     sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
                          ? 0
                          : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);
 
   /* Switch the cells around? */
-  flipped = runner_flip[sid];
+  const int flipped = runner_flip[sid];
   sid = sortlistID[sid];
 
   /* Have the cells been sorted? */
   if (!(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells.");
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
-
   /* Pick-out the sorted lists. */
-  sort_j = &cj->sort[sid * (cj->count + 1)];
-  dxj = cj->dx_max;
+  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
+  const float dxj = cj->dx_max;
 
   /* Parts are on the left? */
   if (!flipped) {
 
     /* Loop over the parts_i. */
-    for (pid = 0; pid < count; pid++) {
+    for (int pid = 0; pid < count; pid++) {
 
       /* Get a hold of the ith part in ci. */
-      pi = &parts_i[ind[pid]];
-      for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-      hi = pi->h;
-      hig2 = hi * hi * kernel_gamma2;
-      di = hi * kernel_gamma + dxj + pix[0] * runner_shift[sid][0] +
-           pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2];
+      struct part *restrict pi = &parts_i[ind[pid]];
+      double pix[3];
+      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+
+      const float hi = pi->h;
+      const float hig2 = hi * hi * kernel_gamma2;
+      const float di = hi * kernel_gamma + dxj + pix[0] * runner_shift[sid][0] +
+                       pix[1] * runner_shift[sid][1] +
+                       pix[2] * runner_shift[sid][2];
 
       /* Loop over the parts in cj. */
-      for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sort_j[pjd].i];
+        struct part *restrict pj = &parts_j[sort_j[pjd].i];
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -431,25 +535,28 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   else {
 
     /* Loop over the parts_i. */
-    for (pid = 0; pid < count; pid++) {
+    for (int pid = 0; pid < count; pid++) {
 
       /* Get a hold of the ith part in ci. */
-      pi = &parts_i[ind[pid]];
-      for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-      hi = pi->h;
-      hig2 = hi * hi * kernel_gamma2;
-      di = -hi * kernel_gamma - dxj + pix[0] * runner_shift[sid][0] +
-           pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2];
+      struct part *restrict pi = &parts_i[ind[pid]];
+      double pix[3];
+      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+      const float hi = pi->h;
+      const float hig2 = hi * hi * kernel_gamma2;
+      const float di =
+          -hi * kernel_gamma - dxj + pix[0] * runner_shift[sid][0] +
+          pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2];
 
       /* Loop over the parts in cj. */
-      for (pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) {
+      for (int pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sort_j[pjd].i];
+        struct part *restrict pj = &parts_j[sort_j[pjd].i];
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -491,119 +598,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
-      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
-#endif
-
-  TIMER_TOC(timer_dopair_subset);
-}
-
-/**
- * @brief Compute the interactions between a cell pair, but only for the
- *      given indices in ci.
- *
- * @param r The #runner.
- * @param ci The first #cell.
- * @param parts_i The #part to interact with @c cj.
- * @param ind The list of indices of particles in @c ci to interact with.
- * @param count The number of particles in @c ind.
- * @param cj The second #cell.
- */
-
-void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
-                         struct part *restrict parts_i, int *restrict ind,
-                         int count, struct cell *restrict cj) {
-
-  struct engine *e = r->e;
-  int pid, pjd, k, count_j = cj->count;
-  double shift[3] = {0.0, 0.0, 0.0};
-  struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
-  double pix[3];
-  float dx[3], hi, hig2, r2;
-#ifdef VECTORIZE
-  int icount = 0;
-  float r2q[VEC_SIZE] __attribute__((aligned(16)));
-  float hiq[VEC_SIZE] __attribute__((aligned(16)));
-  float hjq[VEC_SIZE] __attribute__((aligned(16)));
-  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
-  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
-#endif
-  TIMER_TIC
-
-  /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
-    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
-      shift[k] = e->s->dim[k];
-    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
-      shift[k] = -e->s->dim[k];
-  }
-
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
-
-  /* Loop over the parts_i. */
-  for (pid = 0; pid < count; pid++) {
-
-    /* Get a hold of the ith part in ci. */
-    pi = &parts_i[ind[pid]];
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
-
-    /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_j; pjd++) {
-
-      /* Get a pointer to the jth particle. */
-      pj = &parts_j[pjd];
-
-      /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
-        dx[k] = pix[k] - pj->x[k];
-        r2 += dx[k] * dx[k];
-      }
-
-      /* Hit or miss? */
-      if (r2 < hig2) {
-
-#ifndef VECTORIZE
-
-        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
-
-#else
-
-        /* Add this interaction to the queue. */
-        r2q[icount] = r2;
-        dxq[3 * icount + 0] = dx[0];
-        dxq[3 * icount + 1] = dx[1];
-        dxq[3 * icount + 2] = dx[2];
-        hiq[icount] = hi;
-        hjq[icount] = pj->h;
-        piq[icount] = pi;
-        pjq[icount] = pj;
-        icount += 1;
-
-        /* Flush? */
-        if (icount == VEC_SIZE) {
-          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
-          icount = 0;
-        }
-
-#endif
-      }
-
-    } /* loop over the parts in cj. */
-
-  } /* loop over the parts in ci. */
-
-#ifdef VECTORIZE
-  /* Pick up any leftovers. */
-  if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -620,15 +615,9 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
  * @param ind The list of indices of particles in @c ci to interact with.
  * @param count The number of particles in @c ind.
  */
-
 void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
                    struct part *restrict parts, int *restrict ind, int count) {
 
-  int pid, pjd, k, count_i = ci->count;
-  struct part *restrict parts_j = ci->parts;
-  struct part *restrict pi, *restrict pj;
-  double pix[3] = {0.0, 0.0, 0.0};
-  float dx[3], hi, hig2, r2;
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -637,35 +626,31 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
+  const int count_i = ci->count;
+  struct part *restrict parts_j = ci->parts;
 
   /* Loop over the parts in ci. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts[ind[pid]];
-    pix[0] = pi->x[0];
-    pix[1] = pi->x[1];
-    pix[2] = pi->x[2];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    struct part *restrict pi = &parts[ind[pid]];
+    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_i; pjd++) {
+    for (int pjd = 0; pjd < count_i; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts_j[pjd];
+      struct part *restrict pj = &parts_j[pjd];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -706,7 +691,7 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -714,26 +699,17 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
 }
 
 /**
- * @brief Compute the interactions between a cell pair.
+ * @brief Compute the interactions between a cell pair (non-symmetric).
  *
  * @param r The #runner.
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
-
 void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
 
   struct engine *restrict e = r->e;
-  int pid, pjd, k, sid;
-  double rshift, shift[3] = {0.0, 0.0, 0.0};
-  struct entry *restrict sort_i, *restrict sort_j;
-  struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
-  double pix[3], pjx[3], di, dj;
-  float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
-  double hi_max, hj_max;
-  double di_max, dj_min;
-  int count_i, count_j;
   const int ti_current = e->ti_current;
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -742,60 +718,64 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
   /* Get the sort ID. */
-  sid = space_getsid(e->s, &ci, &cj, shift);
+  double shift[3] = {0.0, 0.0, 0.0};
+  const int sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Have the cells been sorted? */
   if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
     error("Trying to interact unsorted cells.");
 
   /* Get the cutoff shift. */
-  for (rshift = 0.0, k = 0; k < 3; k++)
-    rshift += shift[k] * runner_shift[sid][k];
+  double rshift = 0.0;
+  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
 
   /* Pick-out the sorted lists. */
-  sort_i = &ci->sort[sid * (ci->count + 1)];
-  sort_j = &cj->sort[sid * (cj->count + 1)];
+  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
+  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
 
   /* Get some other useful values. */
-  hi_max = ci->h_max * kernel_gamma - rshift;
-  hj_max = cj->h_max * kernel_gamma;
-  count_i = ci->count;
-  count_j = cj->count;
-  parts_i = ci->parts;
-  parts_j = cj->parts;
-  di_max = sort_i[count_i - 1].d - rshift;
-  dj_min = sort_j[0].d;
-  dx_max = (ci->dx_max + cj->dx_max);
+  const double hi_max = ci->h_max * kernel_gamma - rshift;
+  const double hj_max = cj->h_max * kernel_gamma;
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+  const double di_max = sort_i[count_i - 1].d - rshift;
+  const double dj_min = sort_j[0].d;
+  const float dx_max = (ci->dx_max + cj->dx_max);
 
   /* Loop over the parts in ci. */
-  for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min;
-       pid--) {
+  for (int pid = count_i - 1;
+       pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts_i[sort_i[pid].i];
+    struct part *restrict pi = &parts_i[sort_i[pid].i];
     if (pi->ti_end > ti_current) continue;
-    hi = pi->h;
-    di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
+    const float hi = pi->h;
+    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
     if (di < dj_min) continue;
 
-    hig2 = hi * hi * kernel_gamma2;
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+    for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts_j[sort_j[pjd].i];
+      struct part *restrict pj = &parts_j[sort_j[pjd].i];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -833,33 +813,31 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
 
   } /* loop over the parts in ci. */
 
-  /* printf( "runner_dopair: first half took %.3f %s...\n" ,
-  clocks_from_ticks(getticks() - tic), clocks_getunit());
-  tic = getticks(); */
-
   /* Loop over the parts in cj. */
-  for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
+  for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
        pjd++) {
 
     /* Get a hold of the jth part in cj. */
-    pj = &parts_j[sort_j[pjd].i];
+    struct part *restrict pj = &parts_j[sort_j[pjd].i];
     if (pj->ti_end > ti_current) continue;
-    hj = pj->h;
-    dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
+    const float hj = pj->h;
+    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
     if (dj > di_max) continue;
 
-    for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
-    hjg2 = hj * hj * kernel_gamma2;
+    double pjx[3];
+    for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
+    const float hjg2 = hj * hj * kernel_gamma2;
 
     /* Loop over the parts in ci. */
-    for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
+    for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
 
       /* Get a pointer to the jth particle. */
-      pi = &parts_i[sort_i[pid].i];
+      struct part *restrict pi = &parts_i[sort_i[pid].i];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pjx[k] - pi->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -900,28 +878,25 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
   TIMER_TOC(TIMER_DOPAIR);
 }
 
+/**
+ * @brief Compute the interactions between a cell pair (symmetric)
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
 void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 
   struct engine *restrict e = r->e;
-  int pid, pjd, k, sid;
-  double rshift, shift[3] = {0.0, 0.0, 0.0};
-  struct entry *sort_i, *sort_j;
-  struct entry *sortdt_i = NULL, *sortdt_j = NULL;
-  int countdt_i = 0, countdt_j = 0;
-  struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
-  double pix[3], pjx[3], di, dj;
-  float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
-  double hi_max, hj_max;
-  double di_max, dj_min;
-  int count_i, count_j;
   const int ti_current = e->ti_current;
+
 #ifdef VECTORIZE
   int icount1 = 0;
   float r2q1[VEC_SIZE] __attribute__((aligned(16)));
@@ -936,38 +911,42 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   float dxq2[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
   /* Get the shift ID. */
-  sid = space_getsid(e->s, &ci, &cj, shift);
+  double shift[3] = {0.0, 0.0, 0.0};
+  const int sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Have the cells been sorted? */
   if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
     error("Trying to interact unsorted cells.");
 
   /* Get the cutoff shift. */
-  for (rshift = 0.0, k = 0; k < 3; k++)
-    rshift += shift[k] * runner_shift[sid][k];
+  double rshift = 0.0;
+  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
 
   /* Pick-out the sorted lists. */
-  sort_i = &ci->sort[sid * (ci->count + 1)];
-  sort_j = &cj->sort[sid * (cj->count + 1)];
+  struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
+  struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
 
   /* Get some other useful values. */
-  hi_max = ci->h_max * kernel_gamma - rshift;
-  hj_max = cj->h_max * kernel_gamma;
-  count_i = ci->count;
-  count_j = cj->count;
-  parts_i = ci->parts;
-  parts_j = cj->parts;
-  di_max = sort_i[count_i - 1].d - rshift;
-  dj_min = sort_j[0].d;
-  dx_max = (ci->dx_max + cj->dx_max);
+  const double hi_max = ci->h_max * kernel_gamma - rshift;
+  const double hj_max = cj->h_max * kernel_gamma;
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+  const double di_max = sort_i[count_i - 1].d - rshift;
+  const double dj_min = sort_j[0].d;
+  const double dx_max = (ci->dx_max + cj->dx_max);
 
   /* Collect the number of parts left and right below dt. */
+  int countdt_i = 0, countdt_j = 0;
+  struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL;
   if (ci->ti_end_max <= ti_current) {
     sortdt_i = sort_i;
     countdt_i = count_i;
@@ -975,7 +954,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     if ((sortdt_i = (struct entry *)alloca(sizeof(struct entry) * count_i)) ==
         NULL)
       error("Failed to allocate dt sortlists.");
-    for (k = 0; k < count_i; k++)
+    for (int k = 0; k < count_i; k++)
       if (parts_i[sort_i[k].i].ti_end <= ti_current) {
         sortdt_i[countdt_i] = sort_i[k];
         countdt_i += 1;
@@ -988,7 +967,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     if ((sortdt_j = (struct entry *)alloca(sizeof(struct entry) * count_j)) ==
         NULL)
       error("Failed to allocate dt sortlists.");
-    for (k = 0; k < count_j; k++)
+    for (int k = 0; k < count_j; k++)
       if (parts_j[sort_j[k].i].ti_end <= ti_current) {
         sortdt_j[countdt_j] = sort_j[k];
         countdt_j += 1;
@@ -996,31 +975,33 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   }
 
   /* Loop over the parts in ci. */
-  for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min;
-       pid--) {
+  for (int pid = count_i - 1;
+       pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts_i[sort_i[pid].i];
-    hi = pi->h;
-    di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
+    struct part *restrict pi = &parts_i[sort_i[pid].i];
+    const float hi = pi->h;
+    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
     if (di < dj_min) continue;
 
-    hig2 = hi * hi * kernel_gamma2;
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Look at valid dt parts only? */
     if (pi->ti_end > ti_current) {
 
       /* Loop over the parts in cj within dt. */
-      for (pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) {
+      for (int pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sortdt_j[pjd].i];
-        hj = pj->h;
+        struct part *restrict pj = &parts_j[sortdt_j[pjd].i];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pj->x[k] - pix[k];
           r2 += dx[k] * dx[k];
         }
@@ -1062,15 +1043,16 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     else {
 
       /* Loop over the parts in cj. */
-      for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sort_j[pjd].i];
-        hj = pj->h;
+        struct part *restrict pj = &parts_j[sort_j[pjd].i];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -1136,36 +1118,34 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 
   } /* loop over the parts in ci. */
 
-  /* printf( "runner_dopair: first half took %.3f %s...\n" ,
-  clocks_from_ticks(getticks() - tic), clocks_getunit());
-  tic = getticks(); */
-
   /* Loop over the parts in cj. */
-  for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
+  for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
        pjd++) {
 
     /* Get a hold of the jth part in cj. */
-    pj = &parts_j[sort_j[pjd].i];
-    hj = pj->h;
-    dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
+    struct part *restrict pj = &parts_j[sort_j[pjd].i];
+    const float hj = pj->h;
+    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
     if (dj > di_max) continue;
 
-    for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
-    hjg2 = hj * hj * kernel_gamma2;
+    double pjx[3];
+    for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
+    const float hjg2 = hj * hj * kernel_gamma2;
 
     /* Is this particle outside the dt? */
     if (pj->ti_end > ti_current) {
 
       /* Loop over the parts in ci. */
-      for (pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) {
+      for (int pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) {
 
         /* Get a pointer to the jth particle. */
-        pi = &parts_i[sortdt_i[pid].i];
-        hi = pi->h;
+        struct part *restrict pi = &parts_i[sortdt_i[pid].i];
+        const float hi = pi->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pi->x[k] - pjx[k];
           r2 += dx[k] * dx[k];
         }
@@ -1206,15 +1186,16 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     else {
 
       /* Loop over the parts in ci. */
-      for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
+      for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
 
         /* Get a pointer to the jth particle. */
-        pi = &parts_i[sort_i[pid].i];
-        hi = pi->h;
+        struct part *restrict pi = &parts_i[sort_i[pid].i];
+        const float hi = pi->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pjx[k] - pi->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -1283,10 +1264,10 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount1 > 0)
-    for (k = 0; k < icount1; k++)
+    for (int k = 0; k < icount1; k++)
       IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
   if (icount2 > 0)
-    for (k = 0; k < icount2; k++)
+    for (int k = 0; k < icount2; k++)
       IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
 #endif
 
@@ -1294,20 +1275,15 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 }
 
 /**
- * @brief Compute the cell self-interaction.
+ * @brief Compute the cell self-interaction (non-symmetric).
  *
  * @param r The #runner.
  * @param c The #cell.
  */
-
 void DOSELF1(struct runner *r, struct cell *restrict c) {
 
-  int k, pid, pjd, count = c->count;
-  double pix[3];
-  float dx[3], hi, hj, hig2, r2;
-  struct part *restrict parts = c->parts, *restrict pi, *restrict pj;
   const int ti_current = r->e->ti_current;
-  int firstdt = 0, countdt = 0, *indt = NULL, doj;
+
 #ifdef VECTORIZE
   int icount1 = 0;
   float r2q1[VEC_SIZE] __attribute__((aligned(16)));
@@ -1324,43 +1300,49 @@ void DOSELF1(struct runner *r, struct cell *restrict c) {
 #endif
   TIMER_TIC
 
-  /* Set up indt if needed. */
-  if (c->ti_end_min > ti_current)
-    return;
-  else if (c->ti_end_max > ti_current) {
-    if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
-      error("Failed to allocate indt.");
-    for (k = 0; k < count; k++)
-      if (parts[k].ti_end <= ti_current) {
-        indt[countdt] = k;
-        countdt += 1;
-      }
-  }
+  if (c->ti_end_min > ti_current) return;
+  if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone");
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  /* Set up indt. */
+  int *indt = NULL;
+  int countdt = 0, firstdt = 0;
+  if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
+    error("Failed to allocate indt.");
+  for (int k = 0; k < count; k++)
+    if (parts[k].ti_end <= ti_current) {
+      indt[countdt] = k;
+      countdt += 1;
+    }
 
   /* Loop over the particles in the cell. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a pointer to the ith particle. */
-    pi = &parts[pid];
+    struct part *restrict pi = &parts[pid];
 
     /* Get the particle position and radius. */
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k];
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Is the ith particle inactive? */
     if (pi->ti_end > ti_current) {
 
       /* Loop over the other particles .*/
-      for (pjd = firstdt; pjd < countdt; pjd++) {
+      for (int pjd = firstdt; pjd < countdt; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[indt[pjd]];
-        hj = pj->h;
+        struct part *restrict pj = &parts[indt[pjd]];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pj->x[k] - pix[k];
           r2 += dx[k] * dx[k];
         }
@@ -1405,19 +1387,21 @@ void DOSELF1(struct runner *r, struct cell *restrict c) {
       firstdt += 1;
 
       /* Loop over the other particles .*/
-      for (pjd = pid + 1; pjd < count; pjd++) {
+      for (int pjd = pid + 1; pjd < count; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[pjd];
-        hj = pj->h;
+        struct part *restrict pj = &parts[pjd];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
-        doj = (pj->ti_end <= ti_current) && (r2 < hj * hj * kernel_gamma2);
+        const int doj =
+            (pj->ti_end <= ti_current) && (r2 < hj * hj * kernel_gamma2);
 
         /* Hit or miss? */
         if (r2 < hig2 || doj) {
@@ -1508,24 +1492,26 @@ void DOSELF1(struct runner *r, struct cell *restrict c) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount1 > 0)
-    for (k = 0; k < icount1; k++)
+    for (int k = 0; k < icount1; k++)
       IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
   if (icount2 > 0)
-    for (k = 0; k < icount2; k++)
+    for (int k = 0; k < icount2; k++)
       IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
 #endif
 
   TIMER_TOC(TIMER_DOSELF);
 }
 
+/**
+ * @brief Compute the cell self-interaction (symmetric).
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
 void DOSELF2(struct runner *r, struct cell *restrict c) {
 
-  int k, pid, pjd, count = c->count;
-  double pix[3];
-  float dx[3], hi, hj, hig2, r2;
-  struct part *restrict parts = c->parts, *restrict pi, *restrict pj;
   const int ti_current = r->e->ti_current;
-  int firstdt = 0, countdt = 0, *indt = NULL;
+
 #ifdef VECTORIZE
   int icount1 = 0;
   float r2q1[VEC_SIZE] __attribute__((aligned(16)));
@@ -1542,43 +1528,49 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
 #endif
   TIMER_TIC
 
-  /* Set up indt if needed. */
-  if (c->ti_end_min > ti_current)
-    return;
-  else if (c->ti_end_max > ti_current) {
-    if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
-      error("Failed to allocate indt.");
-    for (k = 0; k < count; k++)
-      if (parts[k].ti_end <= ti_current) {
-        indt[countdt] = k;
-        countdt += 1;
-      }
-  }
+  if (c->ti_end_min > ti_current) return;
+  if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone");
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  /* Set up indt. */
+  int *indt = NULL;
+  int countdt = 0, firstdt = 0;
+  if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
+    error("Failed to allocate indt.");
+  for (int k = 0; k < count; k++)
+    if (parts[k].ti_end <= ti_current) {
+      indt[countdt] = k;
+      countdt += 1;
+    }
 
   /* Loop over the particles in the cell. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a pointer to the ith particle. */
-    pi = &parts[pid];
+    struct part *restrict pi = &parts[pid];
 
     /* Get the particle position and radius. */
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k];
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Is the ith particle not active? */
     if (pi->ti_end > ti_current) {
 
       /* Loop over the other particles .*/
-      for (pjd = firstdt; pjd < countdt; pjd++) {
+      for (int pjd = firstdt; pjd < countdt; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[indt[pjd]];
-        hj = pj->h;
+        struct part *restrict pj = &parts[indt[pjd]];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pj->x[k] - pix[k];
           r2 += dx[k] * dx[k];
         }
@@ -1623,15 +1615,16 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
       firstdt += 1;
 
       /* Loop over the other particles .*/
-      for (pjd = pid + 1; pjd < count; pjd++) {
+      for (int pjd = pid + 1; pjd < count; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[pjd];
-        hj = pj->h;
+        struct part *restrict pj = &parts[pjd];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -1700,10 +1693,10 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount1 > 0)
-    for (k = 0; k < icount1; k++)
+    for (int k = 0; k < icount1; k++)
       IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
   if (icount2 > 0)
-    for (k = 0; k < icount2; k++)
+    for (int k = 0; k < icount2; k++)
       IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
 #endif
 
@@ -1711,7 +1704,7 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
 }
 
 /**
- * @brief Compute grouped sub-cell interactions
+ * @brief Compute grouped sub-cell interactions for pairs
  *
  * @param r The #runner.
  * @param ci The first #cell.
@@ -1722,559 +1715,574 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
  * @todo Hard-code the sid on the recursive calls to avoid the
  * redundant computations to find the sid on-the-fly.
  */
+void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
+                 int gettimer) {
 
-void DOSUB1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
-            int gettimer) {
-
-  int j = 0, k;
-  double shift[3];
-  float h;
   struct space *s = r->e->s;
   const int ti_current = r->e->ti_current;
 
   TIMER_TIC
 
-  /* Is this a single cell? */
-  if (cj == NULL) {
-
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current) return;
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
-    /* Recurse? */
-    if (ci->split) {
+  /* Get the cell dimensions. */
+  const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
 
-      /* Loop over all progeny. */
-      for (k = 0; k < 8; k++)
-        if (ci->progeny[k] != NULL) {
-          DOSUB1(r, ci->progeny[k], NULL, -1, 0);
-          for (j = k + 1; j < 8; j++)
-            if (ci->progeny[j] != NULL)
-              DOSUB1(r, ci->progeny[k], ci->progeny[j], -1, 0);
-        }
+  /* Get the type of pair if not specified explicitly. */
+  // if ( sid < 0 )
+  double shift[3];
+  sid = space_getsid(s, &ci, &cj, shift);
 
-    }
+  /* Recurse? */
+  if (ci->split && cj->split &&
+      fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+          h / 2) {
 
-    /* Otherwise, compute self-interaction. */
-    else
-      DOSELF1(r, ci);
+    /* Different types of flags. */
+    switch (sid) {
 
-  } /* self-interaction. */
+      /* Regular sub-cell interactions of a single cell. */
+      case 0: /* (  1 ,  1 ,  1 ) */
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        break;
 
-  /* Otherwise, it's a pair interaction. */
-  else {
+      case 1: /* (  1 ,  1 ,  0 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        break;
 
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
+      case 2: /* (  1 ,  1 , -1 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        break;
 
-    /* Get the cell dimensions. */
-    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+      case 3: /* (  1 ,  0 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        break;
 
-    /* Get the type of pair if not specified explicitly. */
-    // if ( sid < 0 )
-    sid = space_getsid(s, &ci, &cj, shift);
+      case 4: /* (  1 ,  0 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[0], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[1], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[2], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[3], -1, 0);
+        break;
 
-    /* Recurse? */
-    if (ci->split && cj->split &&
-        fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
-            h / 2) {
+      case 5: /* (  1 ,  0 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        break;
 
-      /* Different types of flags. */
-      switch (sid) {
+      case 6: /* (  1 , -1 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        break;
 
-        /* Regular sub-cell interactions of a single cell. */
-        case 0: /* (  1 ,  1 ,  1 ) */
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          break;
+      case 7: /* (  1 , -1 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        break;
 
-        case 1: /* (  1 ,  1 ,  0 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          break;
+      case 8: /* (  1 , -1 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        break;
 
-        case 2: /* (  1 ,  1 , -1 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          break;
+      case 9: /* (  0 ,  1 ,  1 ) */
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        break;
 
-        case 3: /* (  1 ,  0 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          break;
+      case 10: /* (  0 ,  1 ,  0 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[0], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[4], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[1], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[4], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[5], -1, 0);
+        break;
 
-        case 4: /* (  1 ,  0 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[0], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[1], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[2], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[3], -1, 0);
-          break;
+      case 11: /* (  0 ,  1 , -1 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        break;
 
-        case 5: /* (  1 ,  0 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          break;
+      case 12: /* (  0 ,  0 ,  1 ) */
+        if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[0], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[2], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[4], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[6], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[2], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[6], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[4], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[6], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[6], -1, 0);
+        break;
+    }
 
-        case 6: /* (  1 , -1 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          break;
+  }
 
-        case 7: /* (  1 , -1 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          break;
+  /* Otherwise, compute the pair directly. */
+  else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
 
-        case 8: /* (  1 , -1 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          break;
+    /* Do any of the cells need to be sorted first? */
+    if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
+    if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
 
-        case 9: /* (  0 ,  1 ,  1 ) */
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          break;
+    /* Compute the interactions. */
+    DOPAIR1(r, ci, cj);
+  }
 
-        case 10: /* (  0 ,  1 ,  0 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[0], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[4], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[1], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[4], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[5], -1, 0);
-          break;
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
+}
 
-        case 11: /* (  0 ,  1 , -1 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          break;
+/**
+ * @brief Compute grouped sub-cell interactions for self tasks
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param gettimer Do we have a timer ?
+ */
+void DOSUB_SELF1(struct runner *r, struct cell *ci, int gettimer) {
 
-        case 12: /* (  0 ,  0 ,  1 ) */
-          if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[0], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[2], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[4], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[6], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[2], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[6], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[4], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[6], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[6], -1, 0);
-          break;
-      }
+  const int ti_current = r->e->ti_current;
 
-    }
+  TIMER_TIC
 
-    /* Otherwise, compute the pair directly. */
-    else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current) return;
 
-      /* Do any of the cells need to be sorted first? */
-      if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
-      if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
+  /* Recurse? */
+  if (ci->split) {
 
-      /* Compute the interactions. */
-      DOPAIR1(r, ci, cj);
-    }
+    /* Loop over all progeny. */
+    for (int k = 0; k < 8; k++)
+      if (ci->progeny[k] != NULL) {
+        DOSUB_SELF1(r, ci->progeny[k], 0);
+        for (int j = k + 1; j < 8; j++)
+          if (ci->progeny[j] != NULL)
+            DOSUB_PAIR1(r, ci->progeny[k], ci->progeny[j], -1, 0);
+      }
+  }
 
-  } /* otherwise, pair interaction. */
+  /* Otherwise, compute self-interaction. */
+  else
+    DOSELF1(r, ci);
 
-  if (gettimer) TIMER_TOC(TIMER_DOSUB);
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_SELF);
 }
 
-void DOSUB2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
-            int gettimer) {
+/**
+ * @brief Compute grouped sub-cell interactions for pairs (symmetric case)
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ * @param sid The direction linking the cells
+ * @param gettimer Do we have a timer ?
+ *
+ * @todo Hard-code the sid on the recursive calls to avoid the
+ * redundant computations to find the sid on-the-fly.
+ */
+void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
+                 int gettimer) {
 
-  int j, k;
-  double shift[3];
-  float h;
   struct space *s = r->e->s;
   const int ti_current = r->e->ti_current;
 
   TIMER_TIC
 
-  /* Is this a single cell? */
-  if (cj == NULL) {
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current) return;
+  /* Get the cell dimensions. */
+  const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
 
-    /* Recurse? */
-    if (ci->split) {
+  /* Get the type of pair if not specified explicitly. */
+  // if ( sid < 0 )
+  double shift[3];
+  sid = space_getsid(s, &ci, &cj, shift);
 
-      /* Loop over all progeny. */
-      for (k = 0; k < 8; k++)
-        if (ci->progeny[k] != NULL) {
-          DOSUB2(r, ci->progeny[k], NULL, -1, 0);
-          for (j = k + 1; j < 8; j++)
-            if (ci->progeny[j] != NULL)
-              DOSUB2(r, ci->progeny[k], ci->progeny[j], -1, 0);
-        }
+  /* Recurse? */
+  if (ci->split && cj->split &&
+      fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+          h / 2) {
 
-    }
+    /* Different types of flags. */
+    switch (sid) {
 
-    /* Otherwise, compute self-interaction. */
-    else
-      DOSELF2(r, ci);
+      /* Regular sub-cell interactions of a single cell. */
+      case 0: /* (  1 ,  1 ,  1 ) */
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        break;
 
-  } /* self-interaction. */
+      case 1: /* (  1 ,  1 ,  0 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        break;
 
-  /* Otherwise, it's a pair interaction. */
-  else {
+      case 2: /* (  1 ,  1 , -1 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        break;
 
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
+      case 3: /* (  1 ,  0 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        break;
 
-    /* Get the cell dimensions. */
-    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+      case 4: /* (  1 ,  0 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[0], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[1], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[2], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[3], -1, 0);
+        break;
 
-    /* Get the type of pair if not specified explicitly. */
-    // if ( sid < 0 )
-    sid = space_getsid(s, &ci, &cj, shift);
+      case 5: /* (  1 ,  0 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        break;
 
-    /* Recurse? */
-    if (ci->split && cj->split &&
-        fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
-            h / 2) {
+      case 6: /* (  1 , -1 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        break;
 
-      /* Different types of flags. */
-      switch (sid) {
+      case 7: /* (  1 , -1 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        break;
 
-        /* Regular sub-cell interactions of a single cell. */
-        case 0: /* (  1 ,  1 ,  1 ) */
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          break;
+      case 8: /* (  1 , -1 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        break;
 
-        case 1: /* (  1 ,  1 ,  0 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          break;
+      case 9: /* (  0 ,  1 ,  1 ) */
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        break;
 
-        case 2: /* (  1 ,  1 , -1 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          break;
+      case 10: /* (  0 ,  1 ,  0 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[0], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[4], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[1], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[4], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[5], -1, 0);
+        break;
 
-        case 3: /* (  1 ,  0 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          break;
+      case 11: /* (  0 ,  1 , -1 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        break;
 
-        case 4: /* (  1 ,  0 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[0], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[1], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[2], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[3], -1, 0);
-          break;
+      case 12: /* (  0 ,  0 ,  1 ) */
+        if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[0], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[2], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[4], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[6], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[2], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[6], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[4], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[6], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[6], -1, 0);
+        break;
+    }
 
-        case 5: /* (  1 ,  0 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          break;
+  }
 
-        case 6: /* (  1 , -1 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          break;
+  /* Otherwise, compute the pair directly. */
+  else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
 
-        case 7: /* (  1 , -1 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          break;
+    /* Do any of the cells need to be sorted first? */
+    if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
+    if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
 
-        case 8: /* (  1 , -1 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          break;
+    /* Compute the interactions. */
+    DOPAIR2(r, ci, cj);
+  }
 
-        case 9: /* (  0 ,  1 ,  1 ) */
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          break;
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
+}
 
-        case 10: /* (  0 ,  1 ,  0 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[0], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[4], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[1], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[4], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[5], -1, 0);
-          break;
+/**
+ * @brief Compute grouped sub-cell interactions for self tasks (symmetric case)
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param gettimer Do we have a timer ?
+ */
+void DOSUB_SELF2(struct runner *r, struct cell *ci, int gettimer) {
 
-        case 11: /* (  0 ,  1 , -1 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          break;
+  const int ti_current = r->e->ti_current;
 
-        case 12: /* (  0 ,  0 ,  1 ) */
-          if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[0], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[2], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[4], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[6], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[2], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[6], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[4], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[6], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[6], -1, 0);
-          break;
-      }
+  TIMER_TIC
 
-    }
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current) return;
 
-    /* Otherwise, compute the pair directly. */
-    else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
+  /* Recurse? */
+  if (ci->split) {
 
-      /* Do any of the cells need to be sorted first? */
-      if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
-      if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
+    /* Loop over all progeny. */
+    for (int k = 0; k < 8; k++)
+      if (ci->progeny[k] != NULL) {
+        DOSUB_SELF2(r, ci->progeny[k], 0);
+        for (int j = k + 1; j < 8; j++)
+          if (ci->progeny[j] != NULL)
+            DOSUB_PAIR2(r, ci->progeny[k], ci->progeny[j], -1, 0);
+      }
 
-      /* Compute the interactions. */
-      DOPAIR2(r, ci, cj);
-    }
+  }
 
-  } /* otherwise, pair interaction. */
+  /* Otherwise, compute self-interaction. */
+  else
+    DOSELF2(r, ci);
 
-  if (gettimer) TIMER_TOC(TIMER_DOSUB);
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_SELF);
 }
 
 void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
                   int *ind, int count, struct cell *cj, int sid, int gettimer) {
 
-  int j, k;
-  double shift[3];
-  float h;
   struct space *s = r->e->s;
-  struct cell *sub = NULL;
   const int ti_current = r->e->ti_current;
 
   TIMER_TIC
 
   /* Find out in which sub-cell of ci the parts are. */
-  for (k = 0; k < 8; k++)
+  struct cell *sub = NULL;
+  for (int k = 0; k < 8; k++)
     if (ci->progeny[k] != NULL) {
       // if ( parts[ ind[ 0 ] ].x[0] >= ci->progeny[k]->loc[0] &&
       //      parts[ ind[ 0 ] ].x[0] <= ci->progeny[k]->loc[0] +
@@ -2300,7 +2308,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
 
       /* Loop over all progeny. */
       DOSUB_SUBSET(r, sub, parts, ind, count, NULL, -1, 0);
-      for (j = 0; j < 8; j++)
+      for (int j = 0; j < 8; j++)
         if (ci->progeny[j] != sub && ci->progeny[j] != NULL)
           DOSUB_SUBSET(r, sub, parts, ind, count, ci->progeny[j], -1, 0);
 
@@ -2316,7 +2324,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
   else {
 
     /* Get the cell dimensions. */
-    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+    const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
 
     /* Recurse? */
     if (ci->split && cj->split &&
@@ -2324,6 +2332,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
             h / 2) {
 
       /* Get the type of pair if not specified explicitly. */
+      double shift[3] = {0.0, 0.0, 0.0};
       sid = space_getsid(s, &ci, &cj, shift);
 
       /* Different types of flags. */
@@ -2335,7 +2344,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
             DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0],
                          -1, 0);
           if (ci->progeny[7] != NULL && cj->progeny[0] == sub)
-            DOSUB_SUBSET(r, ci->progeny[0], parts, ind, count, cj->progeny[7],
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7],
                          -1, 0);
           break;
 
@@ -2832,7 +2841,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
     else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
 
       /* Get the relative distance between the pairs, wrapping. */
-      for (k = 0; k < 3; k++) {
+      double shift[3] = {0.0, 0.0, 0.0};
+      for (int k = 0; k < 3; k++) {
         if (cj->loc[k] - ci->loc[k] < -s->dim[k] / 2)
           shift[k] = s->dim[k];
         else if (cj->loc[k] - ci->loc[k] > s->dim[k] / 2)
@@ -2840,7 +2850,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
       }
 
       /* Get the sorting index. */
-      for (sid = 0, k = 0; k < 3; k++)
+      int sid = 0;
+      for (int k = 0; k < 3; k++)
         sid =
             3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
                            ? 0
@@ -2856,5 +2867,5 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
 
   } /* otherwise, pair interaction. */
 
-  if (gettimer) TIMER_TOC(TIMER_DOSUB);
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
 }
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index 13e6fe68ca223fa95a307fd063208e1f5d5efa6c..e3788dfa1123584c913bca6baa6fc2db6698e6d0 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -19,11 +19,6 @@
 #ifndef SWIFT_RUNNER_DOIACT_GRAV_H
 #define SWIFT_RUNNER_DOIACT_GRAV_H
 
-/* Includes. */
-#include "cell.h"
-#include "clocks.h"
-#include "part.h"
-
 /**
  * @brief Compute the sorted gravity interactions between a cell pair.
  *
diff --git a/src/scheduler.c b/src/scheduler.c
index cae4132e0fadc3bf2a8c179092a2d875edfd86e1..05f699557cc7aafb641112bf9d9493188ffde52c 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -44,6 +44,9 @@
 #include "error.h"
 #include "intrinsics.h"
 #include "kernel_hydro.h"
+#include "queue.h"
+#include "space.h"
+#include "task.h"
 #include "timers.h"
 
 /**
@@ -106,17 +109,14 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta,
 void scheduler_splittasks_mapper(void *map_data, int num_elements,
                                  void *extra_data) {
 
-  /* Static constants. */
-  const static int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0},
-                                {-1, -1, 11, 10, 5, 4, 2, 1},
-                                {-1, -1, -1, 12, 7, 6, 4, 3},
-                                {-1, -1, -1, -1, 8, 7, 5, 4},
-                                {-1, -1, -1, -1, -1, 12, 10, 9},
-                                {-1, -1, -1, -1, -1, -1, 11, 10},
-                                {-1, -1, -1, -1, -1, -1, -1, 12}};
-  const static float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788,
-                                      0.4025, 0.1897, 0.4025, 0.1897, 0.4025,
-                                      0.5788, 0.4025, 0.5788};
+  const int pts[7][8] = {
+      {-1, 12, 10, 9, 4, 3, 1, 0},     {-1, -1, 11, 10, 5, 4, 2, 1},
+      {-1, -1, -1, 12, 7, 6, 4, 3},    {-1, -1, -1, -1, 8, 7, 5, 4},
+      {-1, -1, -1, -1, -1, 12, 10, 9}, {-1, -1, -1, -1, -1, -1, 11, 10},
+      {-1, -1, -1, -1, -1, -1, -1, 12}};
+  const float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788,
+                               0.4025, 0.1897, 0.4025, 0.1897, 0.4025,
+                               0.5788, 0.4025, 0.5788};
 
   /* Extract the parameters. */
   struct scheduler *s = (struct scheduler *)extra_data;
@@ -681,9 +681,6 @@ struct task *scheduler_addtask(struct scheduler *s, int type, int subtype,
   t->rid = -1;
   t->last_rid = -1;
 
-  /* Init the lock. */
-  lock_init(&t->lock);
-
   /* Add an index for it. */
   // lock_lock( &s->lock );
   s->tasks_ind[atomic_inc(&s->nr_tasks)] = ind;
@@ -742,42 +739,11 @@ void scheduler_set_unlocks(struct scheduler *s) {
     t->unlock_tasks = &s->unlocks[offsets[k]];
   }
 
-  /* Verify that there are no duplicate unlocks. */
-  /* for (int k = 0; k < s->nr_tasks; k++) {
-    struct task *t = &s->tasks[k];
-    for (int i = 0; i < t->nr_unlock_tasks; i++) {
-      for (int j = i + 1; j < t->nr_unlock_tasks; j++) {
-        if (t->unlock_tasks[i] == t->unlock_tasks[j])
-          error("duplicate unlock!");
-      }
-    }
-  } */
-
   /* Clean up. */
   free(counts);
   free(offsets);
 }
 
-/**
- * @brief #threadpool_map function which runs through the task
- *        graph and re-computes the task wait counters.
- */
-
-void scheduler_simple_rewait_mapper(void *map_data, int num_elements,
-                                    void *extra_data) {
-
-  struct task *tasks = (struct task *)map_data;
-  for (int ind = 0; ind < num_elements; ind++) {
-    struct task *t = &tasks[ind];
-
-    /* Increment the waits of the dependances */
-    for (int k = 0; k < t->nr_unlock_tasks; k++) {
-      struct task *u = t->unlock_tasks[k];
-      atomic_inc(&u->wait);
-    }
-  }
-}
-
 /**
  * @brief Sort the tasks in topological order over all queues.
  *
@@ -829,10 +795,12 @@ void scheduler_ranktasks(struct scheduler *s) {
     j = left_old;
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the tasks were ranked correctly. */
-  /* for (int k = 1; k < s->nr_tasks; k++)
+  for (int k = 1; k < s->nr_tasks; k++)
     if (tasks[tid[k - 1]].rank > tasks[tid[k - 1]].rank)
-      error("Task ranking failed."); */
+      error("Task ranking failed.");
+#endif
 }
 
 /**
@@ -917,23 +885,23 @@ void scheduler_reweight(struct scheduler *s) {
             t->weight +=
                 2 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags];
           break;
-        case task_type_sub:
-          if (t->cj != NULL) {
-            if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) {
-              if (t->flags < 0)
-                t->weight += 3 * wscale * t->ci->count * t->cj->count;
-              else
-                t->weight += 3 * wscale * t->ci->count * t->cj->count *
-                             sid_scale[t->flags];
-            } else {
-              if (t->flags < 0)
-                t->weight += 2 * wscale * t->ci->count * t->cj->count;
-              else
-                t->weight += 2 * wscale * t->ci->count * t->cj->count *
-                             sid_scale[t->flags];
-            }
-          } else
-            t->weight += 1 * wscale * t->ci->count * t->ci->count;
+        case task_type_sub_pair:
+          if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) {
+            if (t->flags < 0)
+              t->weight += 3 * wscale * t->ci->count * t->cj->count;
+            else
+              t->weight += 3 * wscale * t->ci->count * t->cj->count *
+                           sid_scale[t->flags];
+          } else {
+            if (t->flags < 0)
+              t->weight += 2 * wscale * t->ci->count * t->cj->count;
+            else
+              t->weight += 2 * wscale * t->ci->count * t->cj->count *
+                           sid_scale[t->flags];
+          }
+          break;
+        case task_type_sub_self:
+          t->weight += 1 * wscale * t->ci->count * t->ci->count;
           break;
         case task_type_ghost:
           if (t->ci == t->ci->super) t->weight += wscale * t->ci->count;
@@ -950,8 +918,6 @@ void scheduler_reweight(struct scheduler *s) {
         default:
           break;
       }
-    if (t->type == task_type_send) t->weight = INT_MAX / 8;
-    if (t->type == task_type_recv) t->weight *= 1.41;
   }
   // message( "weighting tasks took %.3f %s." ,
   // clocks_from_ticks( getticks() - tic ), clocks_getunit());
@@ -1075,6 +1041,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
   if (t->implicit) {
     for (int j = 0; j < t->nr_unlock_tasks; j++) {
       struct task *t2 = t->unlock_tasks[j];
+
       if (atomic_dec(&t2->wait) == 1) scheduler_enqueue(s, t2);
     }
   }
@@ -1097,17 +1064,22 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         qid = t->ci->super->owner;
         break;
       case task_type_pair:
-      case task_type_sub:
+      case task_type_sub_pair:
         qid = t->ci->super->owner;
-        if (t->cj != NULL &&
-            (qid < 0 ||
-             s->queues[qid].count > s->queues[t->cj->super->owner].count))
+        if (qid < 0 ||
+            s->queues[qid].count > s->queues[t->cj->super->owner].count)
           qid = t->cj->super->owner;
         break;
       case task_type_recv:
 #ifdef WITH_MPI
-        err = MPI_Irecv(t->ci->parts, t->ci->count, part_mpi_type,
-                        t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+        if (t->subtype == task_subtype_tend) {
+          t->buff = malloc(sizeof(int) * t->ci->pcell_size);
+          err = MPI_Irecv(t->buff, t->ci->pcell_size, MPI_INT, t->ci->nodeID,
+                          t->flags, MPI_COMM_WORLD, &t->req);
+        } else {
+          err = MPI_Irecv(t->ci->parts, t->ci->count, part_mpi_type,
+                          t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+        }
         if (err != MPI_SUCCESS) {
           mpi_error(err, "Failed to emit irecv for particle data.");
         }
@@ -1121,8 +1093,15 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         break;
       case task_type_send:
 #ifdef WITH_MPI
-        err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type,
-                        t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+        if (t->subtype == task_subtype_tend) {
+          t->buff = malloc(sizeof(int) * t->ci->pcell_size);
+          cell_pack_ti_ends(t->ci, t->buff);
+          err = MPI_Isend(t->buff, t->ci->pcell_size, MPI_INT, t->cj->nodeID,
+                          t->flags, MPI_COMM_WORLD, &t->req);
+        } else {
+          err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type,
+                          t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+        }
         if (err != MPI_SUCCESS) {
           mpi_error(err, "Failed to emit isend for particle data.");
         }
@@ -1326,8 +1305,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
  */
 
 void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
-                    int nr_queues, unsigned int flags, int nodeID,
-                    struct threadpool *tp) {
+                    int nr_queues, unsigned int flags, int nodeID) {
 
   /* Init the lock. */
   lock_init(&s->lock);
@@ -1347,7 +1325,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
 
   /* Init the unlocks. */
   if ((s->unlocks = (struct task **)malloc(
-           sizeof(struct task *) *scheduler_init_nr_unlocks)) == NULL ||
+           sizeof(struct task *) * scheduler_init_nr_unlocks)) == NULL ||
       (s->unlock_ind =
            (int *)malloc(sizeof(int) * scheduler_init_nr_unlocks)) == NULL)
     error("Failed to allocate unlocks.");
@@ -1359,7 +1337,6 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
   s->flags = flags;
   s->space = space;
   s->nodeID = nodeID;
-  s->threadpool = tp;
 
   /* Init the tasks array. */
   s->size = 0;
@@ -1392,3 +1369,31 @@ void scheduler_print_tasks(const struct scheduler *s, const char *fileName) {
 
   fclose(file);
 }
+
+/**
+ * @brief Sets the waits of the dependants of a range of task
+ *
+ * @param t_begin Beginning of the #task range
+ * @param t_end End of the #task range
+ * @param mask The scheduler task mask
+ * @param submask The scheduler subtask mask
+ */
+void scheduler_do_rewait(struct task *t_begin, struct task *t_end,
+                         unsigned int mask, unsigned int submask) {
+  for (struct task *t2 = t_begin; t2 != t_end; t2++) {
+
+    if (t2->skip) continue;
+
+    /* Skip tasks not in the mask */
+    if (!((1 << t2->type) & mask) || !((1 << t2->subtype) & submask)) continue;
+
+    /* Skip sort tasks that have already been performed */
+    if (t2->type == task_type_sort && t2->flags == 0) continue;
+
+    /* Sets the waits of the dependances */
+    for (int k = 0; k < t2->nr_unlock_tasks; k++) {
+      struct task *t3 = t2->unlock_tasks[k];
+      atomic_inc(&t3->wait);
+    }
+  }
+}
diff --git a/src/scheduler.h b/src/scheduler.h
index 7528e13c2f7901ba9979a3ef7567bef1e23d8ac4..37ddb704ac8adc19f870916103b691715c820c4e 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -35,7 +35,6 @@
 #include "cell.h"
 #include "lock.h"
 #include "queue.h"
-#include "space.h"
 #include "task.h"
 #include "threadpool.h"
 
diff --git a/src/serial_io.c b/src/serial_io.c
index 5abd3ebc28672d68c4135efe5753dc4713c2d3c6..03d3c38850e0a7800c83d7dd0942025632bf1d0b 100644
--- a/src/serial_io.c
+++ b/src/serial_io.c
@@ -26,22 +26,22 @@
 /* Some standard headers. */
 #include <hdf5.h>
 #include <math.h>
+#include <mpi.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-/* MPI headers. */
-#ifdef WITH_MPI
-#include <mpi.h>
-#endif
-
 /* This object's header. */
 #include "serial_io.h"
 
 /* Local includes. */
 #include "common_io.h"
+#include "engine.h"
 #include "error.h"
+#include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 
 /*-----------------------------------------------------------------------------
  * Routines reading an IC file
@@ -424,8 +424,8 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
  */
 void read_ic_serial(char* fileName, double dim[3], struct part** parts,
                     struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
-                    int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
-                    MPI_Info info, int dry_run) {
+                    int* periodic, int* flag_entropy, int mpi_rank,
+                    int mpi_size, MPI_Comm comm, MPI_Info info, int dry_run) {
   hid_t h_file = 0, h_grp = 0;
   /* GADGET has only cubic boxes (in cosmological mode) */
   double boxSize[3] = {0.0, -1.0, -1.0};
@@ -462,6 +462,7 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
     if (h_grp < 0) error("Error while opening file header\n");
 
     /* Read the relevant information and print status */
+    readAttribute(h_grp, "Flag_Entropy_ICs", INT, flag_entropy);
     readAttribute(h_grp, "BoxSize", DOUBLE, boxSize);
     readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
     readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
@@ -488,6 +489,7 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
   }
 
   /* Now need to broadcast that information to all ranks. */
+  MPI_Bcast(flag_entropy, 1, MPI_INT, 0, comm);
   MPI_Bcast(periodic, 1, MPI_INT, 0, comm);
   MPI_Bcast(&N_total, NUM_PARTICLE_TYPES, MPI_LONG_LONG, 0, comm);
   MPI_Bcast(dim, 3, MPI_DOUBLE, 0, comm);
@@ -699,6 +701,7 @@ void write_output_serial(struct engine* e, const char* baseName,
     double MassTable[6] = {0., 0., 0., 0., 0., 0.};
     writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES);
     unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0};
+    flagEntropy[0] = writeEntropyFlag();
     writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy,
                    NUM_PARTICLE_TYPES);
     writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1);
diff --git a/src/serial_io.h b/src/serial_io.h
index b7ed6eb62d823829a473f828696c291e552effa3..fa88d5cb87edfba2c101b6096b8a57b04b7178cd 100644
--- a/src/serial_io.h
+++ b/src/serial_io.h
@@ -19,6 +19,9 @@
 #ifndef SWIFT_SERIAL_IO_H
 #define SWIFT_SERIAL_IO_H
 
+/* Config parameters. */
+#include "../config.h"
+
 /* MPI headers. */
 #ifdef WITH_MPI
 #include <mpi.h>
@@ -33,8 +36,8 @@
 
 void read_ic_serial(char* fileName, double dim[3], struct part** parts,
                     struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
-                    int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
-                    MPI_Info info, int dry_run);
+                    int* periodic, int* flag_entropy, int mpi_rank,
+                    int mpi_size, MPI_Comm comm, MPI_Info info, int dry_run);
 
 void write_output_serial(struct engine* e, const char* baseName,
                          struct UnitSystem* us, int mpi_rank, int mpi_size,
diff --git a/src/single_io.c b/src/single_io.c
index fb3bf4368feed9892b098228d85c99f0bc3e724b..3893f1c86784e07e5cd78afc32a49339e60840ba 100644
--- a/src/single_io.c
+++ b/src/single_io.c
@@ -36,8 +36,11 @@
 
 /* Local includes. */
 #include "common_io.h"
-#include "const.h"
+#include "engine.h"
 #include "error.h"
+#include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 
 /*-----------------------------------------------------------------------------
  * Routines reading an IC file
@@ -319,6 +322,8 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
  * @param Ngas (output) number of Gas particles read.
  * @param Ngparts (output) The number of #gpart read.
  * @param periodic (output) 1 if the volume is periodic, 0 if not.
+ * @param flag_entropy 1 if the ICs contained Entropy in the InternalEnergy
+ * field
  * @param dry_run If 1, don't read the particle. Only allocates the arrays.
  *
  * Opens the HDF5 file fileName and reads the particles contained
@@ -331,7 +336,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
  */
 void read_ic_single(char* fileName, double dim[3], struct part** parts,
                     struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
-                    int* periodic, int dry_run) {
+                    int* periodic, int* flag_entropy, int dry_run) {
   hid_t h_file = 0, h_grp = 0;
   /* GADGET has only cubic boxes (in cosmological mode) */
   double boxSize[3] = {0.0, -1.0, -1.0};
@@ -365,6 +370,7 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
   if (h_grp < 0) error("Error while opening file header\n");
 
   /* Read the relevant information and print status */
+  readAttribute(h_grp, "Flag_Entropy_ICs", INT, flag_entropy);
   readAttribute(h_grp, "BoxSize", DOUBLE, boxSize);
   readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
   readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
@@ -546,6 +552,7 @@ void write_output_single(struct engine* e, const char* baseName,
   double MassTable[NUM_PARTICLE_TYPES] = {0};
   writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES);
   unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0};
+  flagEntropy[0] = writeEntropyFlag();
   writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy,
                  NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1);
diff --git a/src/single_io.h b/src/single_io.h
index adfc5b43941b2c0d69d9ce0924164aff56864a23..4936aa3eb35a5c76a5a8f242c9cc79bbc86161a0 100644
--- a/src/single_io.h
+++ b/src/single_io.h
@@ -19,6 +19,9 @@
 #ifndef SWIFT_SINGLE_IO_H
 #define SWIFT_SINGLE_IO_H
 
+/* Config parameters. */
+#include "../config.h"
+
 /* Includes. */
 #include "engine.h"
 #include "part.h"
@@ -28,7 +31,7 @@
 
 void read_ic_single(char* fileName, double dim[3], struct part** parts,
                     struct gpart** gparts, size_t* Ngas, size_t* Ndm,
-                    int* periodic, int dry_run);
+                    int* periodic, int* flag_entropy, int dry_run);
 
 void write_output_single(struct engine* e, const char* baseName,
                          struct UnitSystem* us);
diff --git a/src/space.c b/src/space.c
index 4f6a3362534b1454b5864bfde25a2a497ce72697..04214e49e0c66fc468c91f0098f75c10ef59c252 100644
--- a/src/space.c
+++ b/src/space.c
@@ -439,10 +439,10 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
       s->parts[k] = s->parts[nr_parts];
       s->parts[nr_parts] = tp;
       if (s->parts[k].gpart != NULL) {
-        s->parts[k].gpart->part = &s->parts[k];
+        s->parts[k].gpart->id_or_neg_offset = -k;
       }
       if (s->parts[nr_parts].gpart != NULL) {
-        s->parts[nr_parts].gpart->part = &s->parts[nr_parts];
+        s->parts[nr_parts].gpart->id_or_neg_offset = -nr_parts;
       }
       const struct xpart txp = s->xparts[k];
       s->xparts[k] = s->xparts[nr_parts];
@@ -456,8 +456,9 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     }
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Check that all parts are in the correct places. */
-  /*  for (size_t k = 0; k < nr_parts; k++) {
+  for (size_t k = 0; k < nr_parts; k++) {
     if (cells[ind[k]].nodeID != local_nodeID) {
       error("Failed to move all non-local parts to send list");
     }
@@ -466,7 +467,8 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     if (cells[ind[k]].nodeID == local_nodeID) {
       error("Failed to remove local parts from send list");
     }
-  }*/
+  }
+#endif
 
   /* Move non-local gparts to the end of the list. */
   for (int k = 0; k < nr_gparts;) {
@@ -476,11 +478,12 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
       const struct gpart tp = s->gparts[k];
       s->gparts[k] = s->gparts[nr_gparts];
       s->gparts[nr_gparts] = tp;
-      if (s->gparts[k].id > 0) {
-        s->gparts[k].part->gpart = &s->gparts[k];
+      if (s->gparts[k].id_or_neg_offset <= 0) {
+        s->parts[-s->gparts[k].id_or_neg_offset].gpart = &s->gparts[k];
       }
-      if (s->gparts[nr_gparts].id > 0) {
-        s->gparts[nr_gparts].part->gpart = &s->gparts[nr_gparts];
+      if (s->gparts[nr_gparts].id_or_neg_offset <= 0) {
+        s->parts[-s->gparts[nr_gparts].id_or_neg_offset].gpart =
+            &s->gparts[nr_gparts];
       }
       const int t = gind[k];
       gind[k] = gind[nr_gparts];
@@ -491,8 +494,8 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     }
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Check that all gparts are in the correct place (untested). */
-  /*
   for (size_t k = 0; k < nr_gparts; k++) {
     if (cells[gind[k]].nodeID != local_nodeID) {
       error("Failed to move all non-local gparts to send list");
@@ -502,7 +505,8 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     if (cells[gind[k]].nodeID == local_nodeID) {
       error("Failed to remove local gparts from send list");
     }
-  }*/
+  }
+#endif
 
   /* Exchange the strays, note that this potentially re-allocates
      the parts arrays. */
@@ -531,29 +535,34 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     ind[k] =
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
     cells[ind[k]].count += 1;
-    /* if ( cells[ ind[k] ].nodeID != nodeID )
-        error( "Received part that does not belong to me (nodeID=%i)." , cells[
-       ind[k] ].nodeID ); */
+#ifdef SWIFT_DEBUG_CHECKS
+    if (cells[ind[k]].nodeID != local_nodeID)
+      error("Received part that does not belong to me (nodeID=%i).",
+            cells[ind[k]].nodeID);
+#endif
   }
   nr_parts = s->nr_parts;
-#endif
+
+#endif /* WITH_MPI */
 
   /* Sort the parts according to their cells. */
   space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose);
 
   /* Re-link the gparts. */
-  for (size_t k = 0; k < nr_parts; k++)
-    if (s->parts[k].gpart != NULL) s->parts[k].gpart->part = &s->parts[k];
-
-  /* Verify sort_struct. */
-  /* for ( k = 1 ; k < nr_parts ; k++ ) {
-      if ( ind[k-1] > ind[k] ) {
-          error( "Sort failed!" );
-          }
-      else if ( ind[k] != cell_getid( cdim , parts[k].x[0]*ih[0] ,
-     parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) )
-          error( "Incorrect indices!" );
-      } */
+  part_relink_gparts(s->parts, nr_parts, 0);
+
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Verify space_sort_struct. */
+  for (size_t k = 1; k < nr_parts; k++) {
+    if (ind[k - 1] > ind[k]) {
+      error("Sort failed!");
+    } else if (ind[k] != cell_getid(cdim, s->parts[k].x[0] * ih[0],
+                                    s->parts[k].x[1] * ih[1],
+                                    s->parts[k].x[2] * ih[2])) {
+      error("Incorrect indices!");
+    }
+  }
+#endif
 
   /* We no longer need the indices as of here. */
   free(ind);
@@ -588,33 +597,34 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose);
 
   /* Re-link the parts. */
-  for (int k = 0; k < nr_gparts; k++)
-    if (s->gparts[k].id > 0) s->gparts[k].part->gpart = &s->gparts[k];
+  part_relink_parts(s->gparts, nr_gparts, s->parts);
 
   /* We no longer need the indices as of here. */
   free(gind);
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the links are correct */
-  /* MATTHIEU: To be commented out once we are happy */
   for (size_t k = 0; k < nr_gparts; ++k) {
 
-    if (s->gparts[k].id > 0) {
+    if (s->gparts[k].id_or_neg_offset < 0) {
+
+      const struct part *part = &s->parts[-s->gparts[k].id_or_neg_offset];
 
-      if (s->gparts[k].part->gpart != &s->gparts[k]) error("Linking problem !");
+      if (part->gpart != &s->gparts[k]) error("Linking problem !");
 
-      if (s->gparts[k].x[0] != s->gparts[k].part->x[0] ||
-          s->gparts[k].x[1] != s->gparts[k].part->x[1] ||
-          s->gparts[k].x[2] != s->gparts[k].part->x[2])
+      if (s->gparts[k].x[0] != part->x[0] || s->gparts[k].x[1] != part->x[1] ||
+          s->gparts[k].x[2] != part->x[2])
         error("Linked particles are not at the same position !");
     }
   }
   for (size_t k = 0; k < nr_parts; ++k) {
 
-    if (s->parts[k].gpart != NULL) {
-
-      if (s->parts[k].gpart->part != &s->parts[k]) error("Linking problem !");
+    if (s->parts[k].gpart != NULL &&
+        s->parts[k].gpart->id_or_neg_offset != -k) {
+      error("Linking problem !");
     }
   }
+#endif
 
   /* Hook the cells up to the parts. */
   // tic = getticks();
@@ -705,13 +715,14 @@ void space_parts_sort(struct space *s, int *ind, size_t N, int min, int max,
   threadpool_map(&s->e->threadpool, space_parts_sort_mapper, &sort_struct,
                  s->e->threadpool.num_threads, 0, 1, NULL);
 
-  /* Verify sort_struct. */
-  /* for (int i = 1; i < N; i++)
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Verify space_sort_struct. */
+  for (int i = 1; i < N; i++)
     if (ind[i - 1] > ind[i])
       error("Sorting failed (ind[%i]=%i,ind[%i]=%i), min=%i, max=%i.", i - 1,
-  ind[i - 1], i,
-            ind[i], min, max);
-  message("Sorting succeeded."); */
+            ind[i - 1], i, ind[i], min, max);
+  message("Sorting succeeded.");
+#endif
 
   /* Clean up. */
   free(sort_struct.stack);
@@ -775,19 +786,21 @@ void space_parts_sort_mapper(void *map_data, int num_elements, void *extra_data)
         }
       }
 
-      /* Verify sort_struct. */
-      /* for (int k = i; k <= jj; k++)
+#ifdef SWIFT_DEBUG_CHECKS
+      /* Verify space_sort_struct. */
+      for (int k = i; k <= jj; k++)
         if (ind[k] > pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (<=pivot).");
         }
       for (int k = jj + 1; k <= j; k++)
         if (ind[k] <= pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (>pivot).");
-        } */
+        }
+#endif
 
       /* Split-off largest interval. */
       if (jj - i > j - jj + 1) {
@@ -886,13 +899,14 @@ void space_gparts_sort(struct space *s, int *ind, size_t N, int min, int max,
   threadpool_map(&s->e->threadpool, space_gparts_sort_mapper, &sort_struct,
                  s->e->threadpool.num_threads, 0, 1, NULL);
 
-  /* Verify sort_struct. */
-  /* for (int i = 1; i < N; i++)
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Verify space_sort_struct. */
+  for (int i = 1; i < N; i++)
     if (ind[i - 1] > ind[i])
       error("Sorting failed (ind[%i]=%i,ind[%i]=%i), min=%i, max=%i.", i - 1,
-  ind[i - 1], i,
-            ind[i], min, max);
-  message("Sorting succeeded."); */
+            ind[i - 1], i, ind[i], min, max);
+  message("Sorting succeeded.");
+#endif
 
   /* Clean up. */
   free(sort_struct.stack);
@@ -952,19 +966,21 @@ void space_gparts_sort_mapper(void *map_data, int num_elements, void *extra_data
         }
       }
 
-      /* Verify sort_struct. */
-      /* for (int k = i; k <= jj; k++)
+#ifdef SWIFT_DEBUG_CHECKS
+      /* Verify space_sort_struct. */
+      for (int k = i; k <= jj; k++)
         if (ind[k] > pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (<=pivot).");
         }
       for (int k = jj + 1; k <= j; k++)
         if (ind[k] <= pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (>pivot).");
-        } */
+        }
+#endif
 
       /* Split-off largest interval. */
       if (jj - i > j - jj + 1) {
diff --git a/src/space.h b/src/space.h
index d6802db02fccc398f15ad5637d24c5ab7140d147..f6a4c30250e8e737365a338530f003a9846f5cee 100644
--- a/src/space.h
+++ b/src/space.h
@@ -23,16 +23,18 @@
 #ifndef SWIFT_SPACE_H
 #define SWIFT_SPACE_H
 
-/* Includes. */
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <stddef.h>
 
-/* Local includes. */
+/* Includes. */
 #include "cell.h"
+#include "lock.h"
 #include "parser.h"
 #include "part.h"
-
-/* Forward-declare the engine to avoid cyclic includes. */
-struct engine;
+#include "space.h"
 
 /* Some constants. */
 #define space_maxdepth 10
@@ -159,4 +161,5 @@ void space_split_mapper(void *map_data, int num_elements, void *extra_data);
 void space_do_parts_sort();
 void space_do_gparts_sort();
 void space_link_cleanup(struct space *s);
+
 #endif /* SWIFT_SPACE_H */
diff --git a/src/task.c b/src/task.c
index b93dfc78adeba446d0828202b92c5477a474611f..cb39501a96d22809051ba694285e694c92ff9562 100644
--- a/src/task.c
+++ b/src/task.c
@@ -112,13 +112,14 @@ void task_unlock(struct task *t) {
   /* Act based on task type. */
   switch (t->type) {
     case task_type_self:
+    case task_type_sub_self:
     case task_type_sort:
       cell_unlocktree(t->ci);
       break;
     case task_type_pair:
-    case task_type_sub:
+    case task_type_sub_pair:
       cell_unlocktree(t->ci);
-      if (t->cj != NULL) cell_unlocktree(t->cj);
+      cell_unlocktree(t->cj);
       break;
     case task_type_grav_pp:
     case task_type_grav_mm:
@@ -139,174 +140,74 @@ void task_unlock(struct task *t) {
 
 int task_lock(struct task *t) {
 
-  int type = t->type;
+  const int type = t->type;
+  const int subtype = t->subtype;
   struct cell *ci = t->ci, *cj = t->cj;
+#ifdef WITH_MPI
+  int res = 0, err = 0;
+  MPI_Status stat;
+#endif
 
-  /* Communication task? */
-  if (type == task_type_recv || type == task_type_send) {
+  switch (type) {
 
+    /* Communication task? */
+    case task_type_recv:
+    case task_type_send:
 #ifdef WITH_MPI
-    /* Check the status of the MPI request. */
-    int res = 0, err = 0;
-    MPI_Status stat;
-    if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) {
-      char buff[MPI_MAX_ERROR_STRING];
-      int len;
-      MPI_Error_string(err, buff, &len);
-      error("Failed to test request on send/recv task (tag=%i, %s).", t->flags,
-            buff);
-    }
-    return res;
+      /* Check the status of the MPI request. */
+      if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) {
+        char buff[MPI_MAX_ERROR_STRING];
+        int len;
+        MPI_Error_string(err, buff, &len);
+        error("Failed to test request on send/recv task (tag=%i, %s).",
+              t->flags, buff);
+      }
+      return res;
 #else
-    error("SWIFT was not compiled with MPI support.");
+      error("SWIFT was not compiled with MPI support.");
 #endif
+      break;
 
-  }
+    case task_type_sort:
+      if (cell_locktree(ci) != 0) return 0;
+      break;
 
-  /* Unary lock? */
-  else if (type == task_type_self || type == task_type_sort ||
-           (type == task_type_sub && cj == NULL)) {
-    if (cell_locktree(ci) != 0) return 0;
-  }
+    case task_type_self:
+    case task_type_sub_self:
+      if (subtype == task_subtype_grav) {
+        if (cell_glocktree(ci) != 0) return 0;
+      } else {
+        if (cell_locktree(ci) != 0) return 0;
+      }
+      break;
 
-  /* Otherwise, binary lock. */
-  else if (type == task_type_pair || (type == task_type_sub && cj != NULL)) {
-    if (ci->hold || cj->hold) return 0;
-    if (cell_locktree(ci) != 0) return 0;
-    if (cell_locktree(cj) != 0) {
-      cell_unlocktree(ci);
-      return 0;
-    }
-  }
+    case task_type_pair:
+    case task_type_sub_pair:
+      if (subtype == task_subtype_grav) {
+        if (ci->ghold || cj->ghold) return 0;
+        if (cell_glocktree(ci) != 0) return 0;
+        if (cell_glocktree(cj) != 0) {
+          cell_gunlocktree(ci);
+          return 0;
+        }
+      } else {
+        if (ci->hold || cj->hold) return 0;
+        if (cell_locktree(ci) != 0) return 0;
+        if (cell_locktree(cj) != 0) {
+          cell_unlocktree(ci);
+          return 0;
+        }
+      }
+      break;
 
-  /* Gravity tasks? */
-  else if (type == task_type_grav_mm || type == task_type_grav_pp ||
-           type == task_type_grav_down) {
-    if (ci->ghold || (cj != NULL && cj->ghold)) return 0;
-    if (cell_glocktree(ci) != 0) return 0;
-    if (cj != NULL && cell_glocktree(cj) != 0) {
-      cell_gunlocktree(ci);
-      return 0;
-    }
+    default:
+      break;
   }
 
   /* If we made it this far, we've got a lock. */
   return 1;
 }
 
-/**
- * @brief Remove all unlocks to tasks that are of the given type.
- *
- * @param t The #task.
- * @param type The task type ID to remove.
- */
-
-void task_cleanunlock(struct task *t, int type) {
-
-  int k;
-
-  lock_lock(&t->lock);
-
-  for (k = 0; k < t->nr_unlock_tasks; k++)
-    if (t->unlock_tasks[k]->type == type) {
-      t->nr_unlock_tasks -= 1;
-      t->unlock_tasks[k] = t->unlock_tasks[t->nr_unlock_tasks];
-    }
-
-  lock_unlock_blind(&t->lock);
-}
-
-/**
- * @brief Remove an unlock_task from the given task.
- *
- * @param ta The unlocking #task.
- * @param tb The #task that will be unlocked.
- */
-
-void task_rmunlock(struct task *ta, struct task *tb) {
-
-  int k;
-
-  lock_lock(&ta->lock);
-
-  for (k = 0; k < ta->nr_unlock_tasks; k++)
-    if (ta->unlock_tasks[k] == tb) {
-      ta->nr_unlock_tasks -= 1;
-      ta->unlock_tasks[k] = ta->unlock_tasks[ta->nr_unlock_tasks];
-      lock_unlock_blind(&ta->lock);
-      return;
-    }
-  error("Task not found.");
-}
-
-/**
- * @brief Remove an unlock_task from the given task.
- *
- * @param ta The unlocking #task.
- * @param tb The #task that will be unlocked.
- *
- * Differs from #task_rmunlock in that it will not fail if
- * the task @c tb is not in the unlocks of @c ta.
- */
-
-void task_rmunlock_blind(struct task *ta, struct task *tb) {
-
-  int k;
-
-  lock_lock(&ta->lock);
-
-  for (k = 0; k < ta->nr_unlock_tasks; k++)
-    if (ta->unlock_tasks[k] == tb) {
-      ta->nr_unlock_tasks -= 1;
-      ta->unlock_tasks[k] = ta->unlock_tasks[ta->nr_unlock_tasks];
-      break;
-    }
-
-  lock_unlock_blind(&ta->lock);
-}
-
-/**
- * @brief Add an unlock_task to the given task.
- *
- * @param ta The unlocking #task.
- * @param tb The #task that will be unlocked.
- */
-
-void task_addunlock(struct task *ta, struct task *tb) {
-
-  error("Use sched_addunlock instead.");
-
-  /* Add the lock atomically. */
-  ta->unlock_tasks[atomic_inc(&ta->nr_unlock_tasks)] = tb;
-
-  /* Check a posteriori if we did not overshoot. */
-  if (ta->nr_unlock_tasks > task_maxunlock)
-    error("Too many unlock_tasks in task.");
-}
-
-void task_addunlock_old(struct task *ta, struct task *tb) {
-
-  int k;
-
-  lock_lock(&ta->lock);
-
-  /* Check if ta already unlocks tb. */
-  for (k = 0; k < ta->nr_unlock_tasks; k++)
-    if (ta->unlock_tasks[k] == tb) {
-      error("Duplicate unlock.");
-      lock_unlock_blind(&ta->lock);
-      return;
-    }
-
-  if (ta->nr_unlock_tasks == task_maxunlock)
-    error("Too many unlock_tasks in task.");
-
-  ta->unlock_tasks[ta->nr_unlock_tasks] = tb;
-  ta->nr_unlock_tasks += 1;
-
-  lock_unlock_blind(&ta->lock);
-}
-
 /**
  * @brief Prints the list of tasks contained in a given mask
  *
diff --git a/src/task.h b/src/task.h
index a949e02fce1e9e0fa6b86f27e1c192a45b9073f3..db7305238dc85351f39fb127197b318d3b04c1ee 100644
--- a/src/task.h
+++ b/src/task.h
@@ -37,7 +37,8 @@ enum task_types {
   task_type_sort,
   task_type_self,
   task_type_pair,
-  task_type_sub,
+  task_type_sub_self,
+  task_type_sub_pair,
   task_type_init,
   task_type_ghost,
   task_type_drift,
@@ -62,6 +63,7 @@ enum task_subtypes {
   task_subtype_density,
   task_subtype_force,
   task_subtype_grav,
+  task_subtype_tend,
   task_subtype_count
 };
 
@@ -75,10 +77,10 @@ struct task {
   char skip, tight, implicit;
   int flags, wait, rank, weight;
 
-  swift_lock_type lock;
-
   struct cell *ci, *cj;
 
+  void *buff;
+
 #ifdef WITH_MPI
   MPI_Request req;
 #endif
@@ -91,10 +93,6 @@ struct task {
 };
 
 /* Function prototypes. */
-void task_rmunlock(struct task *ta, struct task *tb);
-void task_rmunlock_blind(struct task *ta, struct task *tb);
-void task_cleanunlock(struct task *t, int type);
-void task_addunlock(struct task *ta, struct task *tb);
 void task_unlock(struct task *t);
 float task_overlap(const struct task *ta, const struct task *tb);
 int task_lock(struct task *t);
diff --git a/src/timers.h b/src/timers.h
index 92b685ebe9b11d49c4703e5837d35cffdca81c4d..c48961b39737f23eb936d7283f76651d33892991 100644
--- a/src/timers.h
+++ b/src/timers.h
@@ -23,6 +23,7 @@
 #define SWIFT_TIMERS_H
 
 /* Includes. */
+#include "atomic.h"
 #include "cycle.h"
 #include "inline.h"
 
@@ -41,9 +42,12 @@ enum {
   timer_dopair_force,
   timer_dopair_grav,
   timer_dograv_external,
-  timer_dosub_density,
-  timer_dosub_force,
-  timer_dosub_grav,
+  timer_dosub_self_density,
+  timer_dosub_self_force,
+  timer_dosub_self_grav,
+  timer_dosub_pair_density,
+  timer_dosub_pair_force,
+  timer_dosub_pair_grav,
   timer_dopair_subset,
   timer_do_ghost,
   timer_dorecv_cell,
@@ -71,7 +75,7 @@ extern ticks timers[timer_count];
 #define TIMER_TOC2(t) timers_toc(t, tic2)
 INLINE static ticks timers_toc(int t, ticks tic) {
   ticks d = (getticks() - tic);
-  __sync_add_and_fetch(&timers[t], d);
+  atomic_add(&timers[t], d);
   return d;
 }
 #else
diff --git a/src/tools.c b/src/tools.c
index 0363100331ad5b298279e9ebadcf87eab5f9e896..710efd7e4dd54784ed735492b4e1af0b466ef682 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -19,16 +19,25 @@
  *
  ******************************************************************************/
 
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <math.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 
+/* This object's header. */
+#include "tools.h"
+
+/* Local includes. */
 #include "cell.h"
 #include "error.h"
+#include "gravity.h"
+#include "hydro.h"
 #include "part.h"
-#include "swift.h"
-#include "tools.h"
+#include "runner.h"
 
 /**
  *  Factorize a given integer, attempts to keep larger pair of factors.
@@ -56,9 +65,7 @@ void factor(int value, int *f1, int *f2) {
  * @param N The number of parts.
  * @param periodic Periodic boundary conditions flag.
  */
-
-void pairs_n2(double *dim, struct part *__restrict__ parts, int N,
-              int periodic) {
+void pairs_n2(double *dim, struct part *restrict parts, int N, int periodic) {
   int i, j, k, count = 0;
   // int mj, mk;
   // double maxratio = 1.0;
@@ -121,8 +128,7 @@ void pairs_n2(double *dim, struct part *__restrict__ parts, int N,
 }
 
 void pairs_single_density(double *dim, long long int pid,
-                          struct part *__restrict__ parts, int N,
-                          int periodic) {
+                          struct part *restrict parts, int N, int periodic) {
   int i, k;
   // int mj, mk;
   // double maxratio = 1.0;
@@ -175,16 +181,19 @@ void pairs_single_density(double *dim, long long int pid,
 }
 
 void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
+
   float r2, hi, hj, hig2, hjg2, dx[3];
   struct part *pi, *pj;
 
   /* Implements a double-for loop and checks every interaction */
   for (int i = 0; i < ci->count; ++i) {
+
     pi = &ci->parts[i];
     hi = pi->h;
     hig2 = hi * hi * kernel_gamma2;
 
     for (int j = 0; j < cj->count; ++j) {
+
       pj = &cj->parts[j];
 
       /* Pairwise distance */
@@ -196,6 +205,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
 
       /* Hit or miss? */
       if (r2 < hig2) {
+
         /* Interact */
         runner_iact_nonsym_density(r2, dx, hi, pj->h, pi, pj);
       }
@@ -204,11 +214,13 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
 
   /* Reverse double-for loop and checks every interaction */
   for (int j = 0; j < cj->count; ++j) {
+
     pj = &cj->parts[j];
     hj = pj->h;
     hjg2 = hj * hj * kernel_gamma2;
 
     for (int i = 0; i < ci->count; ++i) {
+
       pi = &ci->parts[i];
 
       /* Pairwise distance */
@@ -220,6 +232,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
 
       /* Hit or miss? */
       if (r2 < hjg2) {
+
         /* Interact */
         runner_iact_nonsym_density(r2, dx, hj, pi->h, pj, pi);
       }
@@ -233,11 +246,13 @@ void self_all_density(struct runner *r, struct cell *ci) {
 
   /* Implements a double-for loop and checks every interaction */
   for (int i = 0; i < ci->count; ++i) {
+
     pi = &ci->parts[i];
     hi = pi->h;
     hig2 = hi * hi * kernel_gamma2;
 
     for (int j = i + 1; j < ci->count; ++j) {
+
       pj = &ci->parts[j];
       hj = pj->h;
       hjg2 = hj * hj * kernel_gamma2;
@@ -253,12 +268,14 @@ void self_all_density(struct runner *r, struct cell *ci) {
 
       /* Hit or miss? */
       if (r2 < hig2) {
+
         /* Interact */
         runner_iact_nonsym_density(r2, dxi, hi, hj, pi, pj);
       }
 
       /* Hit or miss? */
       if (r2 < hjg2) {
+
         dxi[0] = -dxi[0];
         dxi[1] = -dxi[1];
         dxi[2] = -dxi[2];
@@ -271,7 +288,9 @@ void self_all_density(struct runner *r, struct cell *ci) {
 }
 
 void pairs_single_grav(double *dim, long long int pid,
-                       struct gpart *__restrict__ parts, int N, int periodic) {
+                       struct gpart *restrict gparts, const struct part *parts,
+                       int N, int periodic) {
+
   int i, k;
   // int mj, mk;
   // double maxratio = 1.0;
@@ -282,18 +301,20 @@ void pairs_single_grav(double *dim, long long int pid,
 
   /* Find "our" part. */
   for (k = 0; k < N; k++)
-    if ((parts[k].id > 0 && parts[k].part->id == pid) || parts[k].id == -pid)
+    if ((gparts[k].id_or_neg_offset < 0 &&
+         parts[-gparts[k].id_or_neg_offset].id == pid) ||
+        gparts[k].id_or_neg_offset == pid)
       break;
   if (k == N) error("Part not found.");
-  pi = parts[k];
+  pi = gparts[k];
   pi.a_grav[0] = 0.0f;
   pi.a_grav[1] = 0.0f;
   pi.a_grav[2] = 0.0f;
 
   /* Loop over all particle pairs. */
   for (k = 0; k < N; k++) {
-    if (parts[k].id == pi.id) continue;
-    pj = parts[k];
+    if (gparts[k].id_or_neg_offset == pi.id_or_neg_offset) continue;
+    pj = gparts[k];
     for (i = 0; i < 3; i++) {
       dx[i] = pi.x[i] - pj.x[i];
       if (periodic) {
@@ -320,7 +341,8 @@ void pairs_single_grav(double *dim, long long int pid,
   /* Dump the result. */
   message(
       "acceleration on gpart %lli is a=[ %e %e %e ], |a|=[ %.2e %.2e %.2e ].\n",
-      pi.part->id, a[0], a[1], a[2], aabs[0], aabs[1], aabs[2]);
+      parts[-pi.id_or_neg_offset].id, a[0], a[1], a[2], aabs[0], aabs[1],
+      aabs[2]);
 }
 
 /**
@@ -328,8 +350,8 @@ void pairs_single_grav(double *dim, long long int pid,
  *
  * @param N number of intervals in [0,1].
  */
-
 void density_dump(int N) {
+
   int k;
   float r2[4] = {0.0f, 0.0f, 0.0f, 0.0f}, hi[4], hj[4];
   struct part /**pi[4],  *pj[4],*/ Pi[4], Pj[4];
@@ -363,10 +385,8 @@ void density_dump(int N) {
 /**
  * @brief Compute the force on a single particle brute-force.
  */
-
 void engine_single_density(double *dim, long long int pid,
-                           struct part *__restrict__ parts, int N,
-                           int periodic) {
+                           struct part *restrict parts, int N, int periodic) {
   int i, k;
   double r2, dx[3];
   float fdx[3], ih;
@@ -412,7 +432,7 @@ void engine_single_density(double *dim, long long int pid,
 }
 
 void engine_single_force(double *dim, long long int pid,
-                         struct part *__restrict__ parts, int N, int periodic) {
+                         struct part *restrict parts, int N, int periodic) {
   int i, k;
   double r2, dx[3];
   float fdx[3];
diff --git a/src/tools.h b/src/tools.h
index 5f9f41d033ab03983bde3bb37e87f8a39d2deecd..97f036994bc0d1e2b4a95d806cfdbd253664a260 100644
--- a/src/tools.h
+++ b/src/tools.h
@@ -22,21 +22,26 @@
 #ifndef SWIFT_TOOL_H
 #define SWIFT_TOOL_H
 
+/* Config parameters. */
+#include "../config.h"
+
+/* Includes. */
 #include "cell.h"
+#include "part.h"
 #include "runner.h"
 
 void factor(int value, int *f1, int *f2);
 void density_dump(int N);
 void pairs_single_grav(double *dim, long long int pid,
-                       struct gpart *__restrict__ parts, int N, int periodic);
+                       struct gpart *restrict gparts, const struct part *parts,
+                       int N, int periodic);
 void pairs_single_density(double *dim, long long int pid,
-                          struct part *__restrict__ parts, int N, int periodic);
+                          struct part *restrict parts, int N, int periodic);
 
 void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj);
 void self_all_density(struct runner *r, struct cell *ci);
 
-void pairs_n2(double *dim, struct part *__restrict__ parts, int N,
-              int periodic);
+void pairs_n2(double *dim, struct part *restrict parts, int N, int periodic);
 
 double random_uniform(double a, double b);
 void shuffle_particles(struct part *parts, const int count);