diff --git a/INSTALL.swift b/INSTALL.swift
index a4a2a9208c9443aacc796c87dfe1b96ee88fbb34..bd49dfffd641a3ad59ec25284b14685afb9f7b24 100644
--- a/INSTALL.swift
+++ b/INSTALL.swift
@@ -88,6 +88,9 @@ Before running configure the "mpirun" command should be available in the
 shell. If your command isn't called "mpirun" then define the "MPIRUN"
 environment variable, either in the shell or when running configure.
 
+The MPI compiler can be controlled using the MPICC variable, much like
+the CC one. Use this when your MPI compiler has a none-standard name.
+
 
 METIS: a build of the METIS library can be optionally used to optimize the
 load between MPI nodes (requires an MPI library). This should be found in
diff --git a/README b/README
index a010c4595a6e3a113879659390b373553c6e07d3..b0a21c610f7ab4c380e523b51c6f74b81f98a2f2 100644
--- a/README
+++ b/README
@@ -11,7 +11,10 @@
 
 See INSTALL.swift for install instructions.
 
-Usage: swift [OPTION] PARAMFILE
+Usage: swift [OPTION]... PARAMFILE
+       swift_mpi [OPTION]... PARAMFILE
+       swift_fixdt [OPTION]... PARAMFILE
+       swift_fixdt_mpi [OPTION]... PARAMFILE
 
 Valid options are:
   -a          Pin runners using processor affinity
diff --git a/doc/Makefile.am b/doc/Makefile.am
index 1736a4ad81a674e68932a45e2f8bea8ef82305f3..dc88489ca4ac4ef2fe856d910420fee6a7e87c8a 100644
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -1,3 +1,19 @@
+# This file is part of SWIFT.
+# Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+#                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 doxyfile.stamp: 
 if HAVE_DOXYGEN
diff --git a/examples/CosmoVolume/cosmoVolume.yml b/examples/CosmoVolume/cosmoVolume.yml
index d33de8170284b3251a00a43c1a8acaaf10e35cc8..acfd449cbf57408718a7c2a50d8a16bf5ccd5b4e 100644
--- a/examples/CosmoVolume/cosmoVolume.yml
+++ b/examples/CosmoVolume/cosmoVolume.yml
@@ -37,10 +37,8 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  0.705    # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of volume over one time-step
 
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/ExternalPointMass/externalPointMass.yml b/examples/ExternalPointMass/externalPointMass.yml
index e4c3abc28362801ee53320f9e547bf6d01670ec7..52330163caa13609fd7674a5cdf2921743fbe227 100644
--- a/examples/ExternalPointMass/externalPointMass.yml
+++ b/examples/ExternalPointMass/externalPointMass.yml
@@ -32,10 +32,8 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  10.      # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of volume over one time-step
 
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/Makefile.am b/examples/Makefile.am
index fcb6d3ac4158e140ed5bf227f9215d488f608dd7..ecc5d4acf806fcc82a97488c5a0412357409ed25 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -1,4 +1,3 @@
-
 # This file is part of SWIFT.
 # Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
 #                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
diff --git a/examples/SedovBlast/sedov.yml b/examples/SedovBlast/sedov.yml
index 96b16c5b85b2e534e5ac8b7680f61ed9912da3d6..9fbabb4969b6accdb7323d8270b735951ac0693a 100644
--- a/examples/SedovBlast/sedov.yml
+++ b/examples/SedovBlast/sedov.yml
@@ -32,10 +32,8 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  1.       # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of volume over one time-step
 
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/SodShock/sodShock.yml b/examples/SodShock/sodShock.yml
index 62ed0c68cad8e063f49ab0655aee0e10f57e6314..003b7286777230ceb3b84ee01c6a1f335aeb9476 100644
--- a/examples/SodShock/sodShock.yml
+++ b/examples/SodShock/sodShock.yml
@@ -32,10 +32,8 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  0.01     # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of volume over one time-step
 
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/UniformBox/uniformBox.yml b/examples/UniformBox/uniformBox.yml
index 64f9a135fff6ecb2e4500ade95d1500a0712f2c8..50afbee02ddc8a801ca77ed4900b7d2e0e3b50b5 100644
--- a/examples/UniformBox/uniformBox.yml
+++ b/examples/UniformBox/uniformBox.yml
@@ -32,10 +32,8 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  0.1      # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of volume over one time-step
   
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/main.c b/examples/main.c
index 9f371d64dde9c66f118477beb145f426946fc9d9..fa0ac2567ac253568903ce8c47b2c9c38fee3704 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -50,7 +50,11 @@
  */
 void print_help_message() {
 
-  printf("\nUsage: swift [OPTION] PARAMFILE\n\n");
+  printf("\nUsage: swift [OPTION]... PARAMFILE\n");
+  printf("       swift_mpi [OPTION]... PARAMFILE\n");
+  printf("       swift_fixdt [OPTION]... PARAMFILE\n");
+  printf("       swift_fixdt_mpi [OPTION]... PARAMFILE\n\n");
+
   printf("Valid options are:\n");
   printf("  %2s %8s %s\n", "-a", "", "Pin runners using processor affinity");
   printf("  %2s %8s %s\n", "-c", "", "Run with cosmological time integration");
@@ -89,7 +93,6 @@ void print_help_message() {
  * @brief Main routine that loads a few particles and generates some output.
  *
  */
-
 int main(int argc, char *argv[]) {
 
   struct clocks_time tic, toc;
@@ -478,10 +481,8 @@ int main(int argc, char *argv[]) {
     if (j % 100 == 2) e.forcerepart = reparttype;
 #endif
 
+    /* Reset timers */
     timers_reset(timers_mask_all);
-#ifdef COUNTER
-    for (k = 0; k < runner_counter_count; k++) runner_counter[k] = 0;
-#endif
 
     /* Take a step. */
     engine_step(&e);
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 9b2ebd028a3506e231aa36ed2078bb44faf4eb49..fb1e6ff6931b2fdee00792eed7f178872ee6d950 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -39,10 +39,10 @@ Statistics:
 SPH:
   resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:      0.1      # The tolerance for the targetted number of neighbours.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
+  max_ghost_iterations:  30       # (Optional) Maximal number of iterations allowed to converge towards the smoothing length.
   max_smoothing_length:  0.1      # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_volume_change:     2.       # Maximal allowed change of kernel volume over one time-step
+  max_volume_change:     2.       # (Optional) Maximal allowed change of kernel volume over one time-step
 
 # Parameters related to the initial conditions
 InitialConditions:
diff --git a/examples/plot_tasks.py b/examples/plot_tasks.py
index e070bdb565d6d8305b7974137292910dd720afe2..f2d0aa95d1f35f30476e1989349a07be8d9e5b0a 100755
--- a/examples/plot_tasks.py
+++ b/examples/plot_tasks.py
@@ -55,8 +55,8 @@ PLOT_PARAMS = {"axes.labelsize": 10,
 pl.rcParams.update(PLOT_PARAMS)
 
 #  Tasks and subtypes. Indexed as in tasks.h.
-TASKTYPES = ["none", "sort", "self", "pair", "sub", "init", "ghost", "drift",
-             "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm",
+TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair", "init", "ghost",
+             "drift", "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm",
              "grav_up", "grav_down", "grav_external", "part_sort", "gpart_sort",
              "split_cell", "rewait", "count"]
 
@@ -64,7 +64,8 @@ TASKCOLOURS = {"none": "black",
                "sort": "lightblue",
                "self": "greenyellow",
                "pair": "navy",
-               "sub": "hotpink",
+               "sub_self": "greenyellow",
+               "sub_pair": "navy",
                "init": "indigo",
                "ghost": "cyan",
                "drift": "maroon",
diff --git a/examples/plot_tasks_MPI.py b/examples/plot_tasks_MPI.py
index 44705a0164034f8a22f1f3aa860a64d3e7656d2e..9a92faf9417c9a302831eb8cb2f4471eb672d59c 100755
--- a/examples/plot_tasks_MPI.py
+++ b/examples/plot_tasks_MPI.py
@@ -61,8 +61,8 @@ PLOT_PARAMS = {"axes.labelsize": 10,
 pl.rcParams.update(PLOT_PARAMS)
 
 #  Tasks and subtypes. Indexed as in tasks.h.
-TASKTYPES = ["none", "sort", "self", "pair", "sub", "init", "ghost", "drift",
-             "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm",
+TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair", "init", "ghost",
+             "drift", "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm",
              "grav_up", "grav_down", "grav_external", "part_sort", "gpart_sort",
              "split_cell", "rewait", "count"]
 
@@ -70,7 +70,8 @@ TASKCOLOURS = {"none": "black",
                "sort": "lightblue",
                "self": "greenyellow",
                "pair": "navy",
-               "sub": "hotpink",
+               "sub_self": "greenyellow",
+               "sub_pair": "navy",
                "init": "indigo",
                "ghost": "cyan",
                "drift": "maroon",
diff --git a/src/Makefile.am b/src/Makefile.am
index 6d96fd01de5b4be002ad5dc03c65d5145ada7177..cbf12389c2e7e725f1673a1fa7510c490ea81354 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,4 +1,3 @@
-
 # This file is part of SWIFT.
 # Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
 #                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
@@ -17,10 +16,10 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 # Add the debug flag to the whole thing
-AM_CFLAGS = -DTIMER -DCOUNTER $(HDF5_CPPFLAGS)
+AM_CFLAGS = -DTIMER $(HDF5_CPPFLAGS)
 
 # Assign a "safe" version number
-AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0 # -fsanitize=address
+AM_LDFLAGS = $(HDF5_LDFLAGS) -version-info 0:0:0
 
 # The git command, if available.
 GIT_CMD = @GIT_CMD@
@@ -71,6 +70,7 @@ libswiftsim_la_SOURCES = $(AM_SOURCES)
 # Sources and flags for MPI library
 libswiftsim_mpi_la_SOURCES = $(AM_SOURCES)
 libswiftsim_mpi_la_CFLAGS = $(AM_CFLAGS) -DWITH_MPI $(METIS_INCS)
+libswiftsim_mpi_la_LDFLAGS = $(AM_LDFLAGS) -DWITH_MPI $(METIS_LIBS)
 libswiftsim_mpi_la_SHORTNAME = mpi
 
 
diff --git a/src/approx_math.h b/src/approx_math.h
index ef93ea63c383c74caa3eaff65446962872389a35..cbca602b3fafcc5044b0939b2207b8f9d50a7446 100644
--- a/src/approx_math.h
+++ b/src/approx_math.h
@@ -19,6 +19,8 @@
 #ifndef SWIFT_APPROX_MATH_H
 #define SWIFT_APPROX_MATH_H
 
+#include "inline.h"
+
 /**
  * @brief Approximate version of expf(x) using a 4th order Taylor expansion
  *
diff --git a/src/cell.c b/src/cell.c
index 97a67583a7872999d7b7e7c5a0eb9787c83b4bd3..e152cd6136a7d2a04eb6a657fb9ad85f7d1b8c1b 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -269,7 +269,7 @@ int cell_locktree(struct cell *c) {
     /* Undo the holds up to finger. */
     for (struct cell *finger2 = c->parent; finger2 != finger;
          finger2 = finger2->parent)
-      __sync_fetch_and_sub(&finger2->hold, 1);
+      atomic_dec(&finger2->hold);
 
     /* Unlock this cell. */
     if (lock_unlock(&c->lock) != 0) error("Failed to unlock cell.");
@@ -309,7 +309,7 @@ int cell_glocktree(struct cell *c) {
     if (lock_trylock(&finger->glock) != 0) break;
 
     /* Increment the hold. */
-    __sync_fetch_and_add(&finger->ghold, 1);
+    atomic_inc(&finger->ghold);
 
     /* Unlock the cell. */
     if (lock_unlock(&finger->glock) != 0) error("Failed to unlock cell.");
@@ -327,7 +327,7 @@ int cell_glocktree(struct cell *c) {
     /* Undo the holds up to finger. */
     for (struct cell *finger2 = c->parent; finger2 != finger;
          finger2 = finger2->parent)
-      __sync_fetch_and_sub(&finger2->ghold, 1);
+      atomic_dec(&finger2->ghold);
 
     /* Unlock this cell. */
     if (lock_unlock(&c->glock) != 0) error("Failed to unlock cell.");
@@ -353,7 +353,7 @@ void cell_unlocktree(struct cell *c) {
 
   /* Climb up the tree and unhold the parents. */
   for (struct cell *finger = c->parent; finger != NULL; finger = finger->parent)
-    __sync_fetch_and_sub(&finger->hold, 1);
+    atomic_dec(&finger->hold);
 
   TIMER_TOC(timer_locktree);
 }
@@ -367,7 +367,7 @@ void cell_gunlocktree(struct cell *c) {
 
   /* Climb up the tree and unhold the parents. */
   for (struct cell *finger = c->parent; finger != NULL; finger = finger->parent)
-    __sync_fetch_and_sub(&finger->ghold, 1);
+    atomic_dec(&finger->ghold);
 
   TIMER_TOC(timer_locktree);
 }
@@ -406,12 +406,14 @@ void cell_split(struct cell *c) {
       xparts[j] = xtemp;
     }
   }
-  /* for ( k = 0 ; k <= j ; k++ )
-      if ( parts[k].x[0] > pivot[0] )
-          error( "cell_split: sorting failed." );
-  for ( k = i ; k < count ; k++ )
-      if ( parts[k].x[0] < pivot[0] )
-          error( "cell_split: sorting failed." ); */
+
+#ifdef SWIFT_DEBUG_CHECKS
+  for (int k = 0; k <= j; k++)
+    if (parts[k].x[0] > pivot[0]) error("cell_split: sorting failed.");
+  for (int k = i; k < count; k++)
+    if (parts[k].x[0] < pivot[0]) error("cell_split: sorting failed.");
+#endif
+
   left[1] = i;
   right[1] = count - 1;
   left[0] = 0;
@@ -433,14 +435,17 @@ void cell_split(struct cell *c) {
         xparts[j] = xtemp;
       }
     }
-    /* for ( int kk = left[k] ; kk <= j ; kk++ )
-        if ( parts[kk].x[1] > pivot[1] ) {
-            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-            error( "sorting failed (left)." );
-            }
-    for ( int kk = i ; kk <= right[k] ; kk++ )
-        if ( parts[kk].x[1] < pivot[1] )
-            error( "sorting failed (right)." ); */
+
+#ifdef SWIFT_DEBUG_CHECKS
+    for (int kk = left[k]; kk <= j; kk++)
+      if (parts[kk].x[1] > pivot[1]) {
+        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
+        error("sorting failed (left).");
+      }
+    for (int kk = i; kk <= right[k]; kk++)
+      if (parts[kk].x[1] < pivot[1]) error("sorting failed (right).");
+#endif
+
     left[2 * k + 1] = i;
     right[2 * k + 1] = right[k];
     left[2 * k] = left[k];
@@ -463,16 +468,20 @@ void cell_split(struct cell *c) {
         xparts[j] = xtemp;
       }
     }
-    /* for ( int kk = left[k] ; kk <= j ; kk++ )
-        if ( parts[kk].x[2] > pivot[2] ) {
-            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-            error( "sorting failed (left)." );
-            }
-    for ( int kk = i ; kk <= right[k] ; kk++ )
-        if ( parts[kk].x[2] < pivot[2] ) {
-            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-            error( "sorting failed (right)." );
-            } */
+
+#ifdef SWIFT_DEBUG_CHECKS
+    for (int kk = left[k]; kk <= j; kk++)
+      if (parts[kk].x[2] > pivot[2]) {
+        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
+        error("sorting failed (left).");
+      }
+    for (int kk = i; kk <= right[k]; kk++)
+      if (parts[kk].x[2] < pivot[2]) {
+        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
+        error("sorting failed (right).");
+      }
+#endif
+
     left[2 * k + 1] = i;
     right[2 * k + 1] = right[k];
     left[2 * k] = left[k];
@@ -490,32 +499,34 @@ void cell_split(struct cell *c) {
   for (int k = 0; k < count; k++)
     if (parts[k].gpart != NULL) parts[k].gpart->part = &parts[k];
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that _all_ the parts have been assigned to a cell. */
-  /* for ( k = 1 ; k < 8 ; k++ )
-      if ( &c->progeny[k-1]->parts[ c->progeny[k-1]->count ] !=
-  c->progeny[k]->parts )
-          error( "Particle sorting failed (internal consistency)." );
-  if ( c->progeny[0]->parts != c->parts )
-      error( "Particle sorting failed (left edge)." );
-  if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ count ] )
-      error( "Particle sorting failed (right edge)." ); */
+  for (int k = 1; k < 8; k++)
+    if (&c->progeny[k - 1]->parts[c->progeny[k - 1]->count] !=
+        c->progeny[k]->parts)
+      error("Particle sorting failed (internal consistency).");
+  if (c->progeny[0]->parts != c->parts)
+    error("Particle sorting failed (left edge).");
+  if (&c->progeny[7]->parts[c->progeny[7]->count] != &c->parts[count])
+    error("Particle sorting failed (right edge).");
 
   /* Verify a few sub-cells. */
-  /* for (int k = 0 ; k < c->progeny[0]->count ; k++ )
-      if ( c->progeny[0]->parts[k].x[0] > pivot[0] ||
-           c->progeny[0]->parts[k].x[1] > pivot[1] ||
-           c->progeny[0]->parts[k].x[2] > pivot[2] )
-          error( "Sorting failed (progeny=0)." );
-  for (int k = 0 ; k < c->progeny[1]->count ; k++ )
-      if ( c->progeny[1]->parts[k].x[0] > pivot[0] ||
-           c->progeny[1]->parts[k].x[1] > pivot[1] ||
-           c->progeny[1]->parts[k].x[2] <= pivot[2] )
-          error( "Sorting failed (progeny=1)." );
-  for (int k = 0 ; k < c->progeny[2]->count ; k++ )
-      if ( c->progeny[2]->parts[k].x[0] > pivot[0] ||
-           c->progeny[2]->parts[k].x[1] <= pivot[1] ||
-           c->progeny[2]->parts[k].x[2] > pivot[2] )
-          error( "Sorting failed (progeny=2)." ); */
+  for (int k = 0; k < c->progeny[0]->count; k++)
+    if (c->progeny[0]->parts[k].x[0] > pivot[0] ||
+        c->progeny[0]->parts[k].x[1] > pivot[1] ||
+        c->progeny[0]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=0).");
+  for (int k = 0; k < c->progeny[1]->count; k++)
+    if (c->progeny[1]->parts[k].x[0] > pivot[0] ||
+        c->progeny[1]->parts[k].x[1] > pivot[1] ||
+        c->progeny[1]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=1).");
+  for (int k = 0; k < c->progeny[2]->count; k++)
+    if (c->progeny[2]->parts[k].x[0] > pivot[0] ||
+        c->progeny[2]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[2]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=2).");
+#endif
 
   /* Now do the same song and dance for the gparts. */
 
diff --git a/src/cell.h b/src/cell.h
index 65665c9dcfcd652cc688f652008044d618b10790..ad39354c6ced334c8c6dfa8420dc6c7f649009a3 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -27,8 +27,9 @@
 #include "lock.h"
 #include "multipole.h"
 #include "part.h"
+#include "task.h"
 
-/* Forward declaration of space, needed for cell_unpack. */
+/* Avoid cyclic inclusions */
 struct space;
 
 /* Max tag size set to 2^29 to take into account some MPI implementations
diff --git a/src/common_io.c b/src/common_io.c
index 4399c83f7ebd2bb5355b7df9cc1fdecd301e36aa..03d5618cd0cafe5ff6067cf72107baacffb5d85a 100644
--- a/src/common_io.c
+++ b/src/common_io.c
@@ -43,6 +43,8 @@
 #include "const.h"
 #include "error.h"
 #include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 #include "version.h"
 
 const char* particle_type_names[NUM_PARTICLE_TYPES] = {
diff --git a/src/common_io.h b/src/common_io.h
index ed1c96801c904d9952c7b3ca995b847afdc3fb43..fa6811a26671a2ed85c12ada7bb380094f55795d 100644
--- a/src/common_io.h
+++ b/src/common_io.h
@@ -23,13 +23,11 @@
 /* Config parameters. */
 #include "../config.h"
 
-/* Includes. */
-#include "kernel_hydro.h"
+#if defined(HAVE_HDF5)
+
 #include "part.h"
 #include "units.h"
 
-#if defined(HAVE_HDF5)
-
 /**
  * @brief The different types of data used in the GADGET IC files.
  *
@@ -107,6 +105,6 @@ void writeXMFline(FILE* xmfFile, char* fileName, char* partTypeGroupName,
 void writeCodeDescription(hid_t h_file);
 void writeUnitSystem(hid_t h_file, struct UnitSystem* us);
 
-#endif
+#endif /* defined HDF5 */
 
 #endif /* SWIFT_COMMON_IO_H */
diff --git a/src/const.h b/src/const.h
index 6432ef6a9e8107d94c93a32967e291d7e1e4d24d..673aa3bf0093c1c426938a384fd017c1c1d18310 100644
--- a/src/const.h
+++ b/src/const.h
@@ -60,4 +60,7 @@
 /* Gravity properties */
 #define EXTERNAL_POTENTIAL_POINTMASS
 
+/* Are we debugging ? */
+//#define SWIFT_DEBUG_CHECKS
+
 #endif /* SWIFT_CONST_H */
diff --git a/src/debug.c b/src/debug.c
index a3647915d96a9456cb6d8177d144dab56fd0b97e..77238ab135e3f27b8c2c43b0ec18e7745493b138 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -20,11 +20,19 @@
  *
  ******************************************************************************/
 
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <stdio.h>
 
+/* This object's header. */
+#include "debug.h"
+
+/* Local includes. */
 #include "config.h"
 #include "const.h"
-#include "debug.h"
+#include "inline.h"
 #include "part.h"
 
 /* Import the right hydro definition */
@@ -51,7 +59,6 @@
  *
  * (Should be used for debugging only as it runs in O(N).)
  */
-
 void printParticle(struct part *parts, struct xpart *xparts, long long int id,
                    size_t N) {
 
@@ -69,6 +76,17 @@ void printParticle(struct part *parts, struct xpart *xparts, long long int id,
   if (!found) printf("## Particles[???] id=%lld not found\n", id);
 }
 
+/**
+ * @brief Looks for the g-particle with the given id and prints its information
+ * to
+ * the standard output.
+ *
+ * @param gparts The array of g-particles.
+ * @param id The id too look for.
+ * @param N The size of the array of g-particles.
+ *
+ * (Should be used for debugging only as it runs in O(N).)
+ */
 void printgParticle(struct gpart *gparts, long long int id, size_t N) {
 
   int found = 0;
@@ -95,9 +113,7 @@ void printgParticle(struct gpart *gparts, long long int id, size_t N) {
  *
  * @param p The particle to print
  * @param xp The extended data ot the particle to print
- *
  */
-
 void printParticle_single(struct part *p, struct xpart *xp) {
 
   printf("## Particle: id=%lld ", p->id);
@@ -105,6 +121,18 @@ void printParticle_single(struct part *p, struct xpart *xp) {
   printf("\n");
 }
 
+/**
+ * @brief Prints the details of a given particle to stdout
+ *
+ * @param gp The g-particle to print
+ */
+void printgParticle_single(struct gpart *gp) {
+
+  printf("## g-Particle: id=%lld ", gp->id);
+  gravity_debug_particle(gp);
+  printf("\n");
+}
+
 #ifdef HAVE_METIS
 
 /**
diff --git a/src/debug.h b/src/debug.h
index fba5cb68680472715025f9fee39a5bf06abacf81..13be15adb867e8bafe95e2900f68caaa36192510 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -19,14 +19,15 @@
 #ifndef SWIFT_DEBUG_H
 #define SWIFT_DEBUG_H
 
-/* Includes. */
-#include "cell.h"
-#include "part.h"
+struct part;
+struct gpart;
+struct xpart;
 
 void printParticle(struct part *parts, struct xpart *xparts, long long int id,
                    size_t N);
 void printgParticle(struct gpart *parts, long long int id, size_t N);
 void printParticle_single(struct part *p, struct xpart *xp);
+void printgParticle_single(struct gpart *gp);
 
 #ifdef HAVE_METIS
 #include "metis.h"
diff --git a/src/engine.c b/src/engine.c
index e8d0302d75586e988b26e5d68d15dc6705bd3880..4fdc10428a1510271e257231b4df8938d93f7182 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -59,15 +59,26 @@
 #include "parallel_io.h"
 #include "part.h"
 #include "partition.h"
+#include "proxy.h"
+#include "runner.h"
 #include "serial_io.h"
 #include "single_io.h"
 #include "timers.h"
-
-const char *engine_policy_names[13] = {
-    "none",                 "rand",   "steal",        "keep",
-    "block",                "fix_dt", "cpu_tight",    "mpi",
-    "numa_affinity",        "hydro",  "self_gravity", "external_gravity",
-    "cosmology_integration"};
+#include "units.h"
+
+const char *engine_policy_names[13] = {"none",
+                                       "rand",
+                                       "steal",
+                                       "keep",
+                                       "block",
+                                       "fix_dt",
+                                       "cpu_tight",
+                                       "mpi",
+                                       "numa_affinity",
+                                       "hydro",
+                                       "self_gravity",
+                                       "external_gravity",
+                                       "cosmology_integration"};
 
 /** The rank of the engine as a global variable (for messages). */
 int engine_rank;
@@ -111,7 +122,6 @@ void engine_addlink(struct engine *e, struct link **l, struct task *t) {
  * @param c The #cell.
  * @param super The super #cell.
  */
-
 void engine_make_hierarchical_tasks(struct engine *e, struct cell *c,
                                     struct cell *super) {
 
@@ -217,7 +227,7 @@ void engine_redistribute(struct engine *e) {
   bzero(counts, sizeof(int) * nr_nodes * nr_nodes);
   bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);
 
-  // Allocate the destination index arrays.
+  /* Allocate the destination index arrays. */
   int *dest, *g_dest;
   if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
     error("Failed to allocate dest temporary buffer.");
@@ -236,9 +246,12 @@ void engine_redistribute(struct engine *e) {
     }
     const int cid = cell_getid(cdim, parts[k].x[0] * ih[0],
                                parts[k].x[1] * ih[1], parts[k].x[2] * ih[2]);
-    /* if (cid < 0 || cid >= s->nr_cells)
-       error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].",
-             cid, k, parts[k].x[0], parts[k].x[1], parts[k].x[2]); */
+#ifdef SWIFT_DEBUG_CHECKS
+    if (cid < 0 || cid >= s->nr_cells)
+      error("Bad cell id %i for part %zi at [%.3e,%.3e,%.3e].", cid, k,
+            parts[k].x[0], parts[k].x[1], parts[k].x[2]);
+#endif
+
     dest[k] = cells[cid].nodeID;
 
     /* The counts array is indexed as count[from * nr_nodes + to]. */
@@ -265,9 +278,10 @@ void engine_redistribute(struct engine *e) {
           count_this_dest = 0;
         }
 
-        /* Debug */
-        /* if(s->parts[k].gpart->id < 0) */
-        /*   error("Trying to link a partnerless gpart !"); */
+#ifdef SWIFT_DEBUG_CHECKS
+        if (s->parts[k].gpart->id < 0)
+          error("Trying to link a partnerless gpart !");
+#endif
 
         s->parts[k].gpart->id = count_this_dest;
         count_this_dest++;
@@ -287,9 +301,12 @@ void engine_redistribute(struct engine *e) {
     }
     const int cid = cell_getid(cdim, gparts[k].x[0] * ih[0],
                                gparts[k].x[1] * ih[1], gparts[k].x[2] * ih[2]);
-    /* if (cid < 0 || cid >= s->nr_cells)
-       error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].",
-             cid, k, g_parts[k].x[0], g_parts[k].x[1], g_parts[k].x[2]); */
+#ifdef SWIFT_DEBUG_CHECKS
+    if (cid < 0 || cid >= s->nr_cells)
+      error("Bad cell id %i for part %zi at [%.3e,%.3e,%.3e].", cid, k,
+            gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]);
+#endif
+
     g_dest[k] = cells[cid].nodeID;
 
     /* The counts array is indexed as count[from * nr_nodes + to]. */
@@ -464,17 +481,17 @@ void engine_redistribute(struct engine *e) {
     offset_gparts += count_gparts;
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that all parts are in the right place. */
-  /* for ( int k = 0 ; k < nr_parts ; k++ ) {
-      int cid = cell_getid( cdim , parts_new[k].x[0]*ih[0],
-    parts_new[k].x[1]*ih[1], parts_new[k].x[2]*ih[2] );
-      if ( cells[ cid ].nodeID != nodeID )
-          error( "Received particle (%i) that does not belong here
-    (nodeID=%i).", k , cells[ cid ].nodeID );
-    } */
+  for (int k = 0; k < nr_parts; k++) {
+    int cid = cell_getid(cdim, parts_new[k].x[0] * ih[0],
+                         parts_new[k].x[1] * ih[1], parts_new[k].x[2] * ih[2]);
+    if (cells[cid].nodeID != nodeID)
+      error("Received particle (%i) that does not belong here (nodeID=%i).", k,
+            cells[cid].nodeID);
+  }
 
   /* Verify that the links are correct */
-  /* MATTHIEU: To be commented out once we are happy */
   for (size_t k = 0; k < nr_gparts; ++k) {
 
     if (gparts_new[k].id > 0) {
@@ -495,6 +512,7 @@ void engine_redistribute(struct engine *e) {
       if (parts_new[k].gpart->part != &parts_new[k]) error("Linking problem !");
     }
   }
+#endif
 
   /* Set the new part data, free the old. */
   free(parts);
@@ -535,7 +553,6 @@ void engine_redistribute(struct engine *e) {
  *
  * @param e The #engine.
  */
-
 void engine_repartition(struct engine *e) {
 
 #if defined(WITH_MPI) && defined(HAVE_METIS)
@@ -585,7 +602,6 @@ void engine_repartition(struct engine *e) {
  * @param up The upward gravity #task.
  * @param down The downward gravity #task.
  */
-
 void engine_addtasks_grav(struct engine *e, struct cell *c, struct task *up,
                           struct task *down) {
 
@@ -607,7 +623,6 @@ void engine_addtasks_grav(struct engine *e, struct cell *c, struct task *up,
  * @param ci The sending #cell.
  * @param cj The receiving #cell
  */
-
 void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj) {
 
 #ifdef WITH_MPI
@@ -706,7 +721,6 @@ void engine_addtasks_recv(struct engine *e, struct cell *c, struct task *t_xv,
  *
  * @param e The #engine.
  */
-
 void engine_exchange_cells(struct engine *e) {
 
 #ifdef WITH_MPI
@@ -721,8 +735,7 @@ void engine_exchange_cells(struct engine *e) {
   MPI_Status status;
   ticks tic = getticks();
 
-  /* Run through the cells and get the size of the ones that will be sent off.
-   */
+  /* Run through the cells and get the size of the ones that will be sent */
   int count_out = 0;
   for (int k = 0; k < nr_cells; k++) {
     offset[k] = count_out;
@@ -854,7 +867,6 @@ void engine_exchange_cells(struct engine *e) {
  * Note that this function does not mess-up the linkage between parts and
  * gparts, i.e. the received particles have correct linkeage.
  */
-
 void engine_exchange_strays(struct engine *e, size_t offset_parts,
                             int *ind_part, size_t *Npart, size_t offset_gparts,
                             int *ind_gpart, size_t *Ngpart) {
@@ -1165,7 +1177,7 @@ void engine_make_hydroloop_tasks(struct engine *e) {
  *
  * @param e The #engine.
  */
- 
+
 void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
                                         void *extra_data) {
 
@@ -1203,16 +1215,19 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements,
         engine_addlink(e, &t->cj->density, t);
         atomic_inc(&t->cj->nr_density);
       }
-    } else if (t->type == task_type_sub) {
+    } else if (t->type == task_type_sub_self) {
       atomic_inc(&t->ci->nr_tasks);
-      if (t->cj != NULL) atomic_inc(&t->cj->nr_tasks);
       if (t->subtype == task_subtype_density) {
         engine_addlink(e, &t->ci->density, t);
         atomic_inc(&t->ci->nr_density);
-        if (t->cj != NULL) {
-          engine_addlink(e, &t->cj->density, t);
-          atomic_inc(&t->cj->nr_density);
-        }
+      }
+    } else if (t->type == task_type_sub_pair) {
+      atomic_inc(&t->ci->nr_tasks);
+      if (t->subtype == task_subtype_density) {
+        engine_addlink(e, &t->ci->density, t);
+        atomic_inc(&t->ci->nr_density);
+        engine_addlink(e, &t->cj->density, t);
+        atomic_inc(&t->cj->nr_density);
       }
     }
   }
@@ -1252,16 +1267,21 @@ void engine_count_and_link_tasks_serial(struct engine *e) {
         engine_addlink(e, &t->cj->density, t);
         atomic_inc(&t->cj->nr_density);
       }
-    } else if (t->type == task_type_sub) {
+    } else if (t->type == task_type_sub_self) {
       atomic_inc(&t->ci->nr_tasks);
       if (t->cj != NULL) atomic_inc(&t->cj->nr_tasks);
       if (t->subtype == task_subtype_density) {
         engine_addlink(e, &t->ci->density, t);
         atomic_inc(&t->ci->nr_density);
-        if (t->cj != NULL) {
-          engine_addlink(e, &t->cj->density, t);
-          atomic_inc(&t->cj->nr_density);
-        }
+      }
+    } else if (t->type == task_type_sub_pair) {
+      atomic_inc(&t->ci->nr_tasks);
+      if (t->cj != NULL) atomic_inc(&t->cj->nr_tasks);
+      if (t->subtype == task_subtype_density) {
+        engine_addlink(e, &t->ci->density, t);
+        atomic_inc(&t->ci->nr_density);
+        engine_addlink(e, &t->cj->density, t);
+        atomic_inc(&t->cj->nr_density);
       }
     }
   }
@@ -1354,29 +1374,47 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
       }
     }
 
-    /* Otherwise, sub interaction? */
-    else if (t->type == task_type_sub && t->subtype == task_subtype_density) {
+    /* Otherwise, sub self-interaction? */
+    else if (t->type == task_type_sub_self &&
+             t->subtype == task_subtype_density) {
 
       /* Start by constructing the task for the second hydro loop */
       struct task *t2 =
-          scheduler_addtask(sched, task_type_sub, task_subtype_force, t->flags,
-                            0, t->ci, t->cj, 0);
+          scheduler_addtask(sched, task_type_sub_self, task_subtype_force,
+                            t->flags, 0, t->ci, t->cj, 0);
 
       /* Add the link between the new loop and both cells */
       engine_addlink(e, &t->ci->force, t2);
       atomic_inc(&t->ci->nr_force);
-      if (t->cj != NULL) {
-        engine_addlink(e, &t->cj->force, t2);
-        atomic_inc(&t->cj->nr_force);
+
+      /* Now, build all the dependencies for the hydro for the cells */
+      /* that are local and are not descendant of the same super-cells */
+      if (t->ci->nodeID == nodeID) {
+        engine_make_hydro_loops_dependencies(sched, t, t2, t->ci);
       }
+    }
+
+    /* Otherwise, sub pair-interaction? */
+    else if (t->type == task_type_sub_pair &&
+             t->subtype == task_subtype_density) {
+
+      /* Start by constructing the task for the second hydro loop */
+      struct task *t2 =
+          scheduler_addtask(sched, task_type_sub_pair, task_subtype_force,
+                            t->flags, 0, t->ci, t->cj, 0);
+
+      /* Add the link between the new loop and both cells */
+      engine_addlink(e, &t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
+      engine_addlink(e, &t->cj->force, t2);
+      atomic_inc(&t->cj->nr_force);
 
       /* Now, build all the dependencies for the hydro for the cells */
       /* that are local and are not descendant of the same super-cells */
       if (t->ci->nodeID == nodeID) {
         engine_make_hydro_loops_dependencies(sched, t, t2, t->ci);
       }
-      if (t->cj != NULL && t->cj->nodeID == nodeID &&
-          t->ci->super != t->cj->super) {
+      if (t->cj->nodeID == nodeID && t->ci->super != t->cj->super) {
         engine_make_hydro_loops_dependencies(sched, t, t2, t->cj);
       }
     }
@@ -1443,13 +1481,14 @@ void engine_make_extra_hydroloop_tasks_serial(struct engine *e) {
       }
     }
 
-    /* Otherwise, sub interaction? */
-    else if (t->type == task_type_sub && t->subtype == task_subtype_density) {
+    /* Otherwise, sub-self interaction? */
+    else if (t->type == task_type_sub_self &&
+             t->subtype == task_subtype_density) {
 
       /* Start by constructing the task for the second hydro loop */
       struct task *t2 =
-          scheduler_addtask(sched, task_type_sub, task_subtype_force, t->flags,
-                            0, t->ci, t->cj, 0);
+          scheduler_addtask(sched, task_type_sub_self, task_subtype_force,
+                            t->flags, 0, t->ci, t->cj, 0);
 
       /* Add the link between the new loop and both cells */
       engine_addlink(e, &t->ci->force, t2);
@@ -1458,22 +1497,33 @@ void engine_make_extra_hydroloop_tasks_serial(struct engine *e) {
         engine_addlink(e, &t->cj->force, t2);
         atomic_inc(&t->cj->nr_force);
       }
+    }
+
+    /* Otherwise, sub-pair interaction? */
+    else if (t->type == task_type_sub_pair &&
+             t->subtype == task_subtype_density) {
+
+      /* Start by constructing the task for the second hydro loop */
+      struct task *t2 =
+          scheduler_addtask(sched, task_type_sub_pair, task_subtype_force,
+                            t->flags, 0, t->ci, t->cj, 0);
+
+      /* Add the link between the new loop and both cells */
+      engine_addlink(e, &t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
+      engine_addlink(e, &t->cj->force, t2);
+      atomic_inc(&t->cj->nr_force);
 
       /* Now, build all the dependencies for the hydro for the cells */
       /* that are local and are not descendant of the same super-cells */
       if (t->ci->nodeID == nodeID) {
         engine_make_hydro_loops_dependencies(sched, t, t2, t->ci);
       }
-      if (t->cj != NULL && t->cj->nodeID == nodeID &&
-          t->ci->super != t->cj->super) {
+      if (t->cj->nodeID == nodeID && t->ci->super != t->cj->super) {
         engine_make_hydro_loops_dependencies(sched, t, t2, t->cj);
       }
     }
 
-    /* /\* Kick tasks should rely on the grav_down tasks of their cell. *\/ */
-    /* else if (t->type == task_type_kick && t->ci->grav_down != NULL) */
-    /*   scheduler_addunlock(sched, t->ci->grav_down, t); */
-
     /* External gravity tasks should depend on init and unlock the kick */
     else if (t->type == task_type_grav_external) {
       scheduler_addunlock(sched, t->ci->init, t);
@@ -1579,8 +1629,8 @@ void engine_maketasks(struct engine *e) {
   {
     const ticks tic = getticks();
     scheduler_splittasks(sched);
-    message("scheduler_splittasks took %.3f %s.", clocks_from_ticks(getticks() - tic),
-            clocks_getunit());
+    message("scheduler_splittasks took %.3f %s.",
+            clocks_from_ticks(getticks() - tic), clocks_getunit());
   }
 
   /* Allocate the list of cell-task links. The maximum number of links
@@ -1600,7 +1650,8 @@ void engine_maketasks(struct engine *e) {
      store the density tasks in each cell, and make each sort
      depend on the sorts of its progeny. */
   /* threadpool_map(&e->threadpool, engine_count_and_link_tasks_mapper,
-                 sched->tasks, sched->nr_tasks, sizeof(struct task), 1000, e); */
+                 sched->tasks, sched->nr_tasks, sizeof(struct task), 1000, e);
+     */
   engine_count_and_link_tasks_serial(e);
 
   /* Append hierarchical tasks to each cells */
@@ -1611,7 +1662,8 @@ void engine_maketasks(struct engine *e) {
      Each force task depends on the cell ghosts and unlocks the kick task
      of its super-cell. */
   /* threadpool_map(&e->threadpool, engine_make_extra_hydroloop_tasks_mapper,
-                 sched->tasks, sched->nr_tasks, sizeof(struct task), 1000, e); */
+                 sched->tasks, sched->nr_tasks, sizeof(struct task), 1000, e);
+     */
   engine_make_extra_hydroloop_tasks_serial(e);
 
   /* Add the communication tasks if MPI is being used. */
@@ -1648,16 +1700,16 @@ void engine_maketasks(struct engine *e) {
   {
     const ticks tic = getticks();
     scheduler_ranktasks(sched);
-    message("scheduler_ranktasks took %.3f %s.", clocks_from_ticks(getticks() - tic),
-            clocks_getunit());
+    message("scheduler_ranktasks took %.3f %s.",
+            clocks_from_ticks(getticks() - tic), clocks_getunit());
   }
 
   /* Weight the tasks. */
   {
     const ticks tic = getticks();
     scheduler_reweight(sched);
-    message("scheduler_reweight took %.3f %s.", clocks_from_ticks(getticks() - tic),
-            clocks_getunit());
+    message("scheduler_reweight took %.3f %s.",
+            clocks_from_ticks(getticks() - tic), clocks_getunit());
   }
 
   /* Set the tasks age. */
@@ -1684,8 +1736,7 @@ void engine_marktasks_fixdt_mapper(void *map_data, int num_elements,
     struct task *t = &tasks[ind];
 
     /* Pair? */
-    if (t->type == task_type_pair ||
-        (t->type == task_type_sub && t->cj != NULL)) {
+    if (t->type == task_type_pair || t->type == task_type_sub_pair) {
 
       /* Local pointers. */
       const struct cell *ci = t->ci;
@@ -1734,15 +1785,14 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
 
     /* Single-cell task? */
     if (t->type == task_type_self || t->type == task_type_ghost ||
-        (t->type == task_type_sub && t->cj == NULL)) {
+        t->type == task_type_sub_self) {
 
       /* Set this task's skip. */
       t->skip = (t->ci->ti_end_min > ti_end);
     }
 
     /* Pair? */
-    else if (t->type == task_type_pair ||
-             (t->type == task_type_sub && t->cj != NULL)) {
+    else if (t->type == task_type_pair || t->type == task_type_sub_pair) {
 
       /* Local pointers. */
       const struct cell *ci = t->ci;
@@ -1834,7 +1884,6 @@ int engine_marktasks(struct engine *e) {
  *
  * @param e The #engine.
  */
-
 void engine_print_task_counts(struct engine *e) {
 
   struct scheduler *sched = &e->sched;
@@ -1903,7 +1952,6 @@ void engine_rebuild(struct engine *e) {
  *
  * @param e The #engine to prepare.
  */
-
 void engine_prepare(struct engine *e) {
 
   TIMER_TIC;
@@ -1944,7 +1992,6 @@ void engine_prepare(struct engine *e) {
  * @param e The #engine.
  * @param tid The thread ID
  */
-
 void engine_barrier(struct engine *e, int tid) {
 
   /* First, get the barrier mutex. */
@@ -2129,7 +2176,11 @@ void engine_collect_drift(struct cell *c) {
   c->ang_mom[1] = ang_mom[1];
   c->ang_mom[2] = ang_mom[2];
 }
-
+/**
+ * @brief Print the conserved quantities statistics to a log file
+ *
+ * @param e The #engine.
+ */
 void engine_print_stats(struct engine *e) {
 
   const struct space *s = e->s;
@@ -2253,8 +2304,6 @@ void engine_init_particles(struct engine *e) {
 
   if (e->nodeID == 0) message("Initialising particles");
 
-  engine_prepare(e);
-
   /* Make sure all particles are ready to go */
   /* i.e. clean-up any stupid state in the ICs */
   if (e->policy & engine_policy_hydro) {
@@ -2265,6 +2314,8 @@ void engine_init_particles(struct engine *e) {
     space_map_cells_pre(s, 0, cell_init_gparts, NULL);
   }
 
+  engine_prepare(e);
+
   engine_marktasks(e);
 
   /* Build the masks corresponding to the policy */
@@ -2280,7 +2331,8 @@ void engine_init_particles(struct engine *e) {
 
     mask |= 1 << task_type_self;
     mask |= 1 << task_type_pair;
-    mask |= 1 << task_type_sub;
+    mask |= 1 << task_type_sub_self;
+    mask |= 1 << task_type_sub_pair;
     mask |= 1 << task_type_ghost;
 
     submask |= 1 << task_subtype_density;
@@ -2409,7 +2461,8 @@ void engine_step(struct engine *e) {
 
     mask |= 1 << task_type_self;
     mask |= 1 << task_type_pair;
-    mask |= 1 << task_type_sub;
+    mask |= 1 << task_type_sub_self;
+    mask |= 1 << task_type_sub_pair;
     mask |= 1 << task_type_ghost;
 
     submask |= 1 << task_subtype_density;
@@ -2459,7 +2512,6 @@ int engine_is_done(struct engine *e) {
  *
  * @param e The #engine.
  */
-
 void engine_makeproxies(struct engine *e) {
 
 #ifdef WITH_MPI
@@ -2565,7 +2617,6 @@ void engine_makeproxies(struct engine *e) {
  * @param e The #engine.
  * @param initial_partition structure defining the cell partition technique
  */
-
 void engine_split(struct engine *e, struct partition *initial_partition) {
 
 #ifdef WITH_MPI
@@ -2617,8 +2668,9 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   for (size_t k = 0; k < s->nr_gparts; k++)
     if (s->gparts[k].id > 0) s->gparts[k].part->gpart = &s->gparts[k];
 
+#ifdef SWIFT_DEBUG_CHECKS
+
   /* Verify that the links are correct */
-  /* MATTHIEU: To be commented out once we are happy */
   for (size_t k = 0; k < s->nr_gparts; ++k) {
 
     if (s->gparts[k].id > 0) {
@@ -2634,11 +2686,12 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   for (size_t k = 0; k < s->nr_parts; ++k) {
 
     if (s->parts[k].gpart != NULL) {
-
       if (s->parts[k].gpart->part != &s->parts[k]) error("Linking problem !");
     }
   }
 
+#endif
+
 #else
   error("SWIFT was not compiled with MPI support.");
 #endif
@@ -2745,7 +2798,6 @@ void engine_unpin() {
  * @param hydro The #hydro_props used for this run.
  * @param potential The properties of the external potential.
  */
-
 void engine_init(struct engine *e, struct space *s,
                  const struct swift_params *params, int nr_nodes, int nodeID,
                  int nr_threads, int with_aff, int policy, int verbose,
diff --git a/src/engine.h b/src/engine.h
index f1fea143e6252aa218ad69210a7e105aefdcc13c..dc73b32523649cb54fd66ec4308b0aa4d8cadcaf 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -37,13 +37,10 @@
 #include <stdio.h>
 
 /* Includes. */
-#include "hydro_properties.h"
-#include "lock.h"
+#include "clocks.h"
 #include "parser.h"
 #include "partition.h"
-#include "physical_constants.h"
 #include "potentials.h"
-#include "proxy.h"
 #include "runner.h"
 #include "scheduler.h"
 #include "space.h"
@@ -204,7 +201,7 @@ struct engine {
 
   /* Properties of external gravitational potential */
   const struct external_potential *external_potential;
-  
+
   /* The (parsed) parameter file */
   const struct swift_params *parameter_file;
 };
diff --git a/src/hydro/Default/hydro_iact.h b/src/hydro/Default/hydro_iact.h
index 4d651b61bd934388414d54fa813820111d26e682..0a577931b5e0ca67ab07dfe414a548da66e82cdd 100644
--- a/src/hydro/Default/hydro_iact.h
+++ b/src/hydro/Default/hydro_iact.h
@@ -20,12 +20,6 @@
 #ifndef SWIFT_RUNNER_IACT_H
 #define SWIFT_RUNNER_IACT_H
 
-/* Includes. */
-#include "const.h"
-#include "kernel_hydro.h"
-#include "part.h"
-#include "vector.h"
-
 /**
  * @brief SPH interaction functions following the Gadget-2 version of SPH.
  *
@@ -44,7 +38,6 @@
 /**
  * @brief Density loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -218,7 +211,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density(
 /**
  * @brief Density loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -267,7 +259,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 /**
  * @brief Density loop (non-symmetric vectorized version)
  */
-
 __attribute__((always_inline)) INLINE static void
 runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
                                struct part **pi, struct part **pj) {
@@ -360,7 +351,6 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
 /**
  * @brief Force loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -456,7 +446,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
 /**
  * @brief Force loop (Vectorized version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
@@ -675,7 +664,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
 /**
  * @brief Force loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -766,7 +754,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
 /**
  * @brief Force loop (Vectorized non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index 8fcae293280e55bb838edf3c243f4e322914216f..8738b4be09931df4c938f1dff3adeed11468dcfc 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -20,12 +20,6 @@
 #ifndef SWIFT_RUNNER_IACT_LEGACY_H
 #define SWIFT_RUNNER_IACT_LEGACY_H
 
-/* Includes. */
-#include "const.h"
-#include "kernel_hydro.h"
-#include "part.h"
-#include "vector.h"
-
 /**
  * @brief SPH interaction functions following the Gadget-2 version of SPH.
  *
@@ -42,7 +36,6 @@
 /**
  * @brief Density loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -113,7 +106,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 /**
  * @brief Density loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -162,7 +154,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 /**
  * @brief Force loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -260,7 +251,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
 /**
  * @brief Force loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
diff --git a/src/hydro/Minimal/hydro_iact.h b/src/hydro/Minimal/hydro_iact.h
index f453d8fa1c495472af27ccf98356a3dab0894e98..c9da185b8a29eafe2a58420ae5de3a05ff043225 100644
--- a/src/hydro/Minimal/hydro_iact.h
+++ b/src/hydro/Minimal/hydro_iact.h
@@ -19,12 +19,6 @@
 #ifndef SWIFT_RUNNER_IACT_MINIMAL_H
 #define SWIFT_RUNNER_IACT_MINIMAL_H
 
-/* Includes. */
-#include "const.h"
-#include "kernel_hydro.h"
-#include "part.h"
-#include "vector.h"
-
 /**
  * @brief Minimal conservative implementation of SPH
  *
@@ -70,7 +64,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 /**
  * @brief Density loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -95,7 +88,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 /**
  * @brief Force loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -168,7 +160,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
 /**
  * @brief Force loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
diff --git a/src/hydro_properties.c b/src/hydro_properties.c
index b4e37d672408ad3c0adf1f37fcd4fcbd50095d92..16216f81a5b505fc3a887e86ca4898bc4179e4d5 100644
--- a/src/hydro_properties.c
+++ b/src/hydro_properties.c
@@ -29,6 +29,9 @@
 #include "hydro.h"
 #include "kernel_hydro.h"
 
+#define hydro_props_default_max_iterations 30
+#define hydro_props_default_volume_change 2.0f
+
 void hydro_props_init(struct hydro_props *p,
                       const struct swift_params *params) {
 
@@ -39,13 +42,13 @@ void hydro_props_init(struct hydro_props *p,
   p->delta_neighbours = parser_get_param_float(params, "SPH:delta_neighbours");
 
   /* Ghost stuff */
-  p->max_smoothing_iterations =
-      parser_get_param_int(params, "SPH:max_ghost_iterations");
+  p->max_smoothing_iterations = parser_get_opt_param_int(
+      params, "SPH:max_ghost_iterations", hydro_props_default_max_iterations);
 
   /* Time integration properties */
   p->CFL_condition = parser_get_param_float(params, "SPH:CFL_condition");
-  const float max_volume_change =
-      parser_get_param_float(params, "SPH:max_volume_change");
+  const float max_volume_change = parser_get_opt_param_float(
+      params, "SPH:max_volume_change", hydro_props_default_volume_change);
   p->log_max_h_change = logf(powf(max_volume_change, 0.33333333333f));
 }
 
@@ -60,4 +63,8 @@ void hydro_props_print(const struct hydro_props *p) {
       "Hydrodynamic integration: Max change of volume: %.2f "
       "(max|dlog(h)/dt|=%f).",
       powf(expf(p->log_max_h_change), 3.f), p->log_max_h_change);
+
+  if (p->max_smoothing_iterations != hydro_props_default_max_iterations)
+    message("Maximal iterations in ghost task set to %d (default is %d)",
+            p->max_smoothing_iterations, hydro_props_default_max_iterations);
 }
diff --git a/src/lock.h b/src/lock.h
index 31f80605efb29bfaaadc65e25ba4f31679f8fbe8..b2dd2eac9d0ca5d7807907e31cf3fa31894f9aed 100644
--- a/src/lock.h
+++ b/src/lock.h
@@ -23,7 +23,7 @@
 #include <pthread.h>
 
 /* Includes. */
-#include "inline.h"
+#include "atomic.h"
 
 #ifdef PTHREAD_SPINLOCK
 #include <pthread.h>
@@ -50,14 +50,14 @@
 #define lock_init(l) (*(l) = 0)
 #define lock_destroy(l) 0
 INLINE static int lock_lock(volatile int *l) {
-  while (__sync_val_compare_and_swap(l, 0, 1) != 0)
+  while (atomic_cas(l, 0, 1) != 0)
     ;
   // while( *l );
   return 0;
 }
-#define lock_trylock(l) ((*(l)) ? 1 : __sync_val_compare_and_swap(l, 0, 1))
-#define lock_unlock(l) (__sync_val_compare_and_swap(l, 1, 0) != 1)
-#define lock_unlock_blind(l) __sync_val_compare_and_swap(l, 1, 0)
+#define lock_trylock(l) ((*(l)) ? 1 : atomic_cas(l, 0, 1))
+#define lock_unlock(l) (atomic_cas(l, 1, 0) != 1)
+#define lock_unlock_blind(l) atomic_cas(l, 1, 0)
 #endif
 
 #endif /* SWIFT_LOCK_H */
diff --git a/src/map.c b/src/map.c
index fbe57fde7b1e29c49b0f27d86d177245dd9a27e2..f4f9ac7cfa7141606b578739517b66e951e65eab 100644
--- a/src/map.c
+++ b/src/map.c
@@ -18,15 +18,23 @@
  *
  ******************************************************************************/
 
-#include "map.h"
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <stdio.h>
 #include <stdlib.h>
+
+/* This object's header. */
+#include "map.h"
+
+/* Local headers. */
+#include "atomic.h"
 #include "error.h"
 
 /**
  * @brief Mapping function to draw a specific cell (gnuplot).
  */
-
 void map_cells_plot(struct cell *c, void *data) {
 
   int depth = *(int *)data;
@@ -80,24 +88,21 @@ void map_cells_plot(struct cell *c, void *data) {
 /**
  * @brief Mapping function for checking if each part is in its box.
  */
+void map_check(struct part *p, struct cell *c, void *data) {
 
-/* void map_check ( struct part *p , struct cell *c , void *data ) {
-
-    if ( p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] ||
-         p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] ||
-         p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] )
-        printf( "map_check: particle %i is outside of its box.\n" , p->id );
-
-    } */
+  if (p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->h[0] ||
+      p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->h[0] ||
+      p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->h[0])
+    printf("map_check: particle %lld is outside of its box.\n", p->id);
+}
 
 /**
  * @brief Mapping function for neighbour count.
  */
-
 void map_cellcheck(struct cell *c, void *data) {
 
   int *count = (int *)data;
-  __sync_fetch_and_add(count, c->count);
+  atomic_add(count, c->count);
 
   /* Loop over all parts and check if they are in the cell. */
   for (int k = 0; k < c->count; k++) {
@@ -133,7 +138,6 @@ void map_cellcheck(struct cell *c, void *data) {
 /**
  * @brief Mapping function for maxdepth cell count.
  */
-
 void map_maxdepth(struct cell *c, void *data) {
 
   int maxdepth = ((int *)data)[0];
@@ -147,7 +151,6 @@ void map_maxdepth(struct cell *c, void *data) {
 /**
  * @brief Mapping function for neighbour count.
  */
-
 void map_count(struct part *p, struct cell *c, void *data) {
 
   double *wcount = (double *)data;
@@ -156,7 +159,6 @@ void map_count(struct part *p, struct cell *c, void *data) {
 
   *wcount += p->density.wcount;
 }
-
 void map_wcount_min(struct part *p, struct cell *c, void *data) {
 
   struct part **p2 = (struct part **)data;
@@ -188,7 +190,6 @@ void map_h_max(struct part *p, struct cell *c, void *data) {
 /**
  * @brief Mapping function for neighbour count.
  */
-
 void map_icount(struct part *p, struct cell *c, void *data) {
 
   // int *count = (int *)data;
@@ -201,7 +202,6 @@ void map_icount(struct part *p, struct cell *c, void *data) {
 /**
  * @brief Mapping function to print the particle position.
  */
-
 void map_dump(struct part *p, struct cell *c, void *data) {
 
   double *shift = (double *)data;
diff --git a/src/parallel_io.c b/src/parallel_io.c
index e779b56a85da3db72ea860fb336fa42037fbcd0e..c5cac1cb5efc6e533e599867e39cdd7c7b2c87fa 100644
--- a/src/parallel_io.c
+++ b/src/parallel_io.c
@@ -37,7 +37,11 @@
 
 /* Local includes. */
 #include "common_io.h"
+#include "engine.h"
 #include "error.h"
+#include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 
 /**
  * @brief Reads a data array from a given HDF5 group.
diff --git a/src/parallel_io.h b/src/parallel_io.h
index 970ad8c41dcbc2df3a85b178f836e16926147788..26757cb679e475d6acd2ce3c408135dfe5e49081 100644
--- a/src/parallel_io.h
+++ b/src/parallel_io.h
@@ -19,6 +19,9 @@
 #ifndef SWIFT_PARALLEL_IO_H
 #define SWIFT_PARALLEL_IO_H
 
+/* Config parameters. */
+#include "../config.h"
+
 /* MPI headers. */
 #ifdef WITH_MPI
 #include <mpi.h>
diff --git a/src/part.h b/src/part.h
index 5d4c9c88a1acadea3d23a3df618c04da389fb61d..e99be6e51a9bd74dd9eec8f590e80989e83ec2e1 100644
--- a/src/part.h
+++ b/src/part.h
@@ -22,9 +22,6 @@
 /* Config parameters. */
 #include "../config.h"
 
-/* Some standard headers. */
-#include <stdlib.h>
-
 /* MPI headers. */
 #ifdef WITH_MPI
 #include <mpi.h>
@@ -38,7 +35,7 @@
 #define xpart_align 32
 #define gpart_align 32
 
-/* Import the right particle definition */
+/* Import the right hydro particle definition */
 #if defined(MINIMAL_SPH)
 #include "./hydro/Minimal/hydro_part.h"
 #elif defined(GADGET2_SPH)
@@ -49,6 +46,7 @@
 #error "Invalid choice of SPH variant"
 #endif
 
+/* Import the right gravity particle definition */
 #include "./gravity/Default/gravity_part.h"
 
 #ifdef WITH_MPI
diff --git a/src/partition.c b/src/partition.c
index b79dfd6b81d727b6e53f9e0906c7c59eee6607f1..6df437826de796c05143b2003dbdacb971d9b7be 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -453,9 +453,9 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
 
     /* Skip un-interesting tasks. */
     if (t->type != task_type_self && t->type != task_type_pair &&
-        t->type != task_type_sub && t->type != task_type_ghost &&
-        t->type != task_type_drift && t->type != task_type_kick &&
-        t->type != task_type_init)
+        t->type != task_type_sub_self && t->type != task_type_sub_self &&
+        t->type != task_type_ghost && t->type != task_type_drift &&
+        t->type != task_type_kick && t->type != task_type_init)
       continue;
 
     /* Get the task weight. */
@@ -496,15 +496,15 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
 
     /* Self interaction? */
     else if ((t->type == task_type_self && ci->nodeID == nodeID) ||
-             (t->type == task_type_sub && cj == NULL && ci->nodeID == nodeID)) {
+             (t->type == task_type_sub_self && cj == NULL &&
+              ci->nodeID == nodeID)) {
       /* Self interactions add only to vertex weight. */
       if (taskvweights) weights_v[cid] += w;
 
     }
 
     /* Pair? */
-    else if (t->type == task_type_pair ||
-             (t->type == task_type_sub && cj != NULL)) {
+    else if (t->type == task_type_pair || (t->type == task_type_sub_pair)) {
       /* In-cell pair? */
       if (ci == cj) {
         /* Add weight to vertex for ci. */
diff --git a/src/queue.c b/src/queue.c
index dfdac883f9713eb57bb2dc45eb97774983e3a9b2..9883d77e66421eda6093331e0c9a8f6ac0155ded 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -34,18 +34,62 @@
 #include "queue.h"
 
 /* Local headers. */
+#include "atomic.h"
 #include "const.h"
 #include "error.h"
 
-/* Counter macros. */
-#ifdef COUNTER
-#define COUNT(c) (__sync_add_and_fetch(&queue_counter[c], 1))
-#else
-#define COUNT(c)
-#endif
+/**
+ * @brief Enqueue all tasks in the incoming DEQ.
+ *
+ * @param q The #queue, assumed to be locked.
+ */
+void queue_get_incoming(struct queue *q) {
+
+  int *tid = q->tid;
+  struct task *tasks = q->tasks;
+
+  /* Loop over the incoming DEQ. */
+  while (1) {
+
+    /* Is there a next element? */
+    const int ind = q->first_incoming % queue_incoming_size;
+    if (q->tid_incoming[ind] < 0) break;
+
+    /* Get the next offset off the DEQ. */
+    const int offset = atomic_swap(&q->tid_incoming[ind], -1);
+    atomic_inc(&q->first_incoming);
+
+    /* Does the queue need to be grown? */
+    if (q->count == q->size) {
+      int *temp;
+      q->size *= queue_sizegrow;
+      if ((temp = (int *)malloc(sizeof(int) * q->size)) == NULL)
+        error("Failed to allocate new indices.");
+      memcpy(temp, tid, sizeof(int) * q->count);
+      free(tid);
+      q->tid = tid = temp;
+    }
+
+    /* Drop the task at the end of the queue. */
+    tid[q->count] = offset;
+    q->count += 1;
+    atomic_dec(&q->count_incoming);
+
+    /* Shuffle up. */
+    for (int k = q->count - 1; k > 0; k = (k - 1) / 2)
+      if (tasks[tid[k]].weight > tasks[tid[(k - 1) / 2]].weight) {
+        int temp = tid[k];
+        tid[k] = tid[(k - 1) / 2];
+        tid[(k - 1) / 2] = temp;
+      } else
+        break;
 
-/* The counters. */
-int queue_counter[queue_counter_count];
+    /* Check the queue's consistency. */
+    /* for (int k = 1; k < q->count; k++)
+        if ( tasks[ tid[(k-1)/2] ].weight < tasks[ tid[k] ].weight )
+            error( "Queue heap is disordered." ); */
+  }
+}
 
 /**
  * @brief Insert a used tasks into the given queue.
@@ -53,49 +97,29 @@ int queue_counter[queue_counter_count];
  * @param q The #queue.
  * @param t The #task.
  */
-
 void queue_insert(struct queue *q, struct task *t) {
+  /* Get an index in the DEQ. */
+  const int ind = atomic_inc(&q->last_incoming) % queue_incoming_size;
 
-  int k, *tid;
-  struct task *tasks;
+  /* Spin until the new offset can be stored. */
+  while (atomic_cas(&q->tid_incoming[ind], -1, t - q->tasks) != -1) {
 
-  /* Lock the queue. */
-  if (lock_lock(&q->lock) != 0) error("Failed to get queue lock.");
+    /* Try to get the queue lock, non-blocking, ensures that at
+       least somebody is working on this queue. */
+    if (lock_trylock(&q->lock) == 0) {
 
-  tid = q->tid;
-  tasks = q->tasks;
+      /* Clean up the incoming DEQ. */
+      queue_get_incoming(q);
 
-  /* Does the queue need to be grown? */
-  if (q->count == q->size) {
-    int *temp;
-    q->size *= queue_sizegrow;
-    if ((temp = (int *)malloc(sizeof(int) * q->size)) == NULL)
-      error("Failed to allocate new indices.");
-    memcpy(temp, tid, sizeof(int) * q->count);
-    free(tid);
-    q->tid = tid = temp;
+      /* Release the queue lock. */
+      if (lock_unlock(&q->lock) != 0) {
+        error("Unlocking the qlock failed.\n");
+      }
+    }
   }
 
-  /* Drop the task at the end of the queue. */
-  tid[q->count] = (t - tasks);
-  q->count += 1;
-
-  /* Shuffle up. */
-  for (k = q->count - 1; k > 0; k = (k - 1) / 2)
-    if (tasks[tid[k]].weight > tasks[tid[(k - 1) / 2]].weight) {
-      int temp = tid[k];
-      tid[k] = tid[(k - 1) / 2];
-      tid[(k - 1) / 2] = temp;
-    } else
-      break;
-
-  /* Check the queue's consistency. */
-  /* for ( k = 1 ; k < q->count ; k++ )
-      if ( tasks[ tid[(k-1)/2] ].weight < tasks[ tid[k] ].weight )
-          error( "Queue heap is disordered." ); */
-
-  /* Unlock the queue. */
-  if (lock_unlock(&q->lock) != 0) error("Failed to unlock queue.");
+  /* Increase the incoming count. */
+  atomic_inc(&q->count_incoming);
 }
 
 /**
@@ -104,7 +128,6 @@ void queue_insert(struct queue *q, struct task *t) {
  * @param q The #queue.
  * @param tasks List of tasks to which the queue indices refer to.
  */
-
 void queue_init(struct queue *q, struct task *tasks) {
 
   /* Allocate the task list if needed. */
@@ -120,6 +143,17 @@ void queue_init(struct queue *q, struct task *tasks) {
 
   /* Init the queue lock. */
   if (lock_init(&q->lock) != 0) error("Failed to init queue lock.");
+
+  /* Init the incoming DEQ. */
+  if ((q->tid_incoming = (int *)malloc(sizeof(int) * queue_incoming_size)) ==
+      NULL)
+    error("Failed to allocate queue incoming buffer.");
+  for (int k = 0; k < queue_incoming_size; k++) {
+    q->tid_incoming[k] = -1;
+  }
+  q->first_incoming = 0;
+  q->last_incoming = 0;
+  q->count_incoming = 0;
 }
 
 /**
@@ -129,7 +163,6 @@ void queue_init(struct queue *q, struct task *tasks) {
  * @param prev The previous #task extracted from this #queue.
  * @param blocking Block until access to the queue is granted.
  */
-
 struct task *queue_gettask(struct queue *q, const struct task *prev,
                            int blocking) {
 
@@ -143,6 +176,9 @@ struct task *queue_gettask(struct queue *q, const struct task *prev,
     if (lock_trylock(qlock) != 0) return NULL;
   }
 
+  /* Fill any tasks from the incoming DEQ. */
+  queue_get_incoming(q);
+
   /* If there are no tasks, leave immediately. */
   if (q->count == 0) {
     lock_unlock_blind(qlock);
diff --git a/src/queue.h b/src/queue.h
index b28039a30724fa0c086f9111470af2566933e3ac..5878866c890f53f22c3deaac7fe9b6bba75d499e 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -29,6 +29,7 @@
 #define queue_sizeinit 100
 #define queue_sizegrow 2
 #define queue_search_window 8
+#define queue_incoming_size 1024
 
 /* Counters. */
 enum {
@@ -52,6 +53,10 @@ struct queue {
   /* The task indices. */
   int *tid;
 
+  /* DEQ for incoming tasks. */
+  int *tid_incoming;
+  volatile unsigned int first_incoming, last_incoming, count_incoming;
+
 } __attribute__((aligned(64)));
 
 /* Function prototypes. */
diff --git a/src/riemann/riemann_trrs.h b/src/riemann/riemann_trrs.h
index 56f7feadae6ea89cec742e298a269e787d58e7b3..efdbfb59877c09a59d535a4785ad74620c0f3651 100644
--- a/src/riemann/riemann_trrs.h
+++ b/src/riemann/riemann_trrs.h
@@ -40,9 +40,9 @@
  * By assuming 2 rarefaction waves, we can analytically solve for the pressure
  * and velocity in the intermediate region, eliminating the iterative procedure.
  *
- * According to Toro: "The two-rarefaction approximation is generally quite
+ * According to Toro: 'The two-rarefaction approximation is generally quite
  * robust; (...) The TRRS is in fact exact when both non-linear waves are
- * actually rarefaction waves."
+ * actually rarefaction waves.'
  *
  * @param WL The left state vector
  * @param WR The right state vector
diff --git a/src/runner.c b/src/runner.c
index 25bd27f109b09f93fafe78fe32350a2dceea0187..c06068c357e8bd1efe6c0278a9a94f48099b2fef 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -40,6 +40,7 @@
 /* Local headers. */
 #include "approx_math.h"
 #include "atomic.h"
+#include "cell.h"
 #include "const.h"
 #include "debug.h"
 #include "drift.h"
@@ -57,46 +58,20 @@
 #include "timestep.h"
 
 /* Orientation of the cell pairs */
-const float runner_shift[13 * 3] = {
-    5.773502691896258e-01,
-    5.773502691896258e-01,
-    5.773502691896258e-01,
-    7.071067811865475e-01,
-    7.071067811865475e-01,
-    0.0,
-    5.773502691896258e-01,
-    5.773502691896258e-01,
-    -5.773502691896258e-01,
-    7.071067811865475e-01,
-    0.0,
-    7.071067811865475e-01,
-    1.0,
-    0.0,
-    0.0,
-    7.071067811865475e-01,
-    0.0,
-    -7.071067811865475e-01,
-    5.773502691896258e-01,
-    -5.773502691896258e-01,
-    5.773502691896258e-01,
-    7.071067811865475e-01,
-    -7.071067811865475e-01,
-    0.0,
-    5.773502691896258e-01,
-    -5.773502691896258e-01,
-    -5.773502691896258e-01,
-    0.0,
-    7.071067811865475e-01,
-    7.071067811865475e-01,
-    0.0,
-    1.0,
-    0.0,
-    0.0,
-    7.071067811865475e-01,
-    -7.071067811865475e-01,
-    0.0,
-    0.0,
-    1.0,
+const double runner_shift[13][3] = {
+    {5.773502691896258e-01, 5.773502691896258e-01, 5.773502691896258e-01},
+    {7.071067811865475e-01, 7.071067811865475e-01, 0.0},
+    {5.773502691896258e-01, 5.773502691896258e-01, -5.773502691896258e-01},
+    {7.071067811865475e-01, 0.0, 7.071067811865475e-01},
+    {1.0, 0.0, 0.0},
+    {7.071067811865475e-01, 0.0, -7.071067811865475e-01},
+    {5.773502691896258e-01, -5.773502691896258e-01, 5.773502691896258e-01},
+    {7.071067811865475e-01, -7.071067811865475e-01, 0.0},
+    {5.773502691896258e-01, -5.773502691896258e-01, -5.773502691896258e-01},
+    {0.0, 7.071067811865475e-01, 7.071067811865475e-01},
+    {0.0, 1.0, 0.0},
+    {0.0, 7.071067811865475e-01, -7.071067811865475e-01},
+    {0.0, 0.0, 1.0},
 };
 
 /* Does the axis need flipping ? */
@@ -112,9 +87,6 @@ const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
 #define FUNCTION force
 #include "runner_doiact.h"
 
-/* Import the gravity loop functions. */
-#include "runner_doiact_grav.h"
-
 /**
  * @brief Calculate gravity acceleration from external potential
  *
@@ -124,8 +96,8 @@ const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  */
 void runner_do_grav_external(struct runner *r, struct cell *c, int timer) {
 
-  struct gpart *g, *gparts = c->gparts;
-  int i, k, gcount = c->gcount;
+  struct gpart *restrict gparts = c->gparts;
+  const int gcount = c->gcount;
   const int ti_current = r->e->ti_current;
   const struct external_potential *potential = r->e->external_potential;
   const struct phys_const *constants = r->e->physical_constants;
@@ -134,7 +106,7 @@ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) {
 
   /* Recurse? */
   if (c->split) {
-    for (k = 0; k < 8; k++)
+    for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL) runner_do_grav_external(r, c->progeny[k], 0);
     return;
   }
@@ -144,10 +116,10 @@ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) {
 #endif
 
   /* Loop over the parts in this cell. */
-  for (i = 0; i < gcount; i++) {
+  for (int i = 0; i < gcount; i++) {
 
     /* Get a direct pointer on the part. */
-    g = &gparts[i];
+    struct gpart *const g = &gparts[i];
 
     /* Is this part within the time step? */
     if (g->ti_end <= ti_current) {
@@ -164,7 +136,6 @@ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) {
  * @param sort The entries
  * @param N The number of entries.
  */
-
 void runner_do_sort_ascending(struct entry *sort, int N) {
 
   struct {
@@ -246,7 +217,6 @@ void runner_do_sort_ascending(struct entry *sort, int N) {
  * @param clock Flag indicating whether to record the timing or not, needed
  *      for recursive calls.
  */
-
 void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
 
   struct entry *finger;
@@ -255,8 +225,8 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
   struct entry *sort;
   int j, k, count = c->count;
   int i, ind, off[8], inds[8], temp_i, missing;
-  // float shift[3];
-  float buff[8], px[3];
+  float buff[8];
+  double px[3];
 
   TIMER_TIC
 
@@ -360,9 +330,9 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
       for (j = 0; j < 13; j++)
         if (flags & (1 << j)) {
           sort[j * (count + 1) + k].i = k;
-          sort[j * (count + 1) + k].d = px[0] * runner_shift[3 * j + 0] +
-                                        px[1] * runner_shift[3 * j + 1] +
-                                        px[2] * runner_shift[3 * j + 2];
+          sort[j * (count + 1) + k].d = px[0] * runner_shift[j][0] +
+                                        px[1] * runner_shift[j][1] +
+                                        px[2] * runner_shift[j][2];
         }
     }
 
@@ -376,163 +346,18 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
       }
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify the sorting. */
-  /* for ( j = 0 ; j < 13 ; j++ ) {
-      if ( !( flags & (1 << j) ) )
-          continue;
-      finger = &sort[ j*(count + 1) ];
-      for ( k = 1 ; k < count ; k++ ) {
-          if ( finger[k].d < finger[k-1].d )
-              error( "Sorting failed, ascending array." );
-          if ( finger[k].i >= count )
-              error( "Sorting failed, indices borked." );
-          }
-      } */
-
-  if (clock) TIMER_TOC(timer_dosort);
-}
-
-void runner_do_gsort(struct runner *r, struct cell *c, int flags, int clock) {
-
-  struct entry *finger;
-  struct entry *fingers[8];
-  struct gpart *gparts = c->gparts;
-  struct entry *gsort;
-  int j, k, count = c->gcount;
-  int i, ind, off[8], inds[8], temp_i, missing;
-  // float shift[3];
-  float buff[8], px[3];
-
-  TIMER_TIC
-
-  /* Clean-up the flags, i.e. filter out what's already been sorted. */
-  flags &= ~c->gsorted;
-  if (flags == 0) return;
-
-  /* start by allocating the entry arrays. */
-  if (c->gsort == NULL || c->gsortsize < count) {
-    if (c->gsort != NULL) free(c->gsort);
-    c->gsortsize = count * 1.1;
-    if ((c->gsort = (struct entry *)malloc(sizeof(struct entry) *
-                                           (c->gsortsize + 1) * 13)) == NULL)
-      error("Failed to allocate sort memory.");
-  }
-  gsort = c->gsort;
-
-  /* Does this cell have any progeny? */
-  if (c->split) {
-
-    /* Fill in the gaps within the progeny. */
-    for (k = 0; k < 8; k++) {
-      if (c->progeny[k] == NULL) continue;
-      missing = flags & ~c->progeny[k]->gsorted;
-      if (missing) runner_do_gsort(r, c->progeny[k], missing, 0);
+  for (j = 0; j < 13; j++) {
+    if (!(flags & (1 << j))) continue;
+    finger = &sort[j * (count + 1)];
+    for (k = 1; k < count; k++) {
+      if (finger[k].d < finger[k - 1].d)
+        error("Sorting failed, ascending array.");
+      if (finger[k].i >= count) error("Sorting failed, indices borked.");
     }
-
-    /* Loop over the 13 different sort arrays. */
-    for (j = 0; j < 13; j++) {
-
-      /* Has this sort array been flagged? */
-      if (!(flags & (1 << j))) continue;
-
-      /* Init the particle index offsets. */
-      for (off[0] = 0, k = 1; k < 8; k++)
-        if (c->progeny[k - 1] != NULL)
-          off[k] = off[k - 1] + c->progeny[k - 1]->gcount;
-        else
-          off[k] = off[k - 1];
-
-      /* Init the entries and indices. */
-      for (k = 0; k < 8; k++) {
-        inds[k] = k;
-        if (c->progeny[k] != NULL && c->progeny[k]->gcount > 0) {
-          fingers[k] = &c->progeny[k]->gsort[j * (c->progeny[k]->gcount + 1)];
-          buff[k] = fingers[k]->d;
-          off[k] = off[k];
-        } else
-          buff[k] = FLT_MAX;
-      }
-
-      /* Sort the buffer. */
-      for (i = 0; i < 7; i++)
-        for (k = i + 1; k < 8; k++)
-          if (buff[inds[k]] < buff[inds[i]]) {
-            temp_i = inds[i];
-            inds[i] = inds[k];
-            inds[k] = temp_i;
-          }
-
-      /* For each entry in the new sort list. */
-      finger = &gsort[j * (count + 1)];
-      for (ind = 0; ind < count; ind++) {
-
-        /* Copy the minimum into the new sort array. */
-        finger[ind].d = buff[inds[0]];
-        finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
-
-        /* Update the buffer. */
-        fingers[inds[0]] += 1;
-        buff[inds[0]] = fingers[inds[0]]->d;
-
-        /* Find the smallest entry. */
-        for (k = 1; k < 8 && buff[inds[k]] < buff[inds[k - 1]]; k++) {
-          temp_i = inds[k - 1];
-          inds[k - 1] = inds[k];
-          inds[k] = temp_i;
-        }
-
-      } /* Merge. */
-
-      /* Add a sentinel. */
-      gsort[j * (count + 1) + count].d = FLT_MAX;
-      gsort[j * (count + 1) + count].i = 0;
-
-      /* Mark as sorted. */
-      c->gsorted |= (1 << j);
-
-    } /* loop over sort arrays. */
-
-  } /* progeny? */
-
-  /* Otherwise, just sort. */
-  else {
-
-    /* Fill the sort array. */
-    for (k = 0; k < count; k++) {
-      px[0] = gparts[k].x[0];
-      px[1] = gparts[k].x[1];
-      px[2] = gparts[k].x[2];
-      for (j = 0; j < 13; j++)
-        if (flags & (1 << j)) {
-          gsort[j * (count + 1) + k].i = k;
-          gsort[j * (count + 1) + k].d = px[0] * runner_shift[3 * j + 0] +
-                                         px[1] * runner_shift[3 * j + 1] +
-                                         px[2] * runner_shift[3 * j + 2];
-        }
-    }
-
-    /* Add the sentinel and sort. */
-    for (j = 0; j < 13; j++)
-      if (flags & (1 << j)) {
-        gsort[j * (count + 1) + count].d = FLT_MAX;
-        gsort[j * (count + 1) + count].i = 0;
-        runner_do_sort_ascending(&gsort[j * (count + 1)], count);
-        c->gsorted |= (1 << j);
-      }
   }
-
-  /* Verify the sorting. */
-  /* for ( j = 0 ; j < 13 ; j++ ) {
-      if ( !( flags & (1 << j) ) )
-          continue;
-      finger = &c->gsort[ j*(count + 1) ];
-      for ( k = 1 ; k < count ; k++ ) {
-          if ( finger[k].d < finger[k-1].d )
-              error( "Sorting failed, ascending array." );
-          if ( finger[k].i < 0 || finger[k].i >= count )
-              error( "Sorting failed, indices borked." );
-          }
-      } */
+#endif
 
   if (clock) TIMER_TOC(timer_dosort);
 }
@@ -597,7 +422,6 @@ void runner_do_init(struct runner *r, struct cell *c, int timer) {
  * @param r The runner thread.
  * @param c The cell.
  */
-
 void runner_do_ghost(struct runner *r, struct cell *c) {
 
   struct part *p, *parts = c->parts;
@@ -724,8 +548,13 @@ void runner_do_ghost(struct runner *r, struct cell *c) {
 
           }
 
-          /* Otherwise, sub interaction? */
-          else if (l->t->type == task_type_sub) {
+          /* Otherwise, sub-self interaction? */
+          else if (l->t->type == task_type_sub_self)
+            runner_dosub_subset_density(r, finger, parts, pid, count, NULL, -1,
+                                        1);
+
+          /* Otherwise, sub-pair interaction? */
+          else if (l->t->type == task_type_sub_pair) {
 
             /* Left or right? */
             if (l->t->ci == finger)
@@ -1237,13 +1066,19 @@ void *runner_main(void *data) {
         case task_type_sort:
           runner_do_sort(r, ci, t->flags, 1);
           break;
-        case task_type_sub:
+        case task_type_sub_self:
           if (t->subtype == task_subtype_density)
-            runner_dosub1_density(r, ci, cj, t->flags, 1);
+            runner_dosub_self1_density(r, ci, 1);
           else if (t->subtype == task_subtype_force)
-            runner_dosub2_force(r, ci, cj, t->flags, 1);
-          else if (t->subtype == task_subtype_grav)
-            runner_dosub_grav(r, ci, cj, 1);
+            runner_dosub_self2_force(r, ci, 1);
+          else
+            error("Unknown task subtype.");
+          break;
+        case task_type_sub_pair:
+          if (t->subtype == task_subtype_density)
+            runner_dosub_pair1_density(r, ci, cj, t->flags, 1);
+          else if (t->subtype == task_subtype_force)
+            runner_dosub_pair2_force(r, ci, cj, t->flags, 1);
           else
             error("Unknown task subtype.");
           break;
@@ -1267,21 +1102,6 @@ void *runner_main(void *data) {
         case task_type_recv:
           runner_do_recv_cell(r, ci, 1);
           break;
-        case task_type_grav_pp:
-          if (t->cj == NULL)
-            runner_doself_grav(r, t->ci);
-          else
-            runner_dopair_grav(r, t->ci, t->cj);
-          break;
-        case task_type_grav_mm:
-          runner_dograv_mm(r, t->ci, t->cj);
-          break;
-        case task_type_grav_up:
-          runner_dograv_up(r, t->ci);
-          break;
-        case task_type_grav_down:
-          runner_dograv_down(r, t->ci);
-          break;
         case task_type_grav_external:
           runner_do_grav_external(r, t->ci, 1);
           break;
diff --git a/src/runner.h b/src/runner.h
index 35e5f56a7b94145eec656b02a1c5568ec39352b6..6838b959955c4e54e208b8d2d16339e7fdb1740f 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -23,13 +23,12 @@
 #ifndef SWIFT_RUNNER_H
 #define SWIFT_RUNNER_H
 
-/* Includes. */
-#include "cell.h"
-#include "inline.h"
-
-extern const float runner_shift[13 * 3];
+extern const double runner_shift[13][3];
 extern const char runner_flip[27];
 
+struct cell;
+struct engine;
+
 /* A struct representing a runner's thread and its data. */
 struct runner {
 
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 97e22138b6c09750c1737741f58e61744a891c98..4da83b940d55460c4bf8d2f650534aedf94dbd5d 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -1,7 +1,7 @@
-
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *               2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -18,10 +18,6 @@
  *
  ******************************************************************************/
 
-/* Includes. */
-#include "cell.h"
-#include "part.h"
-
 /* Before including this file, define FUNCTION, which is the
    name of the interaction function. This creates the interaction functions
    runner_dopair_FUNCTION, runner_dopair_FUNCTION_naive, runner_doself_FUNCTION,
@@ -57,11 +53,17 @@
 #define _DOSELF_SUBSET(f) PASTE(runner_doself_subset, f)
 #define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION)
 
-#define _DOSUB1(f) PASTE(runner_dosub1, f)
-#define DOSUB1 _DOSUB1(FUNCTION)
+#define _DOSUB_SELF1(f) PASTE(runner_dosub_self1, f)
+#define DOSUB_SELF1 _DOSUB_SELF1(FUNCTION)
+
+#define _DOSUB_PAIR1(f) PASTE(runner_dosub_pair1, f)
+#define DOSUB_PAIR1 _DOSUB_PAIR1(FUNCTION)
+
+#define _DOSUB_SELF2(f) PASTE(runner_dosub_self2, f)
+#define DOSUB_SELF2 _DOSUB_SELF2(FUNCTION)
 
-#define _DOSUB2(f) PASTE(runner_dosub2, f)
-#define DOSUB2 _DOSUB2(FUNCTION)
+#define _DOSUB_PAIR2(f) PASTE(runner_dosub_pair2, f)
+#define DOSUB_PAIR2 _DOSUB_PAIR2(FUNCTION)
 
 #define _DOSUB_SUBSET(f) PASTE(runner_dosub_subset, f)
 #define DOSUB_SUBSET _DOSUB_SUBSET(FUNCTION)
@@ -72,14 +74,23 @@
 #define _IACT(f) PASTE(runner_iact, f)
 #define IACT _IACT(FUNCTION)
 
+#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f)
+#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION)
+
+#define _IACT_VEC(f) PASTE(runner_iact_vec, f)
+#define IACT_VEC _IACT_VEC(FUNCTION)
+
 #define _TIMER_DOSELF(f) PASTE(timer_doself, f)
 #define TIMER_DOSELF _TIMER_DOSELF(FUNCTION)
 
 #define _TIMER_DOPAIR(f) PASTE(timer_dopair, f)
 #define TIMER_DOPAIR _TIMER_DOPAIR(FUNCTION)
 
-#define _TIMER_DOSUB(f) PASTE(timer_dosub, f)
-#define TIMER_DOSUB _TIMER_DOSUB(FUNCTION)
+#define _TIMER_DOSUB_SELF(f) PASTE(timer_dosub_self, f)
+#define TIMER_DOSUB_SELF _TIMER_DOSUB_SELF(FUNCTION)
+
+#define _TIMER_DOSUB_PAIR(f) PASTE(timer_dosub_pair, f)
+#define TIMER_DOSUB_PAIR _TIMER_DOSUB_PAIR(FUNCTION)
 
 #define _TIMER_DOSELF_SUBSET(f) PASTE(timer_doself_subset, f)
 #define TIMER_DOSELF_SUBSET _TIMER_DOSELF_SUBSET(FUNCTION)
@@ -87,12 +98,6 @@
 #define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset, f)
 #define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION)
 
-#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f)
-#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION)
-
-#define _IACT_VEC(f) PASTE(runner_iact_vec, f)
-#define IACT_VEC _IACT_VEC(FUNCTION)
-
 /**
  * @brief Compute the interactions between a cell pair.
  *
@@ -100,18 +105,14 @@
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
-
 void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
                   struct cell *restrict cj) {
 
-  struct engine *e = r->e;
-  int pid, pjd, k, count_i = ci->count, count_j = cj->count;
-  double shift[3] = {0.0, 0.0, 0.0};
-  struct part *restrict parts_i = ci->parts, *restrict parts_j = cj->parts;
-  struct part *restrict pi, *restrict pj;
-  double pix[3];
-  float dx[3], hi, hig2, r2;
+  const struct engine *e = r->e;
   const int ti_current = e->ti_current;
+
+  error("Don't use in actual runs ! Slow code !");
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -120,44 +121,47 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+
   /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
     if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
       shift[k] = e->s->dim[k];
     else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
       shift[k] = -e->s->dim[k];
   }
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
-
   /* Loop over the parts in ci. */
-  for (pid = 0; pid < count_i; pid++) {
+  for (int pid = 0; pid < count_i; pid++) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts_i[pid];
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    struct part *restrict pi = &parts_i[pid];
+    const float hi = pi->h;
+
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_j; pjd++) {
+    for (int pjd = 0; pjd < count_j; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts_j[pjd];
+      struct part *restrict pj = &parts_j[pjd];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -198,7 +202,7 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -207,12 +211,10 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
 
 void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
 
-  int pid, pjd, k, count = c->count;
-  struct part *restrict parts = c->parts;
-  struct part *restrict pi, *restrict pj;
-  double pix[3] = {0.0, 0.0, 0.0};
-  float dx[3], hi, hig2, r2;
   const int ti_current = r->e->ti_current;
+
+  error("Don't use in actual runs ! Slow code !");
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -221,38 +223,34 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (c->ti_end_min > ti_current) return;
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
+  const int count = c->count;
+  struct part *restrict parts = c->parts;
 
   /* Loop over the parts in ci. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts[pid];
-    pix[0] = pi->x[0];
-    pix[1] = pi->x[1];
-    pix[2] = pi->x[2];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    struct part *restrict pi = &parts[pid];
+    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = pid + 1; pjd < count; pjd++) {
+    for (int pjd = pid + 1; pjd < count; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts[pjd];
+      struct part *restrict pj = &parts[pjd];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -293,7 +291,7 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -311,18 +309,121 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
  * @param count The number of particles in @c ind.
  * @param cj The second #cell.
  */
+void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
+                         struct part *restrict parts_i, int *restrict ind,
+                         int count, struct cell *restrict cj) {
+
+  struct engine *e = r->e;
+
+  error("Don't use in actual runs ! Slow code !");
+
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+
+  TIMER_TIC
+
+  const int count_j = cj->count;
+  struct part *restrict parts_j = cj->parts;
+
+  /* Get the relative distance between the pairs, wrapping. */
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
+    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
+      shift[k] = e->s->dim[k];
+    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
+      shift[k] = -e->s->dim[k];
+  }
+
+  /* Loop over the parts_i. */
+  for (int pid = 0; pid < count; pid++) {
+
+    /* Get a hold of the ith part in ci. */
+    struct part *restrict pi = &parts_i[ind[pid]];
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
+
+    /* Loop over the parts in cj. */
+    for (int pjd = 0; pjd < count_j; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      struct part *restrict pj = &parts_j[pjd];
+
+      /* Compute the pairwise distance. */
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 < hig2) {
+
+#ifndef VECTORIZE
+
+        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = dx[0];
+        dxq[3 * icount + 1] = dx[1];
+        dxq[3 * icount + 2] = dx[2];
+        hiq[icount] = hi;
+        hjq[icount] = pj->h;
+        piq[icount] = pi;
+        pjq[icount] = pj;
+        icount += 1;
+
+        /* Flush? */
+        if (icount == VEC_SIZE) {
+          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
+
+#endif
+      }
+
+    } /* loop over the parts in cj. */
+
+  } /* loop over the parts in ci. */
+
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (int k = 0; k < icount; k++)
+      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+#endif
+
+  TIMER_TOC(timer_dopair_subset);
+}
 
+/**
+ * @brief Compute the interactions between a cell pair, but only for the
+ *      given indices in ci.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param parts_i The #part to interact with @c cj.
+ * @param ind The list of indices of particles in @c ci to interact with.
+ * @param count The number of particles in @c ind.
+ * @param cj The second #cell.
+ */
 void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
                    struct part *restrict parts_i, int *restrict ind, int count,
                    struct cell *restrict cj) {
 
   struct engine *e = r->e;
-  int pid, pjd, sid, k, count_j = cj->count, flipped;
-  double shift[3] = {0.0, 0.0, 0.0};
-  struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
-  double pix[3];
-  float dx[3], hi, hig2, r2, di, dxj;
-  struct entry *sort_j;
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -331,10 +432,15 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
+  const int count_j = cj->count;
+  struct part *restrict parts_j = cj->parts;
+
   /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
     if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
       shift[k] = e->s->dim[k];
     else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
@@ -342,53 +448,50 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   }
 
   /* Get the sorting index. */
-  for (sid = 0, k = 0; k < 3; k++)
+  int sid = 0;
+  for (int k = 0; k < 3; k++)
     sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
                          ? 0
                          : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);
 
   /* Switch the cells around? */
-  flipped = runner_flip[sid];
+  const int flipped = runner_flip[sid];
   sid = sortlistID[sid];
 
   /* Have the cells been sorted? */
   if (!(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells.");
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
-
   /* Pick-out the sorted lists. */
-  sort_j = &cj->sort[sid * (cj->count + 1)];
-  dxj = cj->dx_max;
+  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
+  const float dxj = cj->dx_max;
 
   /* Parts are on the left? */
   if (!flipped) {
 
     /* Loop over the parts_i. */
-    for (pid = 0; pid < count; pid++) {
+    for (int pid = 0; pid < count; pid++) {
 
       /* Get a hold of the ith part in ci. */
-      pi = &parts_i[ind[pid]];
-      for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-      hi = pi->h;
-      hig2 = hi * hi * kernel_gamma2;
-      di = hi * kernel_gamma + dxj + pix[0] * runner_shift[3 * sid + 0] +
-           pix[1] * runner_shift[3 * sid + 1] +
-           pix[2] * runner_shift[3 * sid + 2];
+      struct part *restrict pi = &parts_i[ind[pid]];
+      double pix[3];
+      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+
+      const float hi = pi->h;
+      const float hig2 = hi * hi * kernel_gamma2;
+      const float di = hi * kernel_gamma + dxj + pix[0] * runner_shift[sid][0] +
+                       pix[1] * runner_shift[sid][1] +
+                       pix[2] * runner_shift[sid][2];
 
       /* Loop over the parts in cj. */
-      for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sort_j[pjd].i];
+        struct part *restrict pj = &parts_j[sort_j[pjd].i];
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -432,26 +535,28 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   else {
 
     /* Loop over the parts_i. */
-    for (pid = 0; pid < count; pid++) {
+    for (int pid = 0; pid < count; pid++) {
 
       /* Get a hold of the ith part in ci. */
-      pi = &parts_i[ind[pid]];
-      for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-      hi = pi->h;
-      hig2 = hi * hi * kernel_gamma2;
-      di = -hi * kernel_gamma - dxj + pix[0] * runner_shift[3 * sid + 0] +
-           pix[1] * runner_shift[3 * sid + 1] +
-           pix[2] * runner_shift[3 * sid + 2];
+      struct part *restrict pi = &parts_i[ind[pid]];
+      double pix[3];
+      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+      const float hi = pi->h;
+      const float hig2 = hi * hi * kernel_gamma2;
+      const float di =
+          -hi * kernel_gamma - dxj + pix[0] * runner_shift[sid][0] +
+          pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2];
 
       /* Loop over the parts in cj. */
-      for (pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) {
+      for (int pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sort_j[pjd].i];
+        struct part *restrict pj = &parts_j[sort_j[pjd].i];
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -493,119 +598,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
-      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
-#endif
-
-  TIMER_TOC(timer_dopair_subset);
-}
-
-/**
- * @brief Compute the interactions between a cell pair, but only for the
- *      given indices in ci.
- *
- * @param r The #runner.
- * @param ci The first #cell.
- * @param parts_i The #part to interact with @c cj.
- * @param ind The list of indices of particles in @c ci to interact with.
- * @param count The number of particles in @c ind.
- * @param cj The second #cell.
- */
-
-void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
-                         struct part *restrict parts_i, int *restrict ind,
-                         int count, struct cell *restrict cj) {
-
-  struct engine *e = r->e;
-  int pid, pjd, k, count_j = cj->count;
-  double shift[3] = {0.0, 0.0, 0.0};
-  struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
-  double pix[3];
-  float dx[3], hi, hig2, r2;
-#ifdef VECTORIZE
-  int icount = 0;
-  float r2q[VEC_SIZE] __attribute__((aligned(16)));
-  float hiq[VEC_SIZE] __attribute__((aligned(16)));
-  float hjq[VEC_SIZE] __attribute__((aligned(16)));
-  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
-  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
-#endif
-  TIMER_TIC
-
-  /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
-    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
-      shift[k] = e->s->dim[k];
-    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
-      shift[k] = -e->s->dim[k];
-  }
-
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
-
-  /* Loop over the parts_i. */
-  for (pid = 0; pid < count; pid++) {
-
-    /* Get a hold of the ith part in ci. */
-    pi = &parts_i[ind[pid]];
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
-
-    /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_j; pjd++) {
-
-      /* Get a pointer to the jth particle. */
-      pj = &parts_j[pjd];
-
-      /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
-        dx[k] = pix[k] - pj->x[k];
-        r2 += dx[k] * dx[k];
-      }
-
-      /* Hit or miss? */
-      if (r2 < hig2) {
-
-#ifndef VECTORIZE
-
-        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
-
-#else
-
-        /* Add this interaction to the queue. */
-        r2q[icount] = r2;
-        dxq[3 * icount + 0] = dx[0];
-        dxq[3 * icount + 1] = dx[1];
-        dxq[3 * icount + 2] = dx[2];
-        hiq[icount] = hi;
-        hjq[icount] = pj->h;
-        piq[icount] = pi;
-        pjq[icount] = pj;
-        icount += 1;
-
-        /* Flush? */
-        if (icount == VEC_SIZE) {
-          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
-          icount = 0;
-        }
-
-#endif
-      }
-
-    } /* loop over the parts in cj. */
-
-  } /* loop over the parts in ci. */
-
-#ifdef VECTORIZE
-  /* Pick up any leftovers. */
-  if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -622,15 +615,9 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
  * @param ind The list of indices of particles in @c ci to interact with.
  * @param count The number of particles in @c ind.
  */
-
 void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
                    struct part *restrict parts, int *restrict ind, int count) {
 
-  int pid, pjd, k, count_i = ci->count;
-  struct part *restrict parts_j = ci->parts;
-  struct part *restrict pi, *restrict pj;
-  double pix[3] = {0.0, 0.0, 0.0};
-  float dx[3], hi, hig2, r2;
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -639,35 +626,31 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
+  const int count_i = ci->count;
+  struct part *restrict parts_j = ci->parts;
 
   /* Loop over the parts in ci. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts[ind[pid]];
-    pix[0] = pi->x[0];
-    pix[1] = pi->x[1];
-    pix[2] = pi->x[2];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    struct part *restrict pi = &parts[ind[pid]];
+    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_i; pjd++) {
+    for (int pjd = 0; pjd < count_i; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts_j[pjd];
+      struct part *restrict pj = &parts_j[pjd];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -708,7 +691,7 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -716,26 +699,17 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
 }
 
 /**
- * @brief Compute the interactions between a cell pair.
+ * @brief Compute the interactions between a cell pair (non-symmetric).
  *
  * @param r The #runner.
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
-
 void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
 
   struct engine *restrict e = r->e;
-  int pid, pjd, k, sid;
-  double rshift, shift[3] = {0.0, 0.0, 0.0};
-  struct entry *restrict sort_i, *restrict sort_j;
-  struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
-  double pix[3], pjx[3], di, dj;
-  float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
-  double hi_max, hj_max;
-  double di_max, dj_min;
-  int count_i, count_j;
   const int ti_current = e->ti_current;
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -744,60 +718,64 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
   /* Get the sort ID. */
-  sid = space_getsid(e->s, &ci, &cj, shift);
+  double shift[3] = {0.0, 0.0, 0.0};
+  const int sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Have the cells been sorted? */
   if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
     error("Trying to interact unsorted cells.");
 
   /* Get the cutoff shift. */
-  for (rshift = 0.0, k = 0; k < 3; k++)
-    rshift += shift[k] * runner_shift[3 * sid + k];
+  double rshift = 0.0;
+  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
 
   /* Pick-out the sorted lists. */
-  sort_i = &ci->sort[sid * (ci->count + 1)];
-  sort_j = &cj->sort[sid * (cj->count + 1)];
+  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
+  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
 
   /* Get some other useful values. */
-  hi_max = ci->h_max * kernel_gamma - rshift;
-  hj_max = cj->h_max * kernel_gamma;
-  count_i = ci->count;
-  count_j = cj->count;
-  parts_i = ci->parts;
-  parts_j = cj->parts;
-  di_max = sort_i[count_i - 1].d - rshift;
-  dj_min = sort_j[0].d;
-  dx_max = (ci->dx_max + cj->dx_max);
+  const double hi_max = ci->h_max * kernel_gamma - rshift;
+  const double hj_max = cj->h_max * kernel_gamma;
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+  const double di_max = sort_i[count_i - 1].d - rshift;
+  const double dj_min = sort_j[0].d;
+  const float dx_max = (ci->dx_max + cj->dx_max);
 
   /* Loop over the parts in ci. */
-  for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min;
-       pid--) {
+  for (int pid = count_i - 1;
+       pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts_i[sort_i[pid].i];
+    struct part *restrict pi = &parts_i[sort_i[pid].i];
     if (pi->ti_end > ti_current) continue;
-    hi = pi->h;
-    di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
+    const float hi = pi->h;
+    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
     if (di < dj_min) continue;
 
-    hig2 = hi * hi * kernel_gamma2;
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+    for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts_j[sort_j[pjd].i];
+      struct part *restrict pj = &parts_j[sort_j[pjd].i];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -835,33 +813,31 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
 
   } /* loop over the parts in ci. */
 
-  /* printf( "runner_dopair: first half took %.3f %s...\n" ,
-  clocks_from_ticks(getticks() - tic), clocks_getunit());
-  tic = getticks(); */
-
   /* Loop over the parts in cj. */
-  for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
+  for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
        pjd++) {
 
     /* Get a hold of the jth part in cj. */
-    pj = &parts_j[sort_j[pjd].i];
+    struct part *restrict pj = &parts_j[sort_j[pjd].i];
     if (pj->ti_end > ti_current) continue;
-    hj = pj->h;
-    dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
+    const float hj = pj->h;
+    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
     if (dj > di_max) continue;
 
-    for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
-    hjg2 = hj * hj * kernel_gamma2;
+    double pjx[3];
+    for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
+    const float hjg2 = hj * hj * kernel_gamma2;
 
     /* Loop over the parts in ci. */
-    for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
+    for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
 
       /* Get a pointer to the jth particle. */
-      pi = &parts_i[sort_i[pid].i];
+      struct part *restrict pi = &parts_i[sort_i[pid].i];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pjx[k] - pi->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -902,28 +878,25 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
   TIMER_TOC(TIMER_DOPAIR);
 }
 
+/**
+ * @brief Compute the interactions between a cell pair (symmetric)
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
 void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 
   struct engine *restrict e = r->e;
-  int pid, pjd, k, sid;
-  double rshift, shift[3] = {0.0, 0.0, 0.0};
-  struct entry *sort_i, *sort_j;
-  struct entry *sortdt_i = NULL, *sortdt_j = NULL;
-  int countdt_i = 0, countdt_j = 0;
-  struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
-  double pix[3], pjx[3], di, dj;
-  float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
-  double hi_max, hj_max;
-  double di_max, dj_min;
-  int count_i, count_j;
   const int ti_current = e->ti_current;
+
 #ifdef VECTORIZE
   int icount1 = 0;
   float r2q1[VEC_SIZE] __attribute__((aligned(16)));
@@ -938,38 +911,42 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   float dxq2[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
   /* Get the shift ID. */
-  sid = space_getsid(e->s, &ci, &cj, shift);
+  double shift[3] = {0.0, 0.0, 0.0};
+  const int sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Have the cells been sorted? */
   if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
     error("Trying to interact unsorted cells.");
 
   /* Get the cutoff shift. */
-  for (rshift = 0.0, k = 0; k < 3; k++)
-    rshift += shift[k] * runner_shift[3 * sid + k];
+  double rshift = 0.0;
+  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
 
   /* Pick-out the sorted lists. */
-  sort_i = &ci->sort[sid * (ci->count + 1)];
-  sort_j = &cj->sort[sid * (cj->count + 1)];
+  struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
+  struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
 
   /* Get some other useful values. */
-  hi_max = ci->h_max * kernel_gamma - rshift;
-  hj_max = cj->h_max * kernel_gamma;
-  count_i = ci->count;
-  count_j = cj->count;
-  parts_i = ci->parts;
-  parts_j = cj->parts;
-  di_max = sort_i[count_i - 1].d - rshift;
-  dj_min = sort_j[0].d;
-  dx_max = (ci->dx_max + cj->dx_max);
+  const double hi_max = ci->h_max * kernel_gamma - rshift;
+  const double hj_max = cj->h_max * kernel_gamma;
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+  const double di_max = sort_i[count_i - 1].d - rshift;
+  const double dj_min = sort_j[0].d;
+  const double dx_max = (ci->dx_max + cj->dx_max);
 
   /* Collect the number of parts left and right below dt. */
+  int countdt_i = 0, countdt_j = 0;
+  struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL;
   if (ci->ti_end_max <= ti_current) {
     sortdt_i = sort_i;
     countdt_i = count_i;
@@ -977,7 +954,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     if ((sortdt_i = (struct entry *)alloca(sizeof(struct entry) * count_i)) ==
         NULL)
       error("Failed to allocate dt sortlists.");
-    for (k = 0; k < count_i; k++)
+    for (int k = 0; k < count_i; k++)
       if (parts_i[sort_i[k].i].ti_end <= ti_current) {
         sortdt_i[countdt_i] = sort_i[k];
         countdt_i += 1;
@@ -990,7 +967,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     if ((sortdt_j = (struct entry *)alloca(sizeof(struct entry) * count_j)) ==
         NULL)
       error("Failed to allocate dt sortlists.");
-    for (k = 0; k < count_j; k++)
+    for (int k = 0; k < count_j; k++)
       if (parts_j[sort_j[k].i].ti_end <= ti_current) {
         sortdt_j[countdt_j] = sort_j[k];
         countdt_j += 1;
@@ -998,31 +975,33 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   }
 
   /* Loop over the parts in ci. */
-  for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min;
-       pid--) {
+  for (int pid = count_i - 1;
+       pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts_i[sort_i[pid].i];
-    hi = pi->h;
-    di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
+    struct part *restrict pi = &parts_i[sort_i[pid].i];
+    const float hi = pi->h;
+    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
     if (di < dj_min) continue;
 
-    hig2 = hi * hi * kernel_gamma2;
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Look at valid dt parts only? */
     if (pi->ti_end > ti_current) {
 
       /* Loop over the parts in cj within dt. */
-      for (pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) {
+      for (int pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sortdt_j[pjd].i];
-        hj = pj->h;
+        struct part *restrict pj = &parts_j[sortdt_j[pjd].i];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pj->x[k] - pix[k];
           r2 += dx[k] * dx[k];
         }
@@ -1064,15 +1043,16 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     else {
 
       /* Loop over the parts in cj. */
-      for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sort_j[pjd].i];
-        hj = pj->h;
+        struct part *restrict pj = &parts_j[sort_j[pjd].i];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -1138,36 +1118,34 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 
   } /* loop over the parts in ci. */
 
-  /* printf( "runner_dopair: first half took %.3f %s...\n" ,
-  clocks_from_ticks(getticks() - tic), clocks_getunit());
-  tic = getticks(); */
-
   /* Loop over the parts in cj. */
-  for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
+  for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
        pjd++) {
 
     /* Get a hold of the jth part in cj. */
-    pj = &parts_j[sort_j[pjd].i];
-    hj = pj->h;
-    dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
+    struct part *restrict pj = &parts_j[sort_j[pjd].i];
+    const float hj = pj->h;
+    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
     if (dj > di_max) continue;
 
-    for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
-    hjg2 = hj * hj * kernel_gamma2;
+    double pjx[3];
+    for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
+    const float hjg2 = hj * hj * kernel_gamma2;
 
     /* Is this particle outside the dt? */
     if (pj->ti_end > ti_current) {
 
       /* Loop over the parts in ci. */
-      for (pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) {
+      for (int pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) {
 
         /* Get a pointer to the jth particle. */
-        pi = &parts_i[sortdt_i[pid].i];
-        hi = pi->h;
+        struct part *restrict pi = &parts_i[sortdt_i[pid].i];
+        const float hi = pi->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pi->x[k] - pjx[k];
           r2 += dx[k] * dx[k];
         }
@@ -1208,15 +1186,16 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     else {
 
       /* Loop over the parts in ci. */
-      for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
+      for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
 
         /* Get a pointer to the jth particle. */
-        pi = &parts_i[sort_i[pid].i];
-        hi = pi->h;
+        struct part *restrict pi = &parts_i[sort_i[pid].i];
+        const float hi = pi->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pjx[k] - pi->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -1285,10 +1264,10 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount1 > 0)
-    for (k = 0; k < icount1; k++)
+    for (int k = 0; k < icount1; k++)
       IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
   if (icount2 > 0)
-    for (k = 0; k < icount2; k++)
+    for (int k = 0; k < icount2; k++)
       IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
 #endif
 
@@ -1296,20 +1275,15 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 }
 
 /**
- * @brief Compute the cell self-interaction.
+ * @brief Compute the cell self-interaction (non-symmetric).
  *
  * @param r The #runner.
  * @param c The #cell.
  */
-
 void DOSELF1(struct runner *r, struct cell *restrict c) {
 
-  int k, pid, pjd, count = c->count;
-  double pix[3];
-  float dx[3], hi, hj, hig2, r2;
-  struct part *restrict parts = c->parts, *restrict pi, *restrict pj;
   const int ti_current = r->e->ti_current;
-  int firstdt = 0, countdt = 0, *indt = NULL, doj;
+
 #ifdef VECTORIZE
   int icount1 = 0;
   float r2q1[VEC_SIZE] __attribute__((aligned(16)));
@@ -1326,43 +1300,49 @@ void DOSELF1(struct runner *r, struct cell *restrict c) {
 #endif
   TIMER_TIC
 
-  /* Set up indt if needed. */
-  if (c->ti_end_min > ti_current)
-    return;
-  else if (c->ti_end_max > ti_current) {
-    if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
-      error("Failed to allocate indt.");
-    for (k = 0; k < count; k++)
-      if (parts[k].ti_end <= ti_current) {
-        indt[countdt] = k;
-        countdt += 1;
-      }
-  }
+  if (c->ti_end_min > ti_current) return;
+  if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone");
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  /* Set up indt. */
+  int *indt = NULL;
+  int countdt = 0, firstdt = 0;
+  if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
+    error("Failed to allocate indt.");
+  for (int k = 0; k < count; k++)
+    if (parts[k].ti_end <= ti_current) {
+      indt[countdt] = k;
+      countdt += 1;
+    }
 
   /* Loop over the particles in the cell. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a pointer to the ith particle. */
-    pi = &parts[pid];
+    struct part *restrict pi = &parts[pid];
 
     /* Get the particle position and radius. */
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k];
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Is the ith particle inactive? */
     if (pi->ti_end > ti_current) {
 
       /* Loop over the other particles .*/
-      for (pjd = firstdt; pjd < countdt; pjd++) {
+      for (int pjd = firstdt; pjd < countdt; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[indt[pjd]];
-        hj = pj->h;
+        struct part *restrict pj = &parts[indt[pjd]];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pj->x[k] - pix[k];
           r2 += dx[k] * dx[k];
         }
@@ -1407,19 +1387,21 @@ void DOSELF1(struct runner *r, struct cell *restrict c) {
       firstdt += 1;
 
       /* Loop over the other particles .*/
-      for (pjd = pid + 1; pjd < count; pjd++) {
+      for (int pjd = pid + 1; pjd < count; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[pjd];
-        hj = pj->h;
+        struct part *restrict pj = &parts[pjd];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
-        doj = (pj->ti_end <= ti_current) && (r2 < hj * hj * kernel_gamma2);
+        const int doj =
+            (pj->ti_end <= ti_current) && (r2 < hj * hj * kernel_gamma2);
 
         /* Hit or miss? */
         if (r2 < hig2 || doj) {
@@ -1510,24 +1492,26 @@ void DOSELF1(struct runner *r, struct cell *restrict c) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount1 > 0)
-    for (k = 0; k < icount1; k++)
+    for (int k = 0; k < icount1; k++)
       IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
   if (icount2 > 0)
-    for (k = 0; k < icount2; k++)
+    for (int k = 0; k < icount2; k++)
       IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
 #endif
 
   TIMER_TOC(TIMER_DOSELF);
 }
 
+/**
+ * @brief Compute the cell self-interaction (symmetric).
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
 void DOSELF2(struct runner *r, struct cell *restrict c) {
 
-  int k, pid, pjd, count = c->count;
-  double pix[3];
-  float dx[3], hi, hj, hig2, r2;
-  struct part *restrict parts = c->parts, *restrict pi, *restrict pj;
   const int ti_current = r->e->ti_current;
-  int firstdt = 0, countdt = 0, *indt = NULL;
+
 #ifdef VECTORIZE
   int icount1 = 0;
   float r2q1[VEC_SIZE] __attribute__((aligned(16)));
@@ -1544,43 +1528,49 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
 #endif
   TIMER_TIC
 
-  /* Set up indt if needed. */
-  if (c->ti_end_min > ti_current)
-    return;
-  else if (c->ti_end_max > ti_current) {
-    if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
-      error("Failed to allocate indt.");
-    for (k = 0; k < count; k++)
-      if (parts[k].ti_end <= ti_current) {
-        indt[countdt] = k;
-        countdt += 1;
-      }
-  }
+  if (c->ti_end_min > ti_current) return;
+  if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone");
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  /* Set up indt. */
+  int *indt = NULL;
+  int countdt = 0, firstdt = 0;
+  if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
+    error("Failed to allocate indt.");
+  for (int k = 0; k < count; k++)
+    if (parts[k].ti_end <= ti_current) {
+      indt[countdt] = k;
+      countdt += 1;
+    }
 
   /* Loop over the particles in the cell. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a pointer to the ith particle. */
-    pi = &parts[pid];
+    struct part *restrict pi = &parts[pid];
 
     /* Get the particle position and radius. */
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k];
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Is the ith particle not active? */
     if (pi->ti_end > ti_current) {
 
       /* Loop over the other particles .*/
-      for (pjd = firstdt; pjd < countdt; pjd++) {
+      for (int pjd = firstdt; pjd < countdt; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[indt[pjd]];
-        hj = pj->h;
+        struct part *restrict pj = &parts[indt[pjd]];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pj->x[k] - pix[k];
           r2 += dx[k] * dx[k];
         }
@@ -1625,15 +1615,16 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
       firstdt += 1;
 
       /* Loop over the other particles .*/
-      for (pjd = pid + 1; pjd < count; pjd++) {
+      for (int pjd = pid + 1; pjd < count; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[pjd];
-        hj = pj->h;
+        struct part *restrict pj = &parts[pjd];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -1702,10 +1693,10 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount1 > 0)
-    for (k = 0; k < icount1; k++)
+    for (int k = 0; k < icount1; k++)
       IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
   if (icount2 > 0)
-    for (k = 0; k < icount2; k++)
+    for (int k = 0; k < icount2; k++)
       IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
 #endif
 
@@ -1713,7 +1704,7 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
 }
 
 /**
- * @brief Compute grouped sub-cell interactions
+ * @brief Compute grouped sub-cell interactions for pairs
  *
  * @param r The #runner.
  * @param ci The first #cell.
@@ -1724,559 +1715,574 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
  * @todo Hard-code the sid on the recursive calls to avoid the
  * redundant computations to find the sid on-the-fly.
  */
+void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
+                 int gettimer) {
 
-void DOSUB1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
-            int gettimer) {
-
-  int j = 0, k;
-  double shift[3];
-  float h;
   struct space *s = r->e->s;
   const int ti_current = r->e->ti_current;
 
   TIMER_TIC
 
-  /* Is this a single cell? */
-  if (cj == NULL) {
-
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current) return;
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
-    /* Recurse? */
-    if (ci->split) {
+  /* Get the cell dimensions. */
+  const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
 
-      /* Loop over all progeny. */
-      for (k = 0; k < 8; k++)
-        if (ci->progeny[k] != NULL) {
-          DOSUB1(r, ci->progeny[k], NULL, -1, 0);
-          for (j = k + 1; j < 8; j++)
-            if (ci->progeny[j] != NULL)
-              DOSUB1(r, ci->progeny[k], ci->progeny[j], -1, 0);
-        }
+  /* Get the type of pair if not specified explicitly. */
+  // if ( sid < 0 )
+  double shift[3];
+  sid = space_getsid(s, &ci, &cj, shift);
 
-    }
+  /* Recurse? */
+  if (ci->split && cj->split &&
+      fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+          h / 2) {
 
-    /* Otherwise, compute self-interaction. */
-    else
-      DOSELF1(r, ci);
+    /* Different types of flags. */
+    switch (sid) {
 
-  } /* self-interaction. */
+      /* Regular sub-cell interactions of a single cell. */
+      case 0: /* (  1 ,  1 ,  1 ) */
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        break;
 
-  /* Otherwise, it's a pair interaction. */
-  else {
+      case 1: /* (  1 ,  1 ,  0 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        break;
 
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
+      case 2: /* (  1 ,  1 , -1 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        break;
 
-    /* Get the cell dimensions. */
-    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+      case 3: /* (  1 ,  0 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        break;
 
-    /* Get the type of pair if not specified explicitly. */
-    // if ( sid < 0 )
-    sid = space_getsid(s, &ci, &cj, shift);
+      case 4: /* (  1 ,  0 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[0], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[1], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[2], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[3], -1, 0);
+        break;
 
-    /* Recurse? */
-    if (ci->split && cj->split &&
-        fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
-            h / 2) {
+      case 5: /* (  1 ,  0 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        break;
 
-      /* Different types of flags. */
-      switch (sid) {
+      case 6: /* (  1 , -1 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        break;
 
-        /* Regular sub-cell interactions of a single cell. */
-        case 0: /* (  1 ,  1 ,  1 ) */
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          break;
+      case 7: /* (  1 , -1 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        break;
 
-        case 1: /* (  1 ,  1 ,  0 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          break;
+      case 8: /* (  1 , -1 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        break;
 
-        case 2: /* (  1 ,  1 , -1 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          break;
+      case 9: /* (  0 ,  1 ,  1 ) */
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        break;
 
-        case 3: /* (  1 ,  0 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          break;
+      case 10: /* (  0 ,  1 ,  0 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[0], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[4], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[1], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[4], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[5], -1, 0);
+        break;
 
-        case 4: /* (  1 ,  0 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[0], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[1], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[2], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[3], -1, 0);
-          break;
+      case 11: /* (  0 ,  1 , -1 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        break;
 
-        case 5: /* (  1 ,  0 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          break;
+      case 12: /* (  0 ,  0 ,  1 ) */
+        if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[0], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[2], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[4], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[6], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[2], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[6], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[4], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[6], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[6], -1, 0);
+        break;
+    }
 
-        case 6: /* (  1 , -1 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          break;
+  }
 
-        case 7: /* (  1 , -1 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          break;
+  /* Otherwise, compute the pair directly. */
+  else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
 
-        case 8: /* (  1 , -1 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          break;
+    /* Do any of the cells need to be sorted first? */
+    if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
+    if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
 
-        case 9: /* (  0 ,  1 ,  1 ) */
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          break;
+    /* Compute the interactions. */
+    DOPAIR1(r, ci, cj);
+  }
 
-        case 10: /* (  0 ,  1 ,  0 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[0], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[4], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[1], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[4], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[5], -1, 0);
-          break;
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
+}
 
-        case 11: /* (  0 ,  1 , -1 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          break;
+/**
+ * @brief Compute grouped sub-cell interactions for self tasks
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param gettimer Do we have a timer ?
+ */
+void DOSUB_SELF1(struct runner *r, struct cell *ci, int gettimer) {
 
-        case 12: /* (  0 ,  0 ,  1 ) */
-          if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[0], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[2], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[4], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[6], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[2], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[6], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[4], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[6], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[6], -1, 0);
-          break;
-      }
+  const int ti_current = r->e->ti_current;
 
-    }
+  TIMER_TIC
 
-    /* Otherwise, compute the pair directly. */
-    else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current) return;
 
-      /* Do any of the cells need to be sorted first? */
-      if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
-      if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
+  /* Recurse? */
+  if (ci->split) {
 
-      /* Compute the interactions. */
-      DOPAIR1(r, ci, cj);
-    }
+    /* Loop over all progeny. */
+    for (int k = 0; k < 8; k++)
+      if (ci->progeny[k] != NULL) {
+        DOSUB_SELF1(r, ci->progeny[k], 0);
+        for (int j = k + 1; j < 8; j++)
+          if (ci->progeny[j] != NULL)
+            DOSUB_PAIR1(r, ci->progeny[k], ci->progeny[j], -1, 0);
+      }
+  }
 
-  } /* otherwise, pair interaction. */
+  /* Otherwise, compute self-interaction. */
+  else
+    DOSELF1(r, ci);
 
-  if (gettimer) TIMER_TOC(TIMER_DOSUB);
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_SELF);
 }
 
-void DOSUB2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
-            int gettimer) {
+/**
+ * @brief Compute grouped sub-cell interactions for pairs (symmetric case)
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ * @param sid The direction linking the cells
+ * @param gettimer Do we have a timer ?
+ *
+ * @todo Hard-code the sid on the recursive calls to avoid the
+ * redundant computations to find the sid on-the-fly.
+ */
+void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
+                 int gettimer) {
 
-  int j, k;
-  double shift[3];
-  float h;
   struct space *s = r->e->s;
   const int ti_current = r->e->ti_current;
 
   TIMER_TIC
 
-  /* Is this a single cell? */
-  if (cj == NULL) {
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current) return;
+  /* Get the cell dimensions. */
+  const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
 
-    /* Recurse? */
-    if (ci->split) {
+  /* Get the type of pair if not specified explicitly. */
+  // if ( sid < 0 )
+  double shift[3];
+  sid = space_getsid(s, &ci, &cj, shift);
 
-      /* Loop over all progeny. */
-      for (k = 0; k < 8; k++)
-        if (ci->progeny[k] != NULL) {
-          DOSUB2(r, ci->progeny[k], NULL, -1, 0);
-          for (j = k + 1; j < 8; j++)
-            if (ci->progeny[j] != NULL)
-              DOSUB2(r, ci->progeny[k], ci->progeny[j], -1, 0);
-        }
+  /* Recurse? */
+  if (ci->split && cj->split &&
+      fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+          h / 2) {
 
-    }
+    /* Different types of flags. */
+    switch (sid) {
 
-    /* Otherwise, compute self-interaction. */
-    else
-      DOSELF2(r, ci);
+      /* Regular sub-cell interactions of a single cell. */
+      case 0: /* (  1 ,  1 ,  1 ) */
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        break;
 
-  } /* self-interaction. */
+      case 1: /* (  1 ,  1 ,  0 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        break;
 
-  /* Otherwise, it's a pair interaction. */
-  else {
+      case 2: /* (  1 ,  1 , -1 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        break;
 
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
+      case 3: /* (  1 ,  0 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        break;
 
-    /* Get the cell dimensions. */
-    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+      case 4: /* (  1 ,  0 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[0], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[1], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[2], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[3], -1, 0);
+        break;
 
-    /* Get the type of pair if not specified explicitly. */
-    // if ( sid < 0 )
-    sid = space_getsid(s, &ci, &cj, shift);
+      case 5: /* (  1 ,  0 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        break;
 
-    /* Recurse? */
-    if (ci->split && cj->split &&
-        fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
-            h / 2) {
+      case 6: /* (  1 , -1 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        break;
 
-      /* Different types of flags. */
-      switch (sid) {
+      case 7: /* (  1 , -1 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        break;
 
-        /* Regular sub-cell interactions of a single cell. */
-        case 0: /* (  1 ,  1 ,  1 ) */
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          break;
+      case 8: /* (  1 , -1 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        break;
 
-        case 1: /* (  1 ,  1 ,  0 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          break;
+      case 9: /* (  0 ,  1 ,  1 ) */
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        break;
 
-        case 2: /* (  1 ,  1 , -1 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          break;
+      case 10: /* (  0 ,  1 ,  0 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[0], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[4], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[1], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[4], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[5], -1, 0);
+        break;
 
-        case 3: /* (  1 ,  0 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          break;
+      case 11: /* (  0 ,  1 , -1 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        break;
 
-        case 4: /* (  1 ,  0 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[0], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[1], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[2], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[3], -1, 0);
-          break;
+      case 12: /* (  0 ,  0 ,  1 ) */
+        if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[0], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[2], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[4], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[6], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[2], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[6], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[4], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[6], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[6], -1, 0);
+        break;
+    }
 
-        case 5: /* (  1 ,  0 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          break;
+  }
 
-        case 6: /* (  1 , -1 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          break;
+  /* Otherwise, compute the pair directly. */
+  else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
 
-        case 7: /* (  1 , -1 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          break;
+    /* Do any of the cells need to be sorted first? */
+    if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
+    if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
 
-        case 8: /* (  1 , -1 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          break;
+    /* Compute the interactions. */
+    DOPAIR2(r, ci, cj);
+  }
 
-        case 9: /* (  0 ,  1 ,  1 ) */
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          break;
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
+}
 
-        case 10: /* (  0 ,  1 ,  0 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[0], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[4], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[1], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[4], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[5], -1, 0);
-          break;
+/**
+ * @brief Compute grouped sub-cell interactions for self tasks (symmetric case)
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param gettimer Do we have a timer ?
+ */
+void DOSUB_SELF2(struct runner *r, struct cell *ci, int gettimer) {
 
-        case 11: /* (  0 ,  1 , -1 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          break;
+  const int ti_current = r->e->ti_current;
 
-        case 12: /* (  0 ,  0 ,  1 ) */
-          if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[0], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[2], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[4], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[6], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[2], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[6], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[4], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[6], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[6], -1, 0);
-          break;
-      }
+  TIMER_TIC
 
-    }
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current) return;
 
-    /* Otherwise, compute the pair directly. */
-    else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
+  /* Recurse? */
+  if (ci->split) {
 
-      /* Do any of the cells need to be sorted first? */
-      if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
-      if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
+    /* Loop over all progeny. */
+    for (int k = 0; k < 8; k++)
+      if (ci->progeny[k] != NULL) {
+        DOSUB_SELF2(r, ci->progeny[k], 0);
+        for (int j = k + 1; j < 8; j++)
+          if (ci->progeny[j] != NULL)
+            DOSUB_PAIR2(r, ci->progeny[k], ci->progeny[j], -1, 0);
+      }
 
-      /* Compute the interactions. */
-      DOPAIR2(r, ci, cj);
-    }
+  }
 
-  } /* otherwise, pair interaction. */
+  /* Otherwise, compute self-interaction. */
+  else
+    DOSELF2(r, ci);
 
-  if (gettimer) TIMER_TOC(TIMER_DOSUB);
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_SELF);
 }
 
 void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
                   int *ind, int count, struct cell *cj, int sid, int gettimer) {
 
-  int j, k;
-  double shift[3];
-  float h;
   struct space *s = r->e->s;
-  struct cell *sub = NULL;
   const int ti_current = r->e->ti_current;
 
   TIMER_TIC
 
   /* Find out in which sub-cell of ci the parts are. */
-  for (k = 0; k < 8; k++)
+  struct cell *sub = NULL;
+  for (int k = 0; k < 8; k++)
     if (ci->progeny[k] != NULL) {
       // if ( parts[ ind[ 0 ] ].x[0] >= ci->progeny[k]->loc[0] &&
       //      parts[ ind[ 0 ] ].x[0] <= ci->progeny[k]->loc[0] +
@@ -2302,7 +2308,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
 
       /* Loop over all progeny. */
       DOSUB_SUBSET(r, sub, parts, ind, count, NULL, -1, 0);
-      for (j = 0; j < 8; j++)
+      for (int j = 0; j < 8; j++)
         if (ci->progeny[j] != sub && ci->progeny[j] != NULL)
           DOSUB_SUBSET(r, sub, parts, ind, count, ci->progeny[j], -1, 0);
 
@@ -2318,7 +2324,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
   else {
 
     /* Get the cell dimensions. */
-    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+    const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
 
     /* Recurse? */
     if (ci->split && cj->split &&
@@ -2326,6 +2332,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
             h / 2) {
 
       /* Get the type of pair if not specified explicitly. */
+      double shift[3] = {0.0, 0.0, 0.0};
       sid = space_getsid(s, &ci, &cj, shift);
 
       /* Different types of flags. */
@@ -2337,7 +2344,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
             DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0],
                          -1, 0);
           if (ci->progeny[7] != NULL && cj->progeny[0] == sub)
-            DOSUB_SUBSET(r, ci->progeny[0], parts, ind, count, cj->progeny[7],
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7],
                          -1, 0);
           break;
 
@@ -2834,7 +2841,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
     else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
 
       /* Get the relative distance between the pairs, wrapping. */
-      for (k = 0; k < 3; k++) {
+      double shift[3] = {0.0, 0.0, 0.0};
+      for (int k = 0; k < 3; k++) {
         if (cj->loc[k] - ci->loc[k] < -s->dim[k] / 2)
           shift[k] = s->dim[k];
         else if (cj->loc[k] - ci->loc[k] > s->dim[k] / 2)
@@ -2842,7 +2850,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
       }
 
       /* Get the sorting index. */
-      for (sid = 0, k = 0; k < 3; k++)
+      int sid = 0;
+      for (int k = 0; k < 3; k++)
         sid =
             3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
                            ? 0
@@ -2858,5 +2867,5 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
 
   } /* otherwise, pair interaction. */
 
-  if (gettimer) TIMER_TOC(TIMER_DOSUB);
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
 }
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index 75775c24d1d35eaef2cde66a08c4f13561ef0ae4..e3788dfa1123584c913bca6baa6fc2db6698e6d0 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -19,11 +19,6 @@
 #ifndef SWIFT_RUNNER_DOIACT_GRAV_H
 #define SWIFT_RUNNER_DOIACT_GRAV_H
 
-/* Includes. */
-#include "cell.h"
-#include "clocks.h"
-#include "part.h"
-
 /**
  * @brief Compute the sorted gravity interactions between a cell pair.
  *
@@ -60,8 +55,8 @@ void runner_dopair_grav_new(struct runner *r, struct cell *ci,
   sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Make sure the cells are sorted. */
-  runner_do_gsort(r, ci, (1 << sid), 0);
-  runner_do_gsort(r, cj, (1 << sid), 0);
+  // runner_do_gsort(r, ci, (1 << sid), 0);
+  // runner_do_gsort(r, cj, (1 << sid), 0);
 
   /* Have the cells been sorted? */
   if (!(ci->gsorted & (1 << sid)) || !(cj->gsorted & (1 << sid)))
@@ -69,7 +64,7 @@ void runner_dopair_grav_new(struct runner *r, struct cell *ci,
 
   /* Get the cutoff shift. */
   for (rshift = 0.0, k = 0; k < 3; k++)
-    rshift += shift[k] * runner_shift[3 * sid + k];
+    rshift += shift[k] * runner_shift[sid][k];
 
   /* Pick-out the sorted lists. */
   sort_i = &ci->gsort[sid * (ci->count + 1)];
diff --git a/src/scheduler.c b/src/scheduler.c
index a18c2d02c9cddcfb63bc57023888a24195d7d342..c89872f05961ba3e8eabea6a7764728764e47a41 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -44,6 +44,9 @@
 #include "error.h"
 #include "intrinsics.h"
 #include "kernel_hydro.h"
+#include "queue.h"
+#include "space.h"
+#include "task.h"
 #include "timers.h"
 
 /**
@@ -65,8 +68,8 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta,
     struct task **unlocks_new;
     int *unlock_ind_new;
     const int size_unlocks_new = s->size_unlocks * 2;
-    if ((unlocks_new = (struct task **)malloc(
-             sizeof(struct task *) *size_unlocks_new)) == NULL ||
+    if ((unlocks_new = (struct task **)malloc(sizeof(struct task *) *
+                                              size_unlocks_new)) == NULL ||
         (unlock_ind_new = (int *)malloc(sizeof(int) * size_unlocks_new)) ==
             NULL)
       error("Failed to re-allocate unlocks.");
@@ -107,13 +110,11 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements,
                                  void *extra_data) {
 
   /* Static constants. */
-  const static int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0},
-                                {-1, -1, 11, 10, 5, 4, 2, 1},
-                                {-1, -1, -1, 12, 7, 6, 4, 3},
-                                {-1, -1, -1, -1, 8, 7, 5, 4},
-                                {-1, -1, -1, -1, -1, 12, 10, 9},
-                                {-1, -1, -1, -1, -1, -1, 11, 10},
-                                {-1, -1, -1, -1, -1, -1, -1, 12}};
+  const static int pts[7][8] = {
+      {-1, 12, 10, 9, 4, 3, 1, 0},     {-1, -1, 11, 10, 5, 4, 2, 1},
+      {-1, -1, -1, 12, 7, 6, 4, 3},    {-1, -1, -1, -1, 8, 7, 5, 4},
+      {-1, -1, -1, -1, -1, 12, 10, 9}, {-1, -1, -1, -1, -1, -1, 11, 10},
+      {-1, -1, -1, -1, -1, -1, -1, 12}};
   const static float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788,
                                       0.4025, 0.1897, 0.4025, 0.1897, 0.4025,
                                       0.5788, 0.4025, 0.5788};
@@ -161,7 +162,7 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements,
           if (scheduler_dosub && ci->count < space_subsize / ci->count) {
 
             /* convert to a self-subtask. */
-            t->type = task_type_sub;
+            t->type = task_type_sub_self;
 
             /* Otherwise, make tasks explicitly. */
           } else {
@@ -224,7 +225,7 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements,
               sid != 0 && sid != 2 && sid != 6 && sid != 8) {
 
             /* Make this task a sub task. */
-            t->type = task_type_sub;
+            t->type = task_type_sub_pair;
 
             /* Otherwise, split it. */
           } else {
@@ -791,7 +792,8 @@ void scheduler_ranktasks(struct scheduler *s) {
   const int nr_tasks = s->nr_tasks;
 
   /* Run through the tasks and get all the waits right. */
-  /* threadpool_map(s->threadpool, scheduler_simple_rewait_mapper, tasks, nr_tasks,
+  /* threadpool_map(s->threadpool, scheduler_simple_rewait_mapper, tasks,
+     nr_tasks,
                  sizeof(struct task), 1000, NULL); */
   for (int i = 0; i < nr_tasks; i++) {
     struct task *t = &tasks[i];
@@ -837,10 +839,12 @@ void scheduler_ranktasks(struct scheduler *s) {
     j = left_old;
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the tasks were ranked correctly. */
-  /* for (int k = 1; k < s->nr_tasks; k++)
+  for (int k = 1; k < s->nr_tasks; k++)
     if (tasks[tid[k - 1]].rank > tasks[tid[k - 1]].rank)
-      error("Task ranking failed."); */
+      error("Task ranking failed.");
+#endif
 }
 
 /**
@@ -860,7 +864,8 @@ void scheduler_reset(struct scheduler *s, int size) {
     if (s->tasks_ind != NULL) free(s->tasks_ind);
 
     /* Allocate the new lists. */
-    if ((s->tasks = (struct task *)malloc(sizeof(struct task) *size)) == NULL ||
+    if ((s->tasks = (struct task *)malloc(sizeof(struct task) * size)) ==
+            NULL ||
         (s->tasks_ind = (int *)malloc(sizeof(int) * size)) == NULL)
       error("Failed to allocate task lists.");
   }
@@ -925,23 +930,23 @@ void scheduler_reweight(struct scheduler *s) {
             t->weight +=
                 2 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags];
           break;
-        case task_type_sub:
-          if (t->cj != NULL) {
-            if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) {
-              if (t->flags < 0)
-                t->weight += 3 * wscale * t->ci->count * t->cj->count;
-              else
-                t->weight += 3 * wscale * t->ci->count * t->cj->count *
-                             sid_scale[t->flags];
-            } else {
-              if (t->flags < 0)
-                t->weight += 2 * wscale * t->ci->count * t->cj->count;
-              else
-                t->weight += 2 * wscale * t->ci->count * t->cj->count *
-                             sid_scale[t->flags];
-            }
-          } else
-            t->weight += 1 * wscale * t->ci->count * t->ci->count;
+        case task_type_sub_pair:
+          if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) {
+            if (t->flags < 0)
+              t->weight += 3 * wscale * t->ci->count * t->cj->count;
+            else
+              t->weight += 3 * wscale * t->ci->count * t->cj->count *
+                           sid_scale[t->flags];
+          } else {
+            if (t->flags < 0)
+              t->weight += 2 * wscale * t->ci->count * t->cj->count;
+            else
+              t->weight += 2 * wscale * t->ci->count * t->cj->count *
+                           sid_scale[t->flags];
+          }
+          break;
+        case task_type_sub_self:
+          t->weight += 1 * wscale * t->ci->count * t->ci->count;
           break;
         case task_type_ghost:
           if (t->ci == t->ci->super) t->weight += wscale * t->ci->count;
@@ -1097,6 +1102,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
        any pre-processing needed. */
     switch (t->type) {
       case task_type_self:
+      case task_type_sub_self:
       case task_type_sort:
       case task_type_ghost:
       case task_type_kick:
@@ -1105,11 +1111,10 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         qid = t->ci->super->owner;
         break;
       case task_type_pair:
-      case task_type_sub:
+      case task_type_sub_pair:
         qid = t->ci->super->owner;
-        if (t->cj != NULL &&
-            (qid < 0 ||
-             s->queues[qid].count > s->queues[t->cj->super->owner].count))
+        if (qid < 0 ||
+            s->queues[qid].count > s->queues[t->cj->super->owner].count)
           qid = t->cj->super->owner;
         break;
       case task_type_recv:
@@ -1269,7 +1274,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
          tries++) {
 
       /* Try to get a task from the suggested queue. */
-      if (s->queues[qid].count > 0) {
+      if (s->queues[qid].count > 0 || s->queues[qid].count_incoming > 0) {
         TIMER_TIC
         res = queue_gettask(&s->queues[qid], prev, 0);
         TIMER_TOC(timer_qget);
@@ -1280,7 +1285,9 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
       if (s->flags & scheduler_flag_steal) {
         int count = 0, qids[nr_queues];
         for (int k = 0; k < nr_queues; k++)
-          if (s->queues[k].count > 0) qids[count++] = k;
+          if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) {
+            qids[count++] = k;
+          }
         for (int k = 0; k < scheduler_maxsteal && count > 0; k++) {
           const int ind = rand_r(&seed) % count;
           TIMER_TIC
@@ -1353,7 +1360,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
 
   /* Init the unlocks. */
   if ((s->unlocks = (struct task **)malloc(
-           sizeof(struct task *) *scheduler_init_nr_unlocks)) == NULL ||
+           sizeof(struct task *) * scheduler_init_nr_unlocks)) == NULL ||
       (s->unlock_ind =
            (int *)malloc(sizeof(int) * scheduler_init_nr_unlocks)) == NULL)
     error("Failed to allocate unlocks.");
diff --git a/src/scheduler.h b/src/scheduler.h
index 7528e13c2f7901ba9979a3ef7567bef1e23d8ac4..37ddb704ac8adc19f870916103b691715c820c4e 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -35,7 +35,6 @@
 #include "cell.h"
 #include "lock.h"
 #include "queue.h"
-#include "space.h"
 #include "task.h"
 #include "threadpool.h"
 
diff --git a/src/serial_io.c b/src/serial_io.c
index 5abd3ebc28672d68c4135efe5753dc4713c2d3c6..7e78276dc83430655b4ea4de2fb7425e71e07966 100644
--- a/src/serial_io.c
+++ b/src/serial_io.c
@@ -26,22 +26,22 @@
 /* Some standard headers. */
 #include <hdf5.h>
 #include <math.h>
+#include <mpi.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-/* MPI headers. */
-#ifdef WITH_MPI
-#include <mpi.h>
-#endif
-
 /* This object's header. */
 #include "serial_io.h"
 
 /* Local includes. */
 #include "common_io.h"
+#include "engine.h"
 #include "error.h"
+#include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 
 /*-----------------------------------------------------------------------------
  * Routines reading an IC file
diff --git a/src/serial_io.h b/src/serial_io.h
index b7ed6eb62d823829a473f828696c291e552effa3..6b64624772214494a43316d3a8aa3910c7238dc8 100644
--- a/src/serial_io.h
+++ b/src/serial_io.h
@@ -19,6 +19,9 @@
 #ifndef SWIFT_SERIAL_IO_H
 #define SWIFT_SERIAL_IO_H
 
+/* Config parameters. */
+#include "../config.h"
+
 /* MPI headers. */
 #ifdef WITH_MPI
 #include <mpi.h>
diff --git a/src/single_io.c b/src/single_io.c
index fb3bf4368feed9892b098228d85c99f0bc3e724b..3f65aae0b5d495670f2b4862e466ec849f997d63 100644
--- a/src/single_io.c
+++ b/src/single_io.c
@@ -36,8 +36,11 @@
 
 /* Local includes. */
 #include "common_io.h"
-#include "const.h"
+#include "engine.h"
 #include "error.h"
+#include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 
 /*-----------------------------------------------------------------------------
  * Routines reading an IC file
diff --git a/src/single_io.h b/src/single_io.h
index adfc5b43941b2c0d69d9ce0924164aff56864a23..d2c87655e1c91b92e8ccf2aa50d2e0bf98f13482 100644
--- a/src/single_io.h
+++ b/src/single_io.h
@@ -19,6 +19,9 @@
 #ifndef SWIFT_SINGLE_IO_H
 #define SWIFT_SINGLE_IO_H
 
+/* Config parameters. */
+#include "../config.h"
+
 /* Includes. */
 #include "engine.h"
 #include "part.h"
diff --git a/src/space.c b/src/space.c
index 4f6a3362534b1454b5864bfde25a2a497ce72697..074a5740a762df51d02ff6a37557a8be10c23e29 100644
--- a/src/space.c
+++ b/src/space.c
@@ -429,6 +429,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
 // clocks_from_ticks(getticks() - tic), clocks_getunit());
 
 #ifdef WITH_MPI
+
   /* Move non-local parts to the end of the list. */
   const int local_nodeID = s->e->nodeID;
   for (size_t k = 0; k < nr_parts;) {
@@ -456,8 +457,9 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     }
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Check that all parts are in the correct places. */
-  /*  for (size_t k = 0; k < nr_parts; k++) {
+  for (size_t k = 0; k < nr_parts; k++) {
     if (cells[ind[k]].nodeID != local_nodeID) {
       error("Failed to move all non-local parts to send list");
     }
@@ -466,7 +468,8 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     if (cells[ind[k]].nodeID == local_nodeID) {
       error("Failed to remove local parts from send list");
     }
-  }*/
+  }
+#endif
 
   /* Move non-local gparts to the end of the list. */
   for (int k = 0; k < nr_gparts;) {
@@ -491,8 +494,8 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     }
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Check that all gparts are in the correct place (untested). */
-  /*
   for (size_t k = 0; k < nr_gparts; k++) {
     if (cells[gind[k]].nodeID != local_nodeID) {
       error("Failed to move all non-local gparts to send list");
@@ -502,7 +505,8 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     if (cells[gind[k]].nodeID == local_nodeID) {
       error("Failed to remove local gparts from send list");
     }
-  }*/
+  }
+#endif
 
   /* Exchange the strays, note that this potentially re-allocates
      the parts arrays. */
@@ -531,12 +535,15 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     ind[k] =
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
     cells[ind[k]].count += 1;
-    /* if ( cells[ ind[k] ].nodeID != nodeID )
-        error( "Received part that does not belong to me (nodeID=%i)." , cells[
-       ind[k] ].nodeID ); */
+#ifdef SWIFT_DEBUG_CHECKS
+    if (cells[ind[k]].nodeID != local_nodeID)
+      error("Received part that does not belong to me (nodeID=%i).",
+            cells[ind[k]].nodeID);
+#endif
   }
   nr_parts = s->nr_parts;
-#endif
+
+#endif /* WITH_MPI */
 
   /* Sort the parts according to their cells. */
   space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose);
@@ -545,15 +552,18 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   for (size_t k = 0; k < nr_parts; k++)
     if (s->parts[k].gpart != NULL) s->parts[k].gpart->part = &s->parts[k];
 
-  /* Verify sort_struct. */
-  /* for ( k = 1 ; k < nr_parts ; k++ ) {
-      if ( ind[k-1] > ind[k] ) {
-          error( "Sort failed!" );
-          }
-      else if ( ind[k] != cell_getid( cdim , parts[k].x[0]*ih[0] ,
-     parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) )
-          error( "Incorrect indices!" );
-      } */
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Verify space_sort_struct. */
+  for (size_t k = 1; k < nr_parts; k++) {
+    if (ind[k - 1] > ind[k]) {
+      error("Sort failed!");
+    } else if (ind[k] != cell_getid(cdim, s->parts[k].x[0] * ih[0],
+                                    s->parts[k].x[1] * ih[1],
+                                    s->parts[k].x[2] * ih[2])) {
+      error("Incorrect indices!");
+    }
+  }
+#endif
 
   /* We no longer need the indices as of here. */
   free(ind);
@@ -594,8 +604,8 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   /* We no longer need the indices as of here. */
   free(gind);
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the links are correct */
-  /* MATTHIEU: To be commented out once we are happy */
   for (size_t k = 0; k < nr_gparts; ++k) {
 
     if (s->gparts[k].id > 0) {
@@ -615,6 +625,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
       if (s->parts[k].gpart->part != &s->parts[k]) error("Linking problem !");
     }
   }
+#endif
 
   /* Hook the cells up to the parts. */
   // tic = getticks();
@@ -705,13 +716,14 @@ void space_parts_sort(struct space *s, int *ind, size_t N, int min, int max,
   threadpool_map(&s->e->threadpool, space_parts_sort_mapper, &sort_struct,
                  s->e->threadpool.num_threads, 0, 1, NULL);
 
-  /* Verify sort_struct. */
-  /* for (int i = 1; i < N; i++)
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Verify space_sort_struct. */
+  for (int i = 1; i < N; i++)
     if (ind[i - 1] > ind[i])
       error("Sorting failed (ind[%i]=%i,ind[%i]=%i), min=%i, max=%i.", i - 1,
-  ind[i - 1], i,
-            ind[i], min, max);
-  message("Sorting succeeded."); */
+            ind[i - 1], i, ind[i], min, max);
+  message("Sorting succeeded.");
+#endif
 
   /* Clean up. */
   free(sort_struct.stack);
@@ -721,7 +733,8 @@ void space_parts_sort(struct space *s, int *ind, size_t N, int min, int max,
             clocks_getunit());
 }
 
-void space_parts_sort_mapper(void *map_data, int num_elements, void *extra_data) {
+void space_parts_sort_mapper(void *map_data, int num_elements,
+                             void *extra_data) {
 
   /* Unpack the mapping data. */
   struct parallel_sort *sort_struct = (struct parallel_sort *)map_data;
@@ -775,19 +788,21 @@ void space_parts_sort_mapper(void *map_data, int num_elements, void *extra_data)
         }
       }
 
-      /* Verify sort_struct. */
-      /* for (int k = i; k <= jj; k++)
+#ifdef SWIFT_DEBUG_CHECKS
+      /* Verify space_sort_struct. */
+      for (int k = i; k <= jj; k++)
         if (ind[k] > pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (<=pivot).");
         }
       for (int k = jj + 1; k <= j; k++)
         if (ind[k] <= pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (>pivot).");
-        } */
+        }
+#endif
 
       /* Split-off largest interval. */
       if (jj - i > j - jj + 1) {
@@ -886,13 +901,14 @@ void space_gparts_sort(struct space *s, int *ind, size_t N, int min, int max,
   threadpool_map(&s->e->threadpool, space_gparts_sort_mapper, &sort_struct,
                  s->e->threadpool.num_threads, 0, 1, NULL);
 
-  /* Verify sort_struct. */
-  /* for (int i = 1; i < N; i++)
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Verify space_sort_struct. */
+  for (int i = 1; i < N; i++)
     if (ind[i - 1] > ind[i])
       error("Sorting failed (ind[%i]=%i,ind[%i]=%i), min=%i, max=%i.", i - 1,
-  ind[i - 1], i,
-            ind[i], min, max);
-  message("Sorting succeeded."); */
+            ind[i - 1], i, ind[i], min, max);
+  message("Sorting succeeded.");
+#endif
 
   /* Clean up. */
   free(sort_struct.stack);
@@ -902,7 +918,8 @@ void space_gparts_sort(struct space *s, int *ind, size_t N, int min, int max,
             clocks_getunit());
 }
 
-void space_gparts_sort_mapper(void *map_data, int num_elements, void *extra_data) {
+void space_gparts_sort_mapper(void *map_data, int num_elements,
+                              void *extra_data) {
 
   /* Unpack the mapping data. */
   struct parallel_sort *sort_struct = (struct parallel_sort *)map_data;
@@ -952,19 +969,21 @@ void space_gparts_sort_mapper(void *map_data, int num_elements, void *extra_data
         }
       }
 
-      /* Verify sort_struct. */
-      /* for (int k = i; k <= jj; k++)
+#ifdef SWIFT_DEBUG_CHECKS
+      /* Verify space_sort_struct. */
+      for (int k = i; k <= jj; k++)
         if (ind[k] > pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (<=pivot).");
         }
       for (int k = jj + 1; k <= j; k++)
         if (ind[k] <= pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (>pivot).");
-        } */
+        }
+#endif
 
       /* Split-off largest interval. */
       if (jj - i > j - jj + 1) {
diff --git a/src/space.h b/src/space.h
index d6802db02fccc398f15ad5637d24c5ab7140d147..93841acee2c25583c1be383df841abdce304424f 100644
--- a/src/space.h
+++ b/src/space.h
@@ -23,16 +23,18 @@
 #ifndef SWIFT_SPACE_H
 #define SWIFT_SPACE_H
 
-/* Includes. */
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <stddef.h>
 
-/* Local includes. */
+/* Includes. */
 #include "cell.h"
+#include "lock.h"
 #include "parser.h"
 #include "part.h"
-
-/* Forward-declare the engine to avoid cyclic includes. */
-struct engine;
+#include "space.h"
 
 /* Some constants. */
 #define space_maxdepth 10
@@ -150,8 +152,10 @@ void space_map_parts_xparts(struct space *s,
                                         struct cell *c));
 void space_map_cells_post(struct space *s, int full,
                           void (*fun)(struct cell *c, void *data), void *data);
-void space_parts_sort_mapper(void *map_data, int num_elements, void *extra_data);
-void space_gparts_sort_mapper(void *map_data, int num_elements, void *extra_data);
+void space_parts_sort_mapper(void *map_data, int num_elements,
+                             void *extra_data);
+void space_gparts_sort_mapper(void *map_data, int num_elements,
+                              void *extra_data);
 void space_rebuild(struct space *s, double h_max, int verbose);
 void space_recycle(struct space *s, struct cell *c);
 void space_split(struct space *s, struct cell *cells, int verbose);
@@ -159,4 +163,5 @@ void space_split_mapper(void *map_data, int num_elements, void *extra_data);
 void space_do_parts_sort();
 void space_do_gparts_sort();
 void space_link_cleanup(struct space *s);
+
 #endif /* SWIFT_SPACE_H */
diff --git a/src/task.c b/src/task.c
index b93dfc78adeba446d0828202b92c5477a474611f..c11c34da89a085b04cf4a2160d5fc9c92ef80886 100644
--- a/src/task.c
+++ b/src/task.c
@@ -52,8 +52,8 @@ const char *taskID_names[task_type_count] = {
     "send",      "recv",          "grav_pp",  "grav_mm", "grav_up",
     "grav_down", "grav_external", "comm_root"};
 
-const char *subtaskID_names[task_type_count] = {"none",  "density",
-                                                "force", "grav"};
+const char *subtaskID_names[task_type_count] = {"none", "density", "force",
+                                                "grav"};
 
 /**
  * @brief Computes the overlap between the parts array of two given cells.
@@ -112,13 +112,14 @@ void task_unlock(struct task *t) {
   /* Act based on task type. */
   switch (t->type) {
     case task_type_self:
+    case task_type_sub_self:
     case task_type_sort:
       cell_unlocktree(t->ci);
       break;
     case task_type_pair:
-    case task_type_sub:
+    case task_type_sub_pair:
       cell_unlocktree(t->ci);
-      if (t->cj != NULL) cell_unlocktree(t->cj);
+      cell_unlocktree(t->cj);
       break;
     case task_type_grav_pp:
     case task_type_grav_mm:
@@ -139,55 +140,68 @@ void task_unlock(struct task *t) {
 
 int task_lock(struct task *t) {
 
-  int type = t->type;
+  const int type = t->type;
+  const int subtype = t->subtype;
   struct cell *ci = t->ci, *cj = t->cj;
+#ifdef WITH_MPI
+  int res = 0, err = 0;
+  MPI_Status stat;
+#endif
 
-  /* Communication task? */
-  if (type == task_type_recv || type == task_type_send) {
+  switch (type) {
 
+    /* Communication task? */
+    case task_type_recv:
+    case task_type_send:
 #ifdef WITH_MPI
-    /* Check the status of the MPI request. */
-    int res = 0, err = 0;
-    MPI_Status stat;
-    if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) {
-      char buff[MPI_MAX_ERROR_STRING];
-      int len;
-      MPI_Error_string(err, buff, &len);
-      error("Failed to test request on send/recv task (tag=%i, %s).", t->flags,
-            buff);
-    }
-    return res;
+      /* Check the status of the MPI request. */
+      if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) {
+        char buff[MPI_MAX_ERROR_STRING];
+        int len;
+        MPI_Error_string(err, buff, &len);
+        error("Failed to test request on send/recv task (tag=%i, %s).",
+              t->flags, buff);
+      }
+      return res;
 #else
-    error("SWIFT was not compiled with MPI support.");
+      error("SWIFT was not compiled with MPI support.");
 #endif
+      break;
 
-  }
+    case task_type_sort:
+      if (cell_locktree(ci) != 0) return 0;
+      break;
 
-  /* Unary lock? */
-  else if (type == task_type_self || type == task_type_sort ||
-           (type == task_type_sub && cj == NULL)) {
-    if (cell_locktree(ci) != 0) return 0;
-  }
+    case task_type_self:
+    case task_type_sub_self:
+      if (subtype == task_subtype_grav) {
+        if (cell_glocktree(ci) != 0) return 0;
+      } else {
+        if (cell_locktree(ci) != 0) return 0;
+      }
+      break;
 
-  /* Otherwise, binary lock. */
-  else if (type == task_type_pair || (type == task_type_sub && cj != NULL)) {
-    if (ci->hold || cj->hold) return 0;
-    if (cell_locktree(ci) != 0) return 0;
-    if (cell_locktree(cj) != 0) {
-      cell_unlocktree(ci);
-      return 0;
-    }
-  }
+    case task_type_pair:
+    case task_type_sub_pair:
+      if (subtype == task_subtype_grav) {
+        if (ci->ghold || cj->ghold) return 0;
+        if (cell_glocktree(ci) != 0) return 0;
+        if (cell_glocktree(cj) != 0) {
+          cell_gunlocktree(ci);
+          return 0;
+        }
+      } else {
+        if (ci->hold || cj->hold) return 0;
+        if (cell_locktree(ci) != 0) return 0;
+        if (cell_locktree(cj) != 0) {
+          cell_unlocktree(ci);
+          return 0;
+        }
+      }
+      break;
 
-  /* Gravity tasks? */
-  else if (type == task_type_grav_mm || type == task_type_grav_pp ||
-           type == task_type_grav_down) {
-    if (ci->ghold || (cj != NULL && cj->ghold)) return 0;
-    if (cell_glocktree(ci) != 0) return 0;
-    if (cj != NULL && cell_glocktree(cj) != 0) {
-      cell_gunlocktree(ci);
-      return 0;
-    }
+    default:
+      break;
   }
 
   /* If we made it this far, we've got a lock. */
@@ -265,48 +279,6 @@ void task_rmunlock_blind(struct task *ta, struct task *tb) {
   lock_unlock_blind(&ta->lock);
 }
 
-/**
- * @brief Add an unlock_task to the given task.
- *
- * @param ta The unlocking #task.
- * @param tb The #task that will be unlocked.
- */
-
-void task_addunlock(struct task *ta, struct task *tb) {
-
-  error("Use sched_addunlock instead.");
-
-  /* Add the lock atomically. */
-  ta->unlock_tasks[atomic_inc(&ta->nr_unlock_tasks)] = tb;
-
-  /* Check a posteriori if we did not overshoot. */
-  if (ta->nr_unlock_tasks > task_maxunlock)
-    error("Too many unlock_tasks in task.");
-}
-
-void task_addunlock_old(struct task *ta, struct task *tb) {
-
-  int k;
-
-  lock_lock(&ta->lock);
-
-  /* Check if ta already unlocks tb. */
-  for (k = 0; k < ta->nr_unlock_tasks; k++)
-    if (ta->unlock_tasks[k] == tb) {
-      error("Duplicate unlock.");
-      lock_unlock_blind(&ta->lock);
-      return;
-    }
-
-  if (ta->nr_unlock_tasks == task_maxunlock)
-    error("Too many unlock_tasks in task.");
-
-  ta->unlock_tasks[ta->nr_unlock_tasks] = tb;
-  ta->nr_unlock_tasks += 1;
-
-  lock_unlock_blind(&ta->lock);
-}
-
 /**
  * @brief Prints the list of tasks contained in a given mask
  *
diff --git a/src/task.h b/src/task.h
index a949e02fce1e9e0fa6b86f27e1c192a45b9073f3..adef90eaac4d41b4d52d0941ece72f9c02219ba4 100644
--- a/src/task.h
+++ b/src/task.h
@@ -37,7 +37,8 @@ enum task_types {
   task_type_sort,
   task_type_self,
   task_type_pair,
-  task_type_sub,
+  task_type_sub_self,
+  task_type_sub_pair,
   task_type_init,
   task_type_ghost,
   task_type_drift,
@@ -94,7 +95,6 @@ struct task {
 void task_rmunlock(struct task *ta, struct task *tb);
 void task_rmunlock_blind(struct task *ta, struct task *tb);
 void task_cleanunlock(struct task *t, int type);
-void task_addunlock(struct task *ta, struct task *tb);
 void task_unlock(struct task *t);
 float task_overlap(const struct task *ta, const struct task *tb);
 int task_lock(struct task *t);
diff --git a/src/threadpool.c b/src/threadpool.c
index 8605e04bac08bdf61295446bd5074de0182a573b..c57af9a519697184b12877743b0ff9fcf93fe3ec 100644
--- a/src/threadpool.c
+++ b/src/threadpool.c
@@ -24,8 +24,8 @@
 #include <float.h>
 #include <limits.h>
 #include <math.h>
-#include <string.h>
 #include <stdlib.h>
+#include <string.h>
 
 /* This object's header. */
 #include "threadpool.h"
@@ -147,7 +147,7 @@ void threadpool_map(struct threadpool *tp, threadpool_map_function map_function,
   while (tp->num_threads_running < tp->num_threads) {
     pthread_cond_wait(&tp->control_cond, &tp->thread_mutex);
   }
-  
+
   /* Wait for all threads to be done. */
   while (tp->num_threads_waiting < tp->num_threads) {
     pthread_cond_wait(&tp->control_cond, &tp->thread_mutex);
diff --git a/src/threadpool.h b/src/threadpool.h
index 66dc6752d29a654dbaacfd1d0267f1cedf4dd981..e07a0613c0595bc3c280f97cd8f28e5705bf313a 100644
--- a/src/threadpool.h
+++ b/src/threadpool.h
@@ -55,6 +55,7 @@ struct threadpool {
 /* Function prototypes. */
 void threadpool_init(struct threadpool *tp, int num_threads);
 void threadpool_map(struct threadpool *tp, threadpool_map_function map_function,
-                    void *map_data, size_t N, int stride, int chunk, void *extra_data);
+                    void *map_data, size_t N, int stride, int chunk,
+                    void *extra_data);
 
 #endif /* SWIFT_THREADPOOL_H */
diff --git a/src/timers.h b/src/timers.h
index 92b685ebe9b11d49c4703e5837d35cffdca81c4d..c48961b39737f23eb936d7283f76651d33892991 100644
--- a/src/timers.h
+++ b/src/timers.h
@@ -23,6 +23,7 @@
 #define SWIFT_TIMERS_H
 
 /* Includes. */
+#include "atomic.h"
 #include "cycle.h"
 #include "inline.h"
 
@@ -41,9 +42,12 @@ enum {
   timer_dopair_force,
   timer_dopair_grav,
   timer_dograv_external,
-  timer_dosub_density,
-  timer_dosub_force,
-  timer_dosub_grav,
+  timer_dosub_self_density,
+  timer_dosub_self_force,
+  timer_dosub_self_grav,
+  timer_dosub_pair_density,
+  timer_dosub_pair_force,
+  timer_dosub_pair_grav,
   timer_dopair_subset,
   timer_do_ghost,
   timer_dorecv_cell,
@@ -71,7 +75,7 @@ extern ticks timers[timer_count];
 #define TIMER_TOC2(t) timers_toc(t, tic2)
 INLINE static ticks timers_toc(int t, ticks tic) {
   ticks d = (getticks() - tic);
-  __sync_add_and_fetch(&timers[t], d);
+  atomic_add(&timers[t], d);
   return d;
 }
 #else
diff --git a/src/tools.c b/src/tools.c
index 0363100331ad5b298279e9ebadcf87eab5f9e896..1a2b794f688047183827e5c2ed6ba80ba1339080 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -19,16 +19,25 @@
  *
  ******************************************************************************/
 
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <math.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 
+/* This object's header. */
+#include "tools.h"
+
+/* Local includes. */
 #include "cell.h"
 #include "error.h"
+#include "gravity.h"
+#include "hydro.h"
 #include "part.h"
-#include "swift.h"
-#include "tools.h"
+#include "runner.h"
 
 /**
  *  Factorize a given integer, attempts to keep larger pair of factors.
@@ -56,9 +65,7 @@ void factor(int value, int *f1, int *f2) {
  * @param N The number of parts.
  * @param periodic Periodic boundary conditions flag.
  */
-
-void pairs_n2(double *dim, struct part *__restrict__ parts, int N,
-              int periodic) {
+void pairs_n2(double *dim, struct part *restrict parts, int N, int periodic) {
   int i, j, k, count = 0;
   // int mj, mk;
   // double maxratio = 1.0;
@@ -121,8 +128,7 @@ void pairs_n2(double *dim, struct part *__restrict__ parts, int N,
 }
 
 void pairs_single_density(double *dim, long long int pid,
-                          struct part *__restrict__ parts, int N,
-                          int periodic) {
+                          struct part *restrict parts, int N, int periodic) {
   int i, k;
   // int mj, mk;
   // double maxratio = 1.0;
@@ -271,7 +277,7 @@ void self_all_density(struct runner *r, struct cell *ci) {
 }
 
 void pairs_single_grav(double *dim, long long int pid,
-                       struct gpart *__restrict__ parts, int N, int periodic) {
+                       struct gpart *restrict parts, int N, int periodic) {
   int i, k;
   // int mj, mk;
   // double maxratio = 1.0;
@@ -328,7 +334,6 @@ void pairs_single_grav(double *dim, long long int pid,
  *
  * @param N number of intervals in [0,1].
  */
-
 void density_dump(int N) {
   int k;
   float r2[4] = {0.0f, 0.0f, 0.0f, 0.0f}, hi[4], hj[4];
@@ -363,10 +368,8 @@ void density_dump(int N) {
 /**
  * @brief Compute the force on a single particle brute-force.
  */
-
 void engine_single_density(double *dim, long long int pid,
-                           struct part *__restrict__ parts, int N,
-                           int periodic) {
+                           struct part *restrict parts, int N, int periodic) {
   int i, k;
   double r2, dx[3];
   float fdx[3], ih;
@@ -412,7 +415,7 @@ void engine_single_density(double *dim, long long int pid,
 }
 
 void engine_single_force(double *dim, long long int pid,
-                         struct part *__restrict__ parts, int N, int periodic) {
+                         struct part *restrict parts, int N, int periodic) {
   int i, k;
   double r2, dx[3];
   float fdx[3];
diff --git a/src/tools.h b/src/tools.h
index 5f9f41d033ab03983bde3bb37e87f8a39d2deecd..2a4462f274d8e9acd1f2d8e79996ad92c630d404 100644
--- a/src/tools.h
+++ b/src/tools.h
@@ -22,21 +22,25 @@
 #ifndef SWIFT_TOOL_H
 #define SWIFT_TOOL_H
 
+/* Config parameters. */
+#include "../config.h"
+
+/* Includes. */
 #include "cell.h"
+#include "part.h"
 #include "runner.h"
 
 void factor(int value, int *f1, int *f2);
 void density_dump(int N);
 void pairs_single_grav(double *dim, long long int pid,
-                       struct gpart *__restrict__ parts, int N, int periodic);
+                       struct gpart *restrict parts, int N, int periodic);
 void pairs_single_density(double *dim, long long int pid,
-                          struct part *__restrict__ parts, int N, int periodic);
+                          struct part *restrict parts, int N, int periodic);
 
 void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj);
 void self_all_density(struct runner *r, struct cell *ci);
 
-void pairs_n2(double *dim, struct part *__restrict__ parts, int N,
-              int periodic);
+void pairs_n2(double *dim, struct part *restrict parts, int N, int periodic);
 
 double random_uniform(double a, double b);
 void shuffle_particles(struct part *parts, const int count);
diff --git a/tests/threadpool_test.c b/tests/threadpool_test.c
index 5207881c27b51ed4090c2c8c015f3b4f97e422fb..90ec58e10b16d816b52b785c87e6604ad11f61d8 100644
--- a/tests/threadpool_test.c
+++ b/tests/threadpool_test.c
@@ -23,8 +23,8 @@
 #include <unistd.h>
 
 // Local includes.
-#include "../src/threadpool.h"
 #include "../src/atomic.h"
+#include "../src/threadpool.h"
 
 void map_function_first(void *map_data, int num_elements, void *extra_data) {
   const int *inputs = (int *)map_data;