diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..542c97e89adce40066f0b9a96cc8be9746640d65
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+KeepEmptyLinesAtTheStartOfBlocks: true
+...
diff --git a/.gitignore b/.gitignore
index 5a986acbd59a818b151540fb9303eadb4f926f77..9a56843112c8214fc4dbce8efdf3fc23aa7e5919 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,19 +11,25 @@ config.sub
 ltmain.sh
 libtool
 
-src/version.h
+src/version_string.h
 swift*.tar.gz
+
 doc/doxyfile.stamp
 doc/html/
 doc/latex/
 doc/man/
 doc/Doxyfile
+
 examples/swift
+examples/swift_mpi
 examples/swift_fixdt
 examples/swift_fixdt_mpi
-examples/swift_mindt
-examples/swift_mindt_mpi
-examples/swift_mpi
+examples/*.xmf
+examples/used_parameters.yml
+examples/energy.txt
+examples/*/*.xmf
+examples/*/used_parameters.yml
+examples/*/energy.txt
 
 tests/testPair
 tests/brute_force_standard.dat
@@ -49,6 +55,7 @@ theory/latex/swift.pdf
 theory/kernel/kernels.pdf
 theory/kernel/kernel_derivatives.pdf
 theory/kernel/kernel_definitions.pdf
+theory/paper_pasc/pasc_paper.pdf
 
 m4/libtool.m4
 m4/ltoptions.m4
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..883a63c034401ec1fceb477fe33c8342f74a87c1
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,4 @@
+The SWIFT source code uses a variation of the 'Google' formatting style.
+The script 'format.sh' in the root directory applies the clang-format-3.8
+tool with our style choices to all the SWIFT C source files. Please apply
+the formatting script to the files before submitting a merge request.
\ No newline at end of file
diff --git a/INSTALL.swift b/INSTALL.swift
index c18142c1a62c7c8a011ead2ee8d0d869c9e072ec..bd49dfffd641a3ad59ec25284b14685afb9f7b24 100644
--- a/INSTALL.swift
+++ b/INSTALL.swift
@@ -30,7 +30,7 @@ or:
    ./configure CC=icc
 
 to use an Intel compiler. The main "programs" can be found in the "examples/"
-directory.
+directory. See README for run parameters.
 
 SWIFT has been successfully built and tested with the following compilers:
 
@@ -88,6 +88,9 @@ Before running configure the "mpirun" command should be available in the
 shell. If your command isn't called "mpirun" then define the "MPIRUN"
 environment variable, either in the shell or when running configure.
 
+The MPI compiler can be controlled using the MPICC variable, much like
+the CC one. Use this when your MPI compiler has a non-standard name.
+
 
 METIS: a build of the METIS library can be optionally used to optimize the
 load between MPI nodes (requires an MPI library). This should be found in
@@ -105,3 +108,14 @@ among the different cores on each computing node.
 
 DOXYGEN: the doxygen library is required to create the SWIFT API
 documentation.
+
+
+
+                             SWIFT Coding style
+                             ==================
+
+The SWIFT source code uses a variation of the 'Google' style. The
+script 'format.sh' in the root directory applies the clang-format-3.8
+tool with our style choices to all the SWIFT C source files. Please
+apply the formatting script to the files before submitting a merge
+request.
diff --git a/Makefile.am b/Makefile.am
index 9d4f6371b7aa37a1750c5fb8dbed17f9ff48442e..fb4eb5f6d6b63a7d0e034e0a3202ac61066e6e25 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,7 +1,6 @@
-
 # This file is part of SWIFT.
-# Copyright (c) 2012 pedro.gonnet@durham.ac.uk.
-#               2015 matthieu.schaller@durham.ac.uk.
+# Copyright (c) 2012 pedro.gonnet@durham.ac.uk
+#               2015 matthieu.schaller@durham.ac.uk
 # 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -23,4 +22,4 @@ ACLOCAL_AMFLAGS = -I m4
 SUBDIRS = src examples doc tests
 
 # Non-standard files that should be part of the distribution.
-EXTRA_DIST = INSTALL.swift
+EXTRA_DIST = INSTALL.swift .clang-format format.sh
diff --git a/README b/README
index 59c362415a9199d08fb6dfbb1ed044c66e647254..b0a21c610f7ab4c380e523b51c6f74b81f98a2f2 100644
--- a/README
+++ b/README
@@ -11,9 +11,13 @@
 
 See INSTALL.swift for install instructions.
 
-Usage: swift [OPTION] PARAMFILE
+Usage: swift [OPTION]... PARAMFILE
+       swift_mpi [OPTION]... PARAMFILE
+       swift_fixdt [OPTION]... PARAMFILE
+       swift_fixdt_mpi [OPTION]... PARAMFILE
 
 Valid options are:
+  -a          Pin runners using processor affinity
   -c          Run with cosmological time integration
   -d          Dry run. Read the parameter file, allocate memory but does not read 
               the particles from ICs and exit before the start of time integration.
diff --git a/configure.ac b/configure.ac
index 11ad6550d87f6764570f48449719292bcec3704d..497107121753f212dd8b07f5a8e8eed7acdf82b5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -16,7 +16,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 # Init the project.
-AC_INIT([SWIFT],[0.2.0])
+AC_INIT([SWIFT],[0.3.0])
 AC_CONFIG_SRCDIR([src/space.c])
 AC_CONFIG_AUX_DIR([.])
 AM_INIT_AUTOMAKE
@@ -329,7 +329,7 @@ fi
 AM_CONDITIONAL([HAVEPARALLELHDF5],[test "$have_parallel_hdf5" = "yes"])
 
 # Check for setaffinity.
-AC_CHECK_FUNC(pthread_setaffinity_np, AC_DEFINE([HAVE_SETAFFINITY],[true],
+AC_CHECK_FUNC(pthread_setaffinity_np, AC_DEFINE([HAVE_SETAFFINITY],[1],
     [Defined if pthread_setaffinity_np exists.]) )
 AM_CONDITIONAL(HAVESETAFFINITY,
     [test "$ac_cv_func_pthread_setaffinity_np" = "yes"])
diff --git a/doc/Makefile.am b/doc/Makefile.am
index 1736a4ad81a674e68932a45e2f8bea8ef82305f3..dc88489ca4ac4ef2fe856d910420fee6a7e87c8a 100644
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -1,3 +1,19 @@
+# This file is part of SWIFT.
+# Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+#                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 doxyfile.stamp: 
 if HAVE_DOXYGEN
diff --git a/examples/CosmoVolume/cosmoVolume.yml b/examples/CosmoVolume/cosmoVolume.yml
index 2da45221043187402bb42028f4d03723cbb26688..707c638e5db2c046a2516c0ef9249c08a5c18432 100644
--- a/examples/CosmoVolume/cosmoVolume.yml
+++ b/examples/CosmoVolume/cosmoVolume.yml
@@ -1,6 +1,5 @@
-
 # Define the system of units to use internally. 
-UnitSystem:
+InternalUnitSystem:
   UnitMass_in_cgs:     1   # Grams
   UnitLength_in_cgs:   1   # Centimeters
   UnitVelocity_in_cgs: 1   # Centimeters per second
@@ -9,39 +8,39 @@ UnitSystem:
 
 # Parameters for the task scheduling
 Scheduler:
-  nr_queues:        0        # The number of task queues to use. Use 0  to let the system decide.
-  cell_max_size:    8000000  # Maximal number of interactions per task (this is the default value).
-  cell_sub_size:    5000     # Maximal number of interactions per sub-task  (this is the default value).
-  cell_split_size:  400      # Maximal number of particles per cell (this is the default value).
+  cell_sub_size:    6000     # Value used for the original scaling tests
+  cell_split_size:  300      # Value used for the original scaling tests
 
 # Parameters governing the time integration
 TimeIntegration:
   time_begin: 0.    # The starting time of the simulation (in internal units).
-  time_end:   1.    # The end time of the simulation (in internal units).
-  dt_min:     1e-6  # The minimal time-step size of the simulation (in internal units).
-  dt_max:     1e-2  # The maximal time-step size of the simulation (in internal units).
+  time_end:   1e-4  # The end time of the simulation (in internal units).
+  dt_min:     1e-7  # The minimal time-step size of the simulation (in internal units).
+  dt_max:     1e-4  # The maximal time-step size of the simulation (in internal units).
+
+# Parameters governing the snapshots
+Snapshots:
+  basename:            cosmo # Common part of the name of output files
+  time_first:          0.    # Time of the first output (in internal units)
+  delta_time:          0.05  # Time difference between consecutive outputs (in internal units)
+  UnitMass_in_cgs:     1   # Grams
+  UnitLength_in_cgs:   1   # Centimeters
+  UnitVelocity_in_cgs: 1   # Centimeters per second
+  UnitCurrent_in_cgs:  1   # Amperes
+  UnitTemp_in_cgs:     1   # Kelvin
+
+# Parameters governing the conserved quantities statistics
+Statistics:
+  delta_time:          1e-2 # Time between statistics output
 
 # Parameters for the hydrodynamics scheme
 SPH:
-  resolution_eta:        1.2349   # Target smoothing length in units of the mean inter-particle separation (1.2349 == 48Ngbs with the cubic spline kernel).
-  delta_neighbours:      1.       # The tolerance for the targetted number of neighbours.
+  resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
+  delta_neighbours:      0.1      # The tolerance for the targeted number of neighbours.
+  max_smoothing_length:  0.705    # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
-  max_smoothing_length:  0.6      # Maximal smoothing length allowed (in internal units).
 
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  ./cosmoVolume.hdf5     # The file to read
-  h_scaling:  1.                    # A scaling factor to apply to all smoothing lengths in the ICs.
-  shift_x:    0.                    # A shift to apply to all particles read from the ICs (in internal units).
-  shift_y:    0.
-  shift_z:    0.
 
-# Parameters govering domain decomposition
-DomainDecomposition:
-  initial_type:       m     # The initial strategy ("g", "m", "w", or "v"). See documentation for details.
-  initial_grid_x:    10     # Grid size if the 'g' strategy is chosen.
-  initial_grid_y:    10
-  initial_grid_z:    10
-  repartition_type:   b     # The re-decomposition strategy ("n", "b", "v", "e" or "x"). See documentation for details.
- 
diff --git a/examples/ExternalPointMass/externalPointMass.yml b/examples/ExternalPointMass/externalPointMass.yml
new file mode 100644
index 0000000000000000000000000000000000000000..52330163caa13609fd7674a5cdf2921743fbe227
--- /dev/null
+++ b/examples/ExternalPointMass/externalPointMass.yml
@@ -0,0 +1,51 @@
+# Define the system of units to use internally. 
+InternalUnitSystem:
+  UnitMass_in_cgs:     1.9885e33     # Grams
+  UnitLength_in_cgs:   3.0856776e21  # Centimeters
+  UnitVelocity_in_cgs: 1e5           # Centimeters per second
+  UnitCurrent_in_cgs:  1   # Amperes
+  UnitTemp_in_cgs:     1   # Kelvin
+
+# Parameters governing the time integration
+TimeIntegration:
+  time_begin: 0.    # The starting time of the simulation (in internal units).
+  time_end:   1.    # The end time of the simulation (in internal units).
+  dt_min:     1e-6  # The minimal time-step size of the simulation (in internal units).
+  dt_max:     1e-3  # The maximal time-step size of the simulation (in internal units).
+
+# Parameters governing the snapshots
+Snapshots:
+  basename:            pointMass # Common part of the name of output files
+  time_first:          0.        # Time of the first output (in internal units)
+  delta_time:          0.02      # Time difference between consecutive outputs (in internal units)
+  UnitMass_in_cgs:     1.9885e33     # Grams
+  UnitLength_in_cgs:   3.0856776e21  # Centimeters
+  UnitVelocity_in_cgs: 1e5           # Centimeters per second
+  UnitCurrent_in_cgs:  1   # Amperes
+  UnitTemp_in_cgs:     1   # Kelvin
+
+# Parameters governing the conserved quantities statistics
+Statistics:
+  delta_time:          1e-2 # Time between statistics output
+
+# Parameters for the hydrodynamics scheme
+SPH:
+  resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
+  delta_neighbours:      0.1      # The tolerance for the targeted number of neighbours.
+  max_smoothing_length:  10.      # Maximal smoothing length allowed (in internal units).
+  CFL_condition:         0.1      # Courant-Friedrichs-Lewy condition for time integration.
+
+# Parameters related to the initial conditions
+InitialConditions:
+  file_name:  Sphere.hdf5           # The file to read
+  shift_x:    50.                   # A shift to apply to all particles read from the ICs (in internal units).
+  shift_y:    50.
+  shift_z:    50.
+
+# External potential parameters
+PointMass:
+  position_x:      50.     # location of external point mass in internal units
+  position_y:      50.
+  position_z:      50.	
+  mass:            1e10     # mass of external point mass in internal units
+
diff --git a/examples/ExternalPointMass/makeIC.py b/examples/ExternalPointMass/makeIC.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1b032e47383d6e757306f09063a8931635ea8e9
--- /dev/null
+++ b/examples/ExternalPointMass/makeIC.py
@@ -0,0 +1,142 @@
+###############################################################################
+ # This file is part of SWIFT.
+ # Copyright (c) 2016 John A. Regan (john.a.regan@durham.ac.uk)
+ #                    Tom Theuns (tom.theuns@durham.ac.uk)
+ # 
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ # 
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ # GNU General Public License for more details.
+ # 
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ # 
+ ##############################################################################
+
+import h5py
+import sys
+import numpy
+import math
+import random
+
+# Generates a random distribution of particles, for motion in an external potential centred at (0,0,0)
+
+# physical constants in cgs
+NEWTON_GRAVITY_CGS  = 6.672e-8
+SOLAR_MASS_IN_CGS   = 1.9885e33
+PARSEC_IN_CGS       = 3.0856776e18
+
+# choice of units
+const_unit_length_in_cgs   =   (1000*PARSEC_IN_CGS)
+const_unit_mass_in_cgs     =   (SOLAR_MASS_IN_CGS)
+const_unit_velocity_in_cgs =   (1e5)
+
+print "UnitMass_in_cgs:     ", const_unit_mass_in_cgs 
+print "UnitLength_in_cgs:   ", const_unit_length_in_cgs
+print "UnitVelocity_in_cgs: ", const_unit_velocity_in_cgs
+
+# derived units
+const_unit_time_in_cgs = (const_unit_length_in_cgs / const_unit_velocity_in_cgs)
+const_G                = ((NEWTON_GRAVITY_CGS*const_unit_mass_in_cgs*const_unit_time_in_cgs*const_unit_time_in_cgs/(const_unit_length_in_cgs*const_unit_length_in_cgs*const_unit_length_in_cgs)))
+print 'G=', const_G
+
+
+# Parameters
+periodic= 1            # 1 For periodic box
+boxSize = 100.         # 
+Radius  = boxSize / 4. # maximum radius of particles
+G       = const_G 
+Mass    = 1e10         
+
+N       = int(sys.argv[1])  # Number of particles
+L       = N**(1./3.)
+
+# these are not used but necessary for I/O
+rho = 2.              # Density
+P = 1.                # Pressure
+gamma = 5./3.         # Gas adiabatic index
+fileName = "Sphere.hdf5" 
+
+
+#---------------------------------------------------
+numPart        = N
+mass           = 1
+internalEnergy = P / ((gamma - 1.)*rho)
+
+#--------------------------------------------------
+
+#File
+file = h5py.File(fileName, 'w')
+
+# Header
+grp = file.create_group("/Header")
+grp.attrs["BoxSize"] = boxSize
+grp.attrs["NumPart_Total"] =  [0, numPart, 0, 0, 0, 0]
+grp.attrs["NumPart_Total_HighWord"] = [0, 0, 0, 0, 0, 0]
+grp.attrs["NumPart_ThisFile"] = [0, numPart, 0, 0, 0, 0]
+grp.attrs["Time"] = 0.0
+grp.attrs["NumFilesPerSnapshot"] = 1
+grp.attrs["MassTable"] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+grp.attrs["Flag_Entropy_ICs"] = [0, 0, 0, 0, 0, 0]
+
+
+#Runtime parameters
+grp = file.create_group("/RuntimePars")
+grp.attrs["PeriodicBoundariesOn"] = periodic
+
+#Particle group
+#grp0 = file.create_group("/PartType0")
+grp1 = file.create_group("/PartType1")
+#generate particle positions
+radius = Radius * (numpy.random.rand(N))**(1./3.) 
+ctheta = -1. + 2 * numpy.random.rand(N)
+stheta = numpy.sqrt(1.-ctheta**2)
+phi    =  2 * math.pi * numpy.random.rand(N)
+r      = numpy.zeros((numPart, 3))
+# r[:,0] = radius * stheta * numpy.cos(phi)
+# r[:,1] = radius * stheta * numpy.sin(phi)
+# r[:,2] = radius * ctheta
+r[:,0] = radius
+#
+speed  = numpy.sqrt(G * Mass / radius)
+v      = numpy.zeros((numPart, 3))
+omega  = speed / radius
+period = 2.*math.pi/omega
+print 'period = minimum = ',min(period), ' maximum = ',max(period)
+
+v[:,0] = -omega * r[:,1]
+v[:,1] =  omega * r[:,0]
+
+ds = grp1.create_dataset('Velocities', (numPart, 3), 'f')
+ds[()] = v
+v = numpy.zeros(1)
+
+m = numpy.full((numPart, ), mass)
+ds = grp1.create_dataset('Masses', (numPart,), 'f')
+ds[()] = m
+m = numpy.zeros(1)
+
+h = numpy.full((numPart, ), 1.1255 * boxSize / L)
+ds = grp1.create_dataset('SmoothingLength', (numPart,), 'f')
+ds[()] = h
+h = numpy.zeros(1)
+
+u = numpy.full((numPart, ), internalEnergy)
+ds = grp1.create_dataset('InternalEnergy', (numPart,), 'f')
+ds[()] = u
+u = numpy.zeros(1)
+
+
+ids = 1 + numpy.linspace(0, numPart, numPart, endpoint=False)
+ds = grp1.create_dataset('ParticleIDs', (numPart, ), 'L')
+ds[()] = ids
+
+ds = grp1.create_dataset('Coordinates', (numPart, 3), 'd')
+ds[()] = r
+
+file.close()
diff --git a/examples/ExternalPointMass/run.sh b/examples/ExternalPointMass/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..9bbe03738b472f48715ae1875dfd462ef577b3d9
--- /dev/null
+++ b/examples/ExternalPointMass/run.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Generate the initial conditions if they are not present.
+if [ ! -e Sphere.hdf5 ]
+then
+    echo "Generating initial conditions for the point mass potential box example..."
+    python makeIC.py 10000
+fi
+
+../swift -g -t 2 externalPointMass.yml
diff --git a/examples/ExternalPointMass/test.pro b/examples/ExternalPointMass/test.pro
new file mode 100644
index 0000000000000000000000000000000000000000..21c10e9d27daa45b085c6a659ba3cf7260f017fb
--- /dev/null
+++ b/examples/ExternalPointMass/test.pro
@@ -0,0 +1,65 @@
+;
+;  test energy / angular momentum conservation of test problem
+;
+@physunits
+
+indir    = '/gpfs/data/tt/Codes/Swift-git/swiftsim/examples/'
+basefile = 'output_'
+nfiles   = 657
+nfollow  = 100 ; number of particles to follow
+eout     = fltarr(nfollow, nfiles)
+ekin     = fltarr(nfollow, nfiles)
+epot     = fltarr(nfollow, nfiles)
+tout     = fltarr(nfiles)
+; set properties of potential
+uL  = 1e3 * phys.pc             ; unit of length
+uM  = phys.msun                 ; unit of mass
+uV  = 1d5                       ; unit of velocity
+
+; derived units
+constG   = 10.^(alog10(phys.g)+alog10(uM)-2d0*alog10(uV)-alog10(uL)) ;
+pcentre  = [50.,50.,50.] * 1d3 * pc / uL
+mextern  = 1d10 * msun / uM
+;
+;
+;
+ifile  = 0
+for ifile=0,nfiles-1 do begin
+;for ifile=0,3 do begin
+   inf    = indir + basefile + strtrim(string(ifile,'(i3.3)'),1) + '.hdf5'
+   time   = h5ra(inf, 'Header','Time')
+   p      = h5rd(inf,'PartType1/Coordinates')
+   v      = h5rd(inf,'PartType1/Velocities')
+   id     = h5rd(inf,'PartType1/ParticleIDs')
+   indx   = sort(id)
+;
+   id     = id[indx]
+   for ic=0,2 do begin
+      tmp = reform(p[ic,*]) & p[ic,*] = tmp[indx]
+      tmp = reform(v[ic,*]) & v[ic,*] = tmp[indx]
+   endfor
+; calculate energy
+   dd  = size(p,/dimen) & npart = dd[1]
+   ener = fltarr(npart)
+   dr   = fltarr(npart) & dv = dr
+   for ic=0,2 do dr[*] = dr[*] + (p[ic,*]-pcentre[ic])^2
+   for ic=0,2 do dv[*] = dv[*] + v[ic,*]^2
+   dr = sqrt(dr)
+;   print,'time = ',time,p[0,0],v[0,0],id[0]
+   ek   = 0.5 * dv
+   ep   = - constG * mextern / dr
+   ener = ek + ep
+   tout(ifile) = time
+   eout(*,ifile) = ener[0:nfollow-1]
+   ekin(*,ifile) = ek[0:nfollow-1]
+   epot(*,ifile) = ep[0:nfollow-1]
+endfor
+
+; calculate relative energy change
+de = 0.0 * eout
+for ifile=1, nfiles -1 do de[*,ifile] = (eout[*,ifile]-eout[*,0])/eout[*,0]
+
+
+end
+
+
diff --git a/examples/Makefile.am b/examples/Makefile.am
index 200d340be168651e6dfc91f4b89a0a343d34d9da..735817c24cc52786e4c562e46e3619fb4a9a2e34 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -1,4 +1,3 @@
-
 # This file is part of SWIFT.
 # Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
 #                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
@@ -62,21 +61,19 @@ swift_fixdt_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="e
 swift_fixdt_mpi_LDADD =  ../src/.libs/libswiftsim_mpi.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(MPI_LIBS)
 
 # Scripts to generate ICs
-EXTRA_DIST = UniformBox/makeIC.py \
+EXTRA_DIST = UniformBox/makeIC.py UniformBox/run.sh UniformBox/uniformBox.yml \
 	     UniformDMBox/makeIC.py \
 	     PerturbedBox/makeIC.py \
-	     SedovBlast/makeIC.py SedovBlast/makeIC_fcc.py SedovBlast/solution.py \
-	     SodShock/makeIC.py SodShock/solution.py SodShock/glass_001.hdf5 SodShock/glass_002.hdf5 SodShock/rhox.py \
-	     CosmoVolume/getIC.sh \
+	     SedovBlast/makeIC.py SedovBlast/makeIC_fcc.py SedovBlast/solution.py SedovBlast/run.sh SedovBlast/sedov.yml \
+	     SodShock/makeIC.py SodShock/solution.py SodShock/glass_001.hdf5 SodShock/glass_002.hdf5 SodShock/rhox.py SodShock/run.sh SodShock/sodShock.yml \
+	     CosmoVolume/getIC.sh CosmoVolume/run.sh CosmoVolume/cosmoVolume.yml \
 	     BigCosmoVolume/makeIC.py \
 	     BigPerturbedBox/makeIC_fcc.py \
              GreshoVortex/makeIC.py GreshoVortex/solution.py \
-             MultiTypes/makeIC.py
-
+             MultiTypes/makeIC.py \
+             parameter_example.yml
 
 # Scripts to plot task graphs
 EXTRA_DIST += plot_tasks_MPI.py plot_tasks.py \
 	      process_plot_tasks_MPI process_plot_tasks
 
-# Simple run scripts
-EXTRA_DIST += runs.sh
diff --git a/examples/SedovBlast/sedov.yml b/examples/SedovBlast/sedov.yml
index 55974b03b823befde8365cddab187f5a18c5bbb7..9fbabb4969b6accdb7323d8270b735951ac0693a 100644
--- a/examples/SedovBlast/sedov.yml
+++ b/examples/SedovBlast/sedov.yml
@@ -1,19 +1,11 @@
-
 # Define the system of units to use internally. 
-UnitSystem:
+InternalUnitSystem:
   UnitMass_in_cgs:     1   # Grams
   UnitLength_in_cgs:   1   # Centimeters
   UnitVelocity_in_cgs: 1   # Centimeters per second
   UnitCurrent_in_cgs:  1   # Amperes
   UnitTemp_in_cgs:     1   # Kelvin
 
-# Parameters for the task scheduling
-Scheduler:
-  nr_queues:        0        # The number of task queues to use. Use 0  to let the system decide.
-  cell_max_size:    8000000  # Maximal number of interactions per task (this is the default value).
-  cell_sub_size:    5000     # Maximal number of interactions per sub-task  (this is the default value).
-  cell_split_size:  400      # Maximal number of particles per cell (this is the default value).
-
 # Parameters governing the time integration
 TimeIntegration:
   time_begin: 0.    # The starting time of the simulation (in internal units).
@@ -21,27 +13,29 @@ TimeIntegration:
   dt_min:     1e-7  # The minimal time-step size of the simulation (in internal units).
   dt_max:     1e-2  # The maximal time-step size of the simulation (in internal units).
 
+# Parameters governing the snapshots
+Snapshots:
+  basename:            sedov # Common part of the name of output files
+  time_first:          0.    # Time of the first output (in internal units)
+  delta_time:          0.1   # Time difference between consecutive outputs (in internal units)
+  UnitMass_in_cgs:     1   # Grams
+  UnitLength_in_cgs:   1   # Centimeters
+  UnitVelocity_in_cgs: 1   # Centimeters per second
+  UnitCurrent_in_cgs:  1   # Amperes
+  UnitTemp_in_cgs:     1   # Kelvin
+
+# Parameters governing the conserved quantities statistics
+Statistics:
+  delta_time:          1e-3 # Time between statistics output
+
 # Parameters for the hydrodynamics scheme
 SPH:
-  resolution_eta:        1.2349   # Target smoothing length in units of the mean inter-particle separation (1.2349 == 48Ngbs with the cubic spline kernel).
-  delta_neighbours:      1.       # The tolerance for the targetted number of neighbours.
-  CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
+  resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
+  delta_neighbours:      0.1      # The tolerance for the targeted number of neighbours.
   max_smoothing_length:  1.       # Maximal smoothing length allowed (in internal units).
+  CFL_condition:         0.1      # Courant-Friedrichs-Lewy condition for time integration.
 
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  ./sedov.hdf5          # The file to read
-  h_scaling:  1.                    # A scaling factor to apply to all smoothing lengths in the ICs.
-  shift_x:    0.                    # A shift to apply to all particles read from the ICs (in internal units).
-  shift_y:    0.
-  shift_z:    0.
 
-# Parameters govering domain decomposition
-DomainDecomposition:
-  initial_type:       m     # The initial strategy ("g", "m", "w", or "v"). See documentation for details.
-  initial_grid_x:    10     # Grid size if the 'g' strategy is chosen.
-  initial_grid_y:    10
-  initial_grid_z:    10
-  repartition_type:   b     # The re-decomposition strategy ("n", "b", "v", "e" or "x"). See documentation for details.
- 
diff --git a/examples/SodShock/sodShock.yml b/examples/SodShock/sodShock.yml
index ab43d6682b2a16304d364784efee530ad5289cef..003b7286777230ceb3b84ee01c6a1f335aeb9476 100644
--- a/examples/SodShock/sodShock.yml
+++ b/examples/SodShock/sodShock.yml
@@ -1,19 +1,11 @@
-
 # Define the system of units to use internally. 
-UnitSystem:
+InternalUnitSystem:
   UnitMass_in_cgs:     1   # Grams
   UnitLength_in_cgs:   1   # Centimeters
   UnitVelocity_in_cgs: 1   # Centimeters per second
   UnitCurrent_in_cgs:  1   # Amperes
   UnitTemp_in_cgs:     1   # Kelvin
 
-# Parameters for the task scheduling
-Scheduler:
-  nr_queues:        0        # The number of task queues to use. Use 0  to let the system decide.
-  cell_max_size:    8000000  # Maximal number of interactions per task (this is the default value).
-  cell_sub_size:    5000     # Maximal number of interactions per sub-task.
-  cell_split_size:  400      # Maximal number of particles per cell (this is the default value).
-
 # Parameters governing the time integration
 TimeIntegration:
   time_begin: 0.    # The starting time of the simulation (in internal units).
@@ -21,27 +13,29 @@ TimeIntegration:
   dt_min:     1e-7  # The minimal time-step size of the simulation (in internal units).
   dt_max:     1e-2  # The maximal time-step size of the simulation (in internal units).
 
+# Parameters governing the snapshots
+Snapshots:
+  basename:            sod # Common part of the name of output files
+  time_first:          0.  # Time of the first output (in internal units)
+  delta_time:          0.1 # Time difference between consecutive outputs (in internal units)
+  UnitMass_in_cgs:     1   # Grams
+  UnitLength_in_cgs:   1   # Centimeters
+  UnitVelocity_in_cgs: 1   # Centimeters per second
+  UnitCurrent_in_cgs:  1   # Amperes
+  UnitTemp_in_cgs:     1   # Kelvin
+
+# Parameters governing the conserved quantities statistics
+Statistics:
+  delta_time:          1e-2 # Time between statistics output
+
 # Parameters for the hydrodynamics scheme
 SPH:
-  resolution_eta:        1.2349   # Target smoothing length in units of the mean inter-particle separation (1.2349 == 48Ngbs with the cubic spline kernel).
-  delta_neighbours:      1.       # The tolerance for the targetted number of neighbours.
-  CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
+  resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
+  delta_neighbours:      0.1      # The tolerance for the targeted number of neighbours.
   max_smoothing_length:  0.01     # Maximal smoothing length allowed (in internal units).
+  CFL_condition:         0.1      # Courant-Friedrichs-Lewy condition for time integration.
 
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  ./sodShock.hdf5       # The file to read
-  h_scaling:  1.                    # A scaling factor to apply to all smoothing lengths in the ICs.
-  shift_x:    0.                    # A shift to apply to all particles read from the ICs (in internal units).
-  shift_y:    0.
-  shift_z:    0.
 
-# Parameters govering domain decomposition
-DomainDecomposition:
-  initial_type:       m     # The initial strategy ("g", "m", "w", or "v"). See documentation for details.
-  initial_grid_x:    10     # Grid size if the 'g' strategy is chosen.
-  initial_grid_y:    10
-  initial_grid_z:    10
-  repartition_type:   b     # The re-decomposition strategy ("n", "b", "v", "e" or "x"). See documentation for details.
- 
diff --git a/examples/UniformBox/uniformBox.yml b/examples/UniformBox/uniformBox.yml
index 0474b0f8202effa73210ee2b459806f2376a37f2..50afbee02ddc8a801ca77ed4900b7d2e0e3b50b5 100644
--- a/examples/UniformBox/uniformBox.yml
+++ b/examples/UniformBox/uniformBox.yml
@@ -1,19 +1,11 @@
-
 # Define the system of units to use internally. 
-UnitSystem:
+InternalUnitSystem:
   UnitMass_in_cgs:     1   # Grams
   UnitLength_in_cgs:   1   # Centimeters
   UnitVelocity_in_cgs: 1   # Centimeters per second
   UnitCurrent_in_cgs:  1   # Amperes
   UnitTemp_in_cgs:     1   # Kelvin
 
-# Parameters for the task scheduling
-Scheduler:
-  nr_queues:        0        # The number of task queues to use. Use 0  to let the system decide.
-  cell_max_size:    8000000  # Maximal number of interactions per task (this is the default value).
-  cell_sub_size:    5000     # Maximal number of interactions per sub-task  (this is the default value).
-  cell_split_size:  400      # Maximal number of particles per cell (this is the default value).
-
 # Parameters governing the time integration
 TimeIntegration:
   time_begin: 0.    # The starting time of the simulation (in internal units).
@@ -21,27 +13,28 @@ TimeIntegration:
   dt_min:     1e-6  # The minimal time-step size of the simulation (in internal units).
   dt_max:     1e-2  # The maximal time-step size of the simulation (in internal units).
 
+# Parameters governing the snapshots
+Snapshots:
+  basename:            uniformBox # Common part of the name of output files
+  time_first:          0.         # Time of the first output (in internal units)
+  delta_time:          0.01       # Time difference between consecutive outputs (in internal units)
+  UnitMass_in_cgs:     1   # Grams
+  UnitLength_in_cgs:   1   # Centimeters
+  UnitVelocity_in_cgs: 1   # Centimeters per second
+  UnitCurrent_in_cgs:  1   # Amperes
+  UnitTemp_in_cgs:     1   # Kelvin
+
+# Parameters governing the conserved quantities statistics
+Statistics:
+  delta_time:          1e-2 # Time between statistics output
+
 # Parameters for the hydrodynamics scheme
 SPH:
-  resolution_eta:        1.2349   # Target smoothing length in units of the mean inter-particle separation (1.2349 == 48Ngbs with the cubic spline kernel).
-  delta_neighbours:      1.       # The tolerance for the targetted number of neighbours.
-  CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
+  resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
+  delta_neighbours:      0.1      # The tolerance for the targeted number of neighbours.
   max_smoothing_length:  0.1      # Maximal smoothing length allowed (in internal units).
-
+  CFL_condition:         0.1      # Courant-Friedrichs-Lewy condition for time integration.
+  
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  ./uniformBox.hdf5     # The file to read
-  h_scaling:  1.                    # A scaling factor to apply to all smoothing lengths in the ICs.
-  shift_x:    0.                    # A shift to apply to all particles read from the ICs (in internal units).
-  shift_y:    0.
-  shift_z:    0.
-
-# Parameters govering domain decomposition
-DomainDecomposition:
-  initial_type:       m     # The initial strategy ("g", "m", "w", or "v"). See documentation for details.
-  initial_grid_x:    10     # Grid size if the 'g' strategy is chosen.
-  initial_grid_y:    10
-  initial_grid_z:    10
-  repartition_type:   b     # The re-decomposition strategy ("n", "b", "v", "e" or "x"). See documentation for details.
- 
diff --git a/examples/main.c b/examples/main.c
index 5a20125fe84bac5b793a2e1e2694cdfd76042051..65a948634738ad5113b8275da3591af5039e6997 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -3,6 +3,9 @@
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
  *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *                    Angus Lepper (angus.lepper@ed.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -24,10 +27,10 @@
 
 /* Some standard headers. */
 #include <fenv.h>
-#include <unistd.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
@@ -47,8 +50,13 @@
  */
 void print_help_message() {
 
-  printf("\nUsage: swift [OPTION] PARAMFILE\n\n");
+  printf("\nUsage: swift [OPTION]... PARAMFILE\n");
+  printf("       swift_mpi [OPTION]... PARAMFILE\n");
+  printf("       swift_fixdt [OPTION]... PARAMFILE\n");
+  printf("       swift_fixdt_mpi [OPTION]... PARAMFILE\n\n");
+
   printf("Valid options are:\n");
+  printf("  %2s %8s %s\n", "-a", "", "Pin runners using processor affinity");
   printf("  %2s %8s %s\n", "-c", "", "Run with cosmological time integration");
   printf(
       "  %2s %8s %s\n", "-d", "",
@@ -118,24 +126,16 @@ int main(int argc, char *argv[]) {
   fflush(stdout);
 #endif
 
+/* Let's pin the main thread */
 #if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
-  if ((ENGINE_POLICY) & engine_policy_setaffinity) {
-    /* Ensure the NUMA node on which we initialise (first touch) everything
-     * doesn't change before engine_init allocates NUMA-local workers.
-     * Otherwise, we may be scheduled elsewhere between the two times.
-     */
-    cpu_set_t affinity;
-    CPU_ZERO(&affinity);
-    CPU_SET(sched_getcpu(), &affinity);
-    if (sched_setaffinity(0, sizeof(cpu_set_t), &affinity) != 0) {
-      error("failed to set entry thread's affinity");
-    }
-  }
+  if (((ENGINE_POLICY)&engine_policy_setaffinity) == engine_policy_setaffinity)
+    engine_pin();
 #endif
 
   /* Welcome to SWIFT, you made the right choice */
   if (myrank == 0) greetings();
 
+  int with_aff = 0;
   int dry_run = 0;
   int dump_tasks = 0;
   int with_cosmology = 0;
@@ -150,7 +150,10 @@ int main(int argc, char *argv[]) {
 
   /* Parse the parameters */
   int c;
-  while ((c = getopt(argc, argv, "cdef:gGhst:v:y")) != -1) switch (c) {
+  while ((c = getopt(argc, argv, "acdef:gGhst:v:y:")) != -1) switch (c) {
+      case 'a':
+        with_aff = 1;
+        break;
       case 'c':
         with_cosmology = 1;
         break;
@@ -233,8 +236,8 @@ int main(int argc, char *argv[]) {
         "Executing a dry run. No i/o or time integration will be performed.");
 
   /* Report CPU frequency. */
+  cpufreq = clocks_get_cpufreq();
   if (myrank == 0) {
-    cpufreq = clocks_get_cpufreq();
     message("CPU frequency used for tick conversion: %llu Hz", cpufreq);
   }
 
@@ -249,6 +252,8 @@ int main(int argc, char *argv[]) {
     message("sizeof(struct part)  is %4zi bytes.", sizeof(struct part));
     message("sizeof(struct xpart) is %4zi bytes.", sizeof(struct xpart));
     message("sizeof(struct gpart) is %4zi bytes.", sizeof(struct gpart));
+    message("sizeof(struct task)  is %4zi bytes.", sizeof(struct task));
+    message("sizeof(struct cell)  is %4zi bytes.", sizeof(struct cell));
   }
 
   /* How vocal are we ? */
@@ -268,17 +273,6 @@ int main(int argc, char *argv[]) {
   MPI_Bcast(params, sizeof(struct swift_params), MPI_BYTE, 0, MPI_COMM_WORLD);
 #endif
 
-  /* Initialize unit system */
-  struct UnitSystem us;
-  units_init(&us, params);
-  if (myrank == 0) {
-    message("Unit system: U_M = %e g.", us.UnitMass_in_cgs);
-    message("Unit system: U_L = %e cm.", us.UnitLength_in_cgs);
-    message("Unit system: U_t = %e s.", us.UnitTime_in_cgs);
-    message("Unit system: U_I = %e A.", us.UnitCurrent_in_cgs);
-    message("Unit system: U_T = %e K.", us.UnitTemperature_in_cgs);
-  }
-
 /* Prepare the domain decomposition scheme */
 #ifdef WITH_MPI
   struct partition initial_partition;
@@ -296,6 +290,29 @@ int main(int argc, char *argv[]) {
   }
 #endif
 
+  /* Initialize unit system and constants */
+  struct UnitSystem us;
+  struct phys_const prog_const;
+  units_init(&us, params, "InternalUnitSystem");
+  phys_const_init(&us, &prog_const);
+  if (myrank == 0 && verbose > 0) {
+    message("Unit system: U_M = %e g.", us.UnitMass_in_cgs);
+    message("Unit system: U_L = %e cm.", us.UnitLength_in_cgs);
+    message("Unit system: U_t = %e s.", us.UnitTime_in_cgs);
+    message("Unit system: U_I = %e A.", us.UnitCurrent_in_cgs);
+    message("Unit system: U_T = %e K.", us.UnitTemperature_in_cgs);
+    phys_const_print(&prog_const);
+  }
+
+  /* Initialise the hydro properties */
+  struct hydro_props hydro_properties;
+  hydro_props_init(&hydro_properties, params);
+
+  /* Initialise the external potential properties */
+  struct external_potential potential;
+  if (with_external_gravity) potential_init(params, &us, &potential);
+  if (with_external_gravity && myrank == 0) potential_print(&potential);
+
   /* Read particles and space information from (GADGET) ICs */
   char ICfileName[200] = "";
   parser_get_param_string(params, "InitialConditions:file_name", ICfileName);
@@ -333,6 +350,13 @@ int main(int argc, char *argv[]) {
     for (size_t k = 0; k < Ngas; ++k) parts[k].gpart = NULL;
     Ngpart = 0;
   }
+  if (!with_hydro) {
+    free(parts);
+    parts = NULL;
+    for (size_t k = 0; k < Ngpart; ++k)
+      if (gparts[k].id_or_neg_offset < 0) error("Linking problem");
+    Ngas = 0;
+  }
 
   /* Get the total number of particles across all nodes. */
   long long N_total[2] = {0, 0};
@@ -369,6 +393,7 @@ int main(int argc, char *argv[]) {
     message("%zi parts in %i cells.", s.nr_parts, s.tot_cells);
     message("%zi gparts in %i cells.", s.nr_gparts, s.tot_cells);
     message("maximum depth is %d.", s.maxdepth);
+    fflush(stdout);
   }
 
   /* Verify that each particle is in it's proper cell. */
@@ -395,8 +420,9 @@ int main(int argc, char *argv[]) {
   /* Initialize the engine with the space and policies. */
   if (myrank == 0) clocks_gettime(&tic);
   struct engine e;
-  engine_init(&e, &s, params, nr_nodes, myrank, nr_threads, engine_policies,
-              talking);
+  engine_init(&e, &s, params, nr_nodes, myrank, nr_threads, with_aff,
+              engine_policies, talking, &prog_const, &hydro_properties,
+              &potential);
   if (myrank == 0) {
     clocks_gettime(&toc);
     message("engine_init took %.3f %s.", clocks_diff(&tic, &toc),
@@ -404,32 +430,8 @@ int main(int argc, char *argv[]) {
     fflush(stdout);
   }
 
-  /* Now that everything is ready, no need for the parameters any more */
-  free(params);
-  params = NULL;
-
-  int with_outputs = 1;
-  if (with_outputs && !dry_run) {
-    /* Write the state of the system before starting time integration. */
-    if (myrank == 0) clocks_gettime(&tic);
-#if defined(WITH_MPI)
-#if defined(HAVE_PARALLEL_HDF5)
-    write_output_parallel(&e, &us, myrank, nr_nodes, MPI_COMM_WORLD,
-                          MPI_INFO_NULL);
-#else
-    write_output_serial(&e, &us, myrank, nr_nodes, MPI_COMM_WORLD,
-                        MPI_INFO_NULL);
-#endif
-#else
-    write_output_single(&e, &us);
-#endif
-    if (myrank == 0 && verbose) {
-      clocks_gettime(&toc);
-      message("writing particle properties took %.3f %s.",
-              clocks_diff(&tic, &toc), clocks_getunit());
-      fflush(stdout);
-    }
-  }
+  /* Write the state of the system before starting time integration. */
+  if (!dry_run) engine_dump_snapshot(&e);
 
 /* Init the runner history. */
 #ifdef HIST
@@ -439,10 +441,10 @@ int main(int argc, char *argv[]) {
   /* Get some info to the user. */
   if (myrank == 0) {
     message(
-        "Running on %lld gas particles and %lld DM particles until t=%.3e with "
-        "%i threads and %i queues (dt_min=%.3e, dt_max=%.3e)...",
-        N_total[0], N_total[1], e.timeEnd, e.nr_threads, e.sched.nr_queues,
-        e.dt_min, e.dt_max);
+        "Running on %lld gas particles and %lld DM particles from t=%.3e until "
+        "t=%.3e with %d threads and %d queues (dt_min=%.3e, dt_max=%.3e)...",
+        N_total[0], N_total[1], e.timeBegin, e.timeEnd, e.nr_threads,
+        e.sched.nr_queues, e.dt_min, e.dt_max);
     fflush(stdout);
   }
 
@@ -479,36 +481,12 @@ int main(int argc, char *argv[]) {
     if (j % 100 == 2) e.forcerepart = reparttype;
 #endif
 
+    /* Reset timers */
     timers_reset(timers_mask_all);
-#ifdef COUNTER
-    for (k = 0; k < runner_counter_count; k++) runner_counter[k] = 0;
-#endif
 
     /* Take a step. */
     engine_step(&e);
 
-    if (with_outputs && j % 100 == 0) {
-
-      if (myrank == 0) clocks_gettime(&tic);
-#if defined(WITH_MPI)
-#if defined(HAVE_PARALLEL_HDF5)
-      write_output_parallel(&e, &us, myrank, nr_nodes, MPI_COMM_WORLD,
-                            MPI_INFO_NULL);
-#else
-      write_output_serial(&e, &us, myrank, nr_nodes, MPI_COMM_WORLD,
-                          MPI_INFO_NULL);
-#endif
-#else
-      write_output_single(&e, &us);
-#endif
-      if (myrank == 0 && verbose) {
-        clocks_gettime(&toc);
-        message("writing particle properties took %.3f %s.",
-                clocks_diff(&tic, &toc), clocks_getunit());
-        fflush(stdout);
-      }
-    }
-
     /* Dump the task data using the given frequency. */
     if (dump_tasks && (dump_tasks == 1 || j % dump_tasks == 1)) {
 #ifdef WITH_MPI
@@ -534,20 +512,25 @@ int main(int argc, char *argv[]) {
           /* Open file and position at end. */
           file_thread = fopen(dumpfile, "a");
 
-          fprintf(file_thread, " %03i 0 0 0 0 %lli 0 0 0 0\n", myrank,
-                  e.tic_step);
+          fprintf(file_thread, " %03i 0 0 0 0 %lli %lli 0 0 0 0 %lli\n", myrank,
+                  e.tic_step, e.toc_step, cpufreq);
           int count = 0;
           for (int l = 0; l < e.sched.nr_tasks; l++)
             if (!e.sched.tasks[l].skip && !e.sched.tasks[l].implicit) {
-              fprintf(file_thread, " %03i %i %i %i %i %lli %lli %i %i %i\n",
-                      myrank, e.sched.tasks[l].rid, e.sched.tasks[l].type,
-                      e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL),
-                      e.sched.tasks[l].tic, e.sched.tasks[l].toc,
-                      (e.sched.tasks[l].ci != NULL) ? e.sched.tasks[l].ci->count
-                                                    : 0,
-                      (e.sched.tasks[l].cj != NULL) ? e.sched.tasks[l].cj->count
-                                                    : 0,
-                      e.sched.tasks[l].flags);
+              fprintf(
+                  file_thread, " %03i %i %i %i %i %lli %lli %i %i %i %i %i\n",
+                  myrank, e.sched.tasks[l].last_rid, e.sched.tasks[l].type,
+                  e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL),
+                  e.sched.tasks[l].tic, e.sched.tasks[l].toc,
+                  (e.sched.tasks[l].ci != NULL) ? e.sched.tasks[l].ci->count
+                                                : 0,
+                  (e.sched.tasks[l].cj != NULL) ? e.sched.tasks[l].cj->count
+                                                : 0,
+                  (e.sched.tasks[l].ci != NULL) ? e.sched.tasks[l].ci->gcount
+                                                : 0,
+                  (e.sched.tasks[l].cj != NULL) ? e.sched.tasks[l].cj->gcount
+                                                : 0,
+                  e.sched.tasks[l].flags);
               fflush(stdout);
               count++;
             }
@@ -565,15 +548,20 @@ int main(int argc, char *argv[]) {
       snprintf(dumpfile, 30, "thread_info-step%d.dat", j);
       FILE *file_thread;
       file_thread = fopen(dumpfile, "w");
+      /* Add some information to help with the plots */
+      fprintf(file_thread, " %i %i %i %i %lli %lli %i %i %i %lli\n", -2, -1, -1,
+              1, e.tic_step, e.toc_step, 0, 0, 0, cpufreq);
       for (int l = 0; l < e.sched.nr_tasks; l++)
         if (!e.sched.tasks[l].skip && !e.sched.tasks[l].implicit)
           fprintf(
-              file_thread, " %i %i %i %i %lli %lli %i %i\n",
-              e.sched.tasks[l].rid, e.sched.tasks[l].type,
+              file_thread, " %i %i %i %i %lli %lli %i %i %i %i\n",
+              e.sched.tasks[l].last_rid, e.sched.tasks[l].type,
               e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL),
               e.sched.tasks[l].tic, e.sched.tasks[l].toc,
               (e.sched.tasks[l].ci == NULL) ? 0 : e.sched.tasks[l].ci->count,
-              (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->count);
+              (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->count,
+              (e.sched.tasks[l].ci == NULL) ? 0 : e.sched.tasks[l].ci->gcount,
+              (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->gcount);
       fclose(file_thread);
 #endif
     }
@@ -590,28 +578,8 @@ int main(int argc, char *argv[]) {
            (double)runner_hist_bins[k]);
 #endif
 
-  if (with_outputs) {
-
-    if (myrank == 0) clocks_gettime(&tic);
-/* Write final output. */
-#if defined(WITH_MPI)
-#if defined(HAVE_PARALLEL_HDF5)
-    write_output_parallel(&e, &us, myrank, nr_nodes, MPI_COMM_WORLD,
-                          MPI_INFO_NULL);
-#else
-    write_output_serial(&e, &us, myrank, nr_nodes, MPI_COMM_WORLD,
-                        MPI_INFO_NULL);
-#endif
-#else
-    write_output_single(&e, &us);
-#endif
-    if (myrank == 0 && verbose) {
-      clocks_gettime(&toc);
-      message("writing particle properties took %.3f %s.",
-              clocks_diff(&tic, &toc), clocks_getunit());
-      fflush(stdout);
-    }
-  }
+  /* Write final output. */
+  engine_dump_snapshot(&e);
 
 #ifdef WITH_MPI
   if ((res = MPI_Finalize()) != MPI_SUCCESS)
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index e3cd5b209c9d36f9774364a661b77f3d649c398e..fb1e6ff6931b2fdee00792eed7f178872ee6d950 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -1,6 +1,5 @@
-
 # Define the system of units to use internally. 
-UnitSystem:
+InternalUnitSystem:
   UnitMass_in_cgs:     1   # Grams
   UnitLength_in_cgs:   1   # Centimeters
   UnitVelocity_in_cgs: 1   # Centimeters per second
@@ -9,10 +8,10 @@ UnitSystem:
 
 # Parameters for the task scheduling
 Scheduler:
-  nr_queues:        0        # The number of task queues to use. Use 0  to let the system decide.
-  cell_max_size:    8000000  # Maximal number of interactions per task (this is the default value).
-  cell_sub_size:    8000000  # Maximal number of interactions per sub-task  (this is the default value).
-  cell_split_size:  400      # Maximal number of particles per cell (this is the default value).
+  nr_queues:        0        # (Optional) The number of task queues to use. Use 0  to let the system decide.
+  cell_max_size:    8000000  # (Optional) Maximal number of interactions per task (this is the default value).
+  cell_sub_size:    8000000  # (Optional) Maximal number of interactions per sub-task  (this is the default value).
+  cell_split_size:  400      # (Optional) Maximal number of particles per cell (this is the default value).
 
 # Parameters governing the time integration
 TimeIntegration:
@@ -21,27 +20,51 @@ TimeIntegration:
   dt_min:     1e-6  # The minimal time-step size of the simulation (in internal units).
   dt_max:     1e-2  # The maximal time-step size of the simulation (in internal units).
 
+# Parameters governing the snapshots
+Snapshots:
+  basename:   output      # Common part of the name of output files
+  time_first: 0.          # Time of the first output (in internal units)
+  delta_time: 0.01        # Time difference between consecutive outputs (in internal units)
+  UnitMass_in_cgs:     1  # Unit system for the outputs (Grams)
+  UnitLength_in_cgs:   1  # Unit system for the outputs (Centimeters)
+  UnitVelocity_in_cgs: 1  # Unit system for the outputs (Centimeters per second)
+  UnitCurrent_in_cgs:  1  # Unit system for the outputs (Amperes)
+  UnitTemp_in_cgs:     1  # Unit system for the outputs (Kelvin)
+
+# Parameters governing the conserved quantities statistics
+Statistics:
+  delta_time:          1e-2 # Time between statistics output
+
 # Parameters for the hydrodynamics scheme
 SPH:
-  resolution_eta:        1.2349   # Target smoothing length in units of the mean inter-particle separation (1.2349 == 48Ngbs with the cubic spline kernel).
-  delta_neighbours:      1.       # The tolerance for the targetted number of neighbours.
+  resolution_eta:        1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
+  delta_neighbours:      0.1      # The tolerance for the targeted number of neighbours.
+  max_ghost_iterations:  30       # (Optional) Maximal number of iterations allowed to converge towards the smoothing length.
+  max_smoothing_length:  0.1      # Maximal smoothing length allowed (in internal units).
   CFL_condition:         0.1      # Courant-Friedrich-Levy condition for time integration.
-  max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
-  max_smoothing_length:  3.       # Maximal smoothing length allowed (in internal units).
+  max_volume_change:     2.       # (Optional) Maximal allowed change of kernel volume over one time-step
 
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  SedovBlast/sedov.hdf5 # The file to read
-  h_scaling:  1.                    # A scaling factor to apply to all smoothing lengths in the ICs.
-  shift_x:    0.                    # A shift to apply to all particles read from the ICs (in internal units).
+  h_scaling:  1.                    # (Optional) A scaling factor to apply to all smoothing lengths in the ICs.
+  shift_x:    0.                    # (Optional) A shift to apply to all particles read from the ICs (in internal units).
   shift_y:    0.
   shift_z:    0.
 
 # Parameters govering domain decomposition
 DomainDecomposition:
-  initial_type:       m     # The initial strategy ("g", "m", "w", or "v"). See documentation for details.
-  initial_grid_x:    10     # Grid size if the 'g' strategy is chosen.
+  initial_type:       m     # (Optional) The initial strategy ("g", "m", "w", or "v").
+  initial_grid_x:    10     # (Optional) Grid size if the "g" strategy is chosen.
   initial_grid_y:    10
   initial_grid_z:    10
-  repartition_type:   b     # The re-decomposition strategy ("n", "b", "v", "e" or "x"). See documentation for details.
+  repartition_type:   b     # (Optional) The re-decomposition strategy ("n", "b", "v", "e" or "x").
  
+# Parameters related to external potentials
+  
+# Point mass external potential
+PointMass:
+  position_x:      50.     # location of external point mass in internal units
+  position_y:      50.
+  position_z:      50.
+  mass:            1e10     # mass of external point mass in internal units
diff --git a/examples/plot_tasks.py b/examples/plot_tasks.py
index 895c32ef9c3d1490e6d30b7dc79e40171a228ee9..f2d0aa95d1f35f30476e1989349a07be8d9e5b0a 100755
--- a/examples/plot_tasks.py
+++ b/examples/plot_tasks.py
@@ -35,9 +35,6 @@ import pylab as pl
 import numpy as np
 import sys
 
-#  CPU ticks per second.
-CPU_CLOCK = 2.7e9
-
 #  Basic plot configuration.
 PLOT_PARAMS = {"axes.labelsize": 10,
                "axes.titlesize": 10,
@@ -58,25 +55,29 @@ PLOT_PARAMS = {"axes.labelsize": 10,
 pl.rcParams.update(PLOT_PARAMS)
 
 #  Tasks and subtypes. Indexed as in tasks.h.
-TASKTYPES = ["none", "sort", "self", "pair", "sub", "init", "ghost", "drift", "kick",
-             "send", "recv", "grav_pp", "grav_mm", "grav_up", "grav_down",
-             "part_sort", "gpart_sort", "split_cell", "rewait", "count"]
+TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair", "init", "ghost",
+             "drift", "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm",
+             "grav_up", "grav_down", "grav_external", "part_sort", "gpart_sort",
+             "split_cell", "rewait", "count"]
 
 TASKCOLOURS = {"none": "black",
                "sort": "lightblue",
                "self": "greenyellow",
                "pair": "navy",
-               "sub": "hotpink",
+               "sub_self": "greenyellow",
+               "sub_pair": "navy",
                "init": "indigo",
                "ghost": "cyan",
                "drift": "maroon",
                "kick": "green",
+               "kick_fixdt": "green",
                "send": "yellow",
                "recv": "magenta",
                "grav_pp": "mediumorchid",
                "grav_mm": "mediumturquoise",
                "grav_up": "mediumvioletred",
                "grav_down": "mediumnightblue",
+               "grav_external": "darkred",
                "part_sort": "steelblue",
                "gpart_sort": "teal" ,
                "split_cell": "seagreen",
@@ -106,7 +107,7 @@ infile = sys.argv[1]
 outpng = sys.argv[2]
 delta_t = 0
 if len( sys.argv ) == 4:
-    delta_t = int(sys.argv[3]) * CPU_CLOCK / 1000
+    delta_t = int(sys.argv[3])
 
 #  Read input.
 data = pl.loadtxt( infile )
@@ -114,20 +115,31 @@ data = pl.loadtxt( infile )
 nthread = int(max(data[:,0])) + 1
 print "Number of threads:", nthread
 
+# Recover the start and end time
+full_step = data[0,:]
+tic_step = int(full_step[4])
+toc_step = int(full_step[5])
+CPU_CLOCK = float(full_step[-1])
+data = data[1:,:]
+
+print "CPU frequency:", CPU_CLOCK / 1.e9
+
 # Avoid start and end times of zero.
 data = data[data[:,4] != 0]
 data = data[data[:,5] != 0]
 
-# Calculate the time range, it not given.
+# Calculate the time range, if not given.
+delta_t = delta_t * CPU_CLOCK / 1000
 if delta_t == 0:
     dt = max(data[:,5]) - min(data[:,4])
     if dt > delta_t:
         delta_t = dt
 
 # Once more doing the real gather and plots this time.
-start_t = min(data[:,4])
+start_t = tic_step 
 data[:,4] -= start_t
 data[:,5] -= start_t
+end_t = (toc_step - start_t) / CPU_CLOCK * 1000
 
 tasks = {}
 tasks[-1] = []
@@ -145,7 +157,7 @@ for line in range(num_lines):
     tasks[thread][-1]["tic"] = tic
     tasks[thread][-1]["toc"] = toc
     tasks[thread][-1]["t"] = (toc + tic)/ 2
-
+    
 combtasks = {}
 combtasks[-1] = []
 for i in range(nthread):
@@ -171,11 +183,11 @@ for thread in range(nthread):
             lasttype = task["type"]
         else:
             combtasks[thread][-1]["toc"] = task["toc"]
-
+            
 typesseen = []
 fig = pl.figure()
 ax = fig.add_subplot(1,1,1)
-ax.set_xlim(0, delta_t * 1.03 * 1000 / CPU_CLOCK)
+ax.set_xlim(-delta_t * 0.03 * 1000 / CPU_CLOCK, delta_t * 1.03 * 1000 / CPU_CLOCK)
 ax.set_ylim(0, nthread)
 tictoc = np.zeros(2)
 for i in range(nthread):
@@ -220,6 +232,10 @@ ax.fill_between([0, 0], nthread+0.5, nthread + nrow + 0.5, facecolor="white")
 ax.set_ylim(0, nthread + nrow + 1)
 ax.legend(loc=1, shadow=True, mode="expand", ncol=5)
 
+# Start and end of time-step
+ax.plot([0, 0], [0, nthread + nrow + 1], 'k--', linewidth=1)
+ax.plot([end_t, end_t], [0, nthread + nrow + 1], 'k--', linewidth=1)
+
 ax.set_xlabel("Wall clock time [ms]")
 ax.set_ylabel("Thread ID" )
 ax.set_yticks(pl.array(range(nthread)), True)
diff --git a/examples/plot_tasks_MPI.py b/examples/plot_tasks_MPI.py
index d59fe6417b524b8cb3cf8f6117fca3b8b3f3c780..9a92faf9417c9a302831eb8cb2f4471eb672d59c 100755
--- a/examples/plot_tasks_MPI.py
+++ b/examples/plot_tasks_MPI.py
@@ -41,9 +41,6 @@ import pylab as pl
 import numpy as np
 import sys
 
-#  CPU ticks per second.
-CPU_CLOCK = 2.7e9
-
 #  Basic plot configuration.
 PLOT_PARAMS = {"axes.labelsize": 10,
                "axes.titlesize": 10,
@@ -64,27 +61,31 @@ PLOT_PARAMS = {"axes.labelsize": 10,
 pl.rcParams.update(PLOT_PARAMS)
 
 #  Tasks and subtypes. Indexed as in tasks.h.
-TASKTYPES = ["none", "sort", "self", "pair", "sub", "init", "ghost", "drift", "kick",
-             "send", "recv", "grav_pp", "grav_mm", "grav_up", "grav_down",
-             "part_sort", "gpart_sort", "split_cell", "rewait", "count"]
+TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair", "init", "ghost",
+             "drift", "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm",
+             "grav_up", "grav_down", "grav_external", "part_sort", "gpart_sort",
+             "split_cell", "rewait", "count"]
 
 TASKCOLOURS = {"none": "black",
                "sort": "lightblue",
                "self": "greenyellow",
                "pair": "navy",
-               "sub": "hotpink",
+               "sub_self": "greenyellow",
+               "sub_pair": "navy",
                "init": "indigo",
                "ghost": "cyan",
                "drift": "maroon",
                "kick": "green",
+               "kick_fixdt": "green",
                "send": "yellow",
                "recv": "magenta",
                "grav_pp": "mediumorchid",
                "grav_mm": "mediumturquoise",
                "grav_up": "mediumvioletred",
                "grav_down": "mediumnightblue",
+               "grav_external": "darkred",
                "part_sort": "steelblue",
-               "gpart_sort": "teal",
+               "gpart_sort": "teal" ,
                "split_cell": "seagreen",
                "rewait": "olive",
                "count": "powerblue"}
@@ -113,11 +114,20 @@ infile = sys.argv[1]
 outbase = sys.argv[2]
 delta_t = 0
 if len( sys.argv ) == 4:
-    delta_t = int(sys.argv[3]) * CPU_CLOCK / 1000
-
+    delta_t = int(sys.argv[3])
+    
 #  Read input.
 data = pl.loadtxt( infile )
 
+# Recover the start and end time
+full_step = data[0,:]
+tic_step = int(full_step[5])
+toc_step = int(full_step[6])
+CPU_CLOCK = float(full_step[-1])
+
+print "CPU frequency:", CPU_CLOCK / 1.e9
+
+
 nranks = int(max(data[:,0])) + 1
 print "Number of ranks:", nranks
 nthread = int(max(data[:,1])) + 1
@@ -130,6 +140,7 @@ sdata = sdata[sdata[:,6] != 0]
 # Each rank can have different clock (compute node), but we want to use the
 # same delta times range for comparisons, so we suck it up and take the hit of
 # precalculating this, unless the user knows better.
+delta_t = delta_t * CPU_CLOCK / 1000
 if delta_t == 0:
     for rank in range(nranks):
         data = sdata[sdata[:,0] == rank]
@@ -141,16 +152,22 @@ if delta_t == 0:
 for rank in range(nranks):
     data = sdata[sdata[:,0] == rank]
 
-    start_t = min(data[:,5])
+    full_step = data[0,:]
+    tic_step = int(full_step[5])
+    toc_step = int(full_step[6])
+    data = data[1:,:]
+
+    start_t = tic_step
     data[:,5] -= start_t
     data[:,6] -= start_t
+    end_t = (toc_step - start_t) / CPU_CLOCK * 1000
 
     tasks = {}
     tasks[-1] = []
     for i in range(nthread):
         tasks[i] = []
 
-    num_lines = pl.size(data) / 10
+    num_lines = pl.shape(data)[0]
     for line in range(num_lines):
         thread = int(data[line,1])
         tasks[thread].append({})
@@ -191,7 +208,7 @@ for rank in range(nranks):
     typesseen = []
     fig = pl.figure()
     ax = fig.add_subplot(1,1,1)
-    ax.set_xlim(0, delta_t * 1.03 * 1000 / CPU_CLOCK)
+    ax.set_xlim(-delta_t * 0.03 * 1000 / CPU_CLOCK, delta_t * 1.03 * 1000 / CPU_CLOCK)
     ax.set_ylim(0, nthread)
     tictoc = np.zeros(2)
     for i in range(nthread):
@@ -236,6 +253,10 @@ for rank in range(nranks):
     ax.set_ylim(0, nthread + nrow + 1)
     ax.legend(loc=1, shadow=True, mode="expand", ncol=5)
 
+    # Start and end of time-step
+    ax.plot([0, 0], [0, nthread + nrow + 1], 'k--', linewidth=1)
+    ax.plot([end_t, end_t], [0, nthread + nrow + 1], 'k--', linewidth=1)
+
     ax.set_xlabel("Wall clock time [ms]")
     ax.set_ylabel("Thread ID for MPI Rank " + str(rank) )
     ax.set_yticks(pl.array(range(nthread)), True)
diff --git a/examples/runs_mpi_cv.sh b/examples/runs_mpi_cv.sh
deleted file mode 100755
index a808d9c45afa952669327a793f1f5d7579ced74b..0000000000000000000000000000000000000000
--- a/examples/runs_mpi_cv.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-# Set the number of runs
-FINALTIME=1.
-
-# Cores per node
-CPN=12
-
-# The queue on which to run
-QUEUE=cosma
-PROJECT=durham
-PREFIX=CosmoVolume
-INPUT=$PREFIX/CosmoVolume.hdf5
-
-# Make sure the OMP threads don't go wild
-export OMP_WAIT_POLICY=PASSIVE
-
-# Set the library path so that libmetis is found
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/cosma/home/nnrw56/lib
-
-# Single-node runs
-for cpu in $(seq 1 $CPN)
-do
-    if [ ! -e ${PREFIX}_${QUEUE}_1x${cpu}.dump ]
-    then
-        bsub -oo ${PREFIX}_${QUEUE}_1x${cpu}.dump -q ${QUEUE} -P ${PROJECT} -x -n 1 -R "span[ptile=1]" ./swift -c $FINALTIME -t $cpu -f ${INPUT} -m 0.705 -w 6000 -z 300 -d 1e-7 -e 0.01
-    fi
-done
-
-# Multi-node runs
-if [ ! -e ${PREFIX}_${QUEUE}_2x${cpu}.dump ]
-then
-    bsub -oo ${PREFIX}_${QUEUE}_2x${CPN}.dump -q ${QUEUE} -P ${PROJECT} -x -W 02:00 -n 2 -R "span[ptile=1]" mpirun -np 2 ./swift_mpi -c $FINALTIME -t $CPN -g "2 1 1" -f ${INPUT} -m 0.705 -w 6000 -z 300 -d 1e-7 -e 0.01
-fi
-
-if [ ! -e ${PREFIX}_${QUEUE}_4x${cpu}.dump ]
-then
-    bsub -oo ${PREFIX}_${QUEUE}_4x${CPN}.dump -q ${QUEUE} -P ${PROJECT} -x -W 02:00 -n 4 -R "span[ptile=1]" mpirun -np 4 ./swift_mpi -c $FINALTIME -t $CPN -g "2 2 1" -f ${INPUT} -m 0.705 -w 6000 -z 300 -d 1e-7 -e 0.01
-fi
-
-if [ ! -e ${PREFIX}_${QUEUE}_8x${cpu}.dump ]
-then
-    bsub -oo ${PREFIX}_${QUEUE}_8x${CPN}.dump -q ${QUEUE} -P ${PROJECT} -x -W 02:00 -n 8 -R "span[ptile=1]" mpirun -np 8 ./swift_mpi -c $FINALTIME -t $CPN -g "2 2 2" -f ${INPUT} -m 0.705 -w 6000 -z 300 -d 1e-7 -e 0.01
-fi
-
-if [ ! -e ${PREFIX}_${QUEUE}_16x${cpu}.dump ]
-then
-    bsub -oo ${PREFIX}_${QUEUE}_16x${CPN}.dump -q ${QUEUE} -P ${PROJECT} -x -W 02:00 -n 16 -R "span[ptile=1]" mpirun -np 16 ./swift_mpi -c $FINALTIME -t $CPN -g "4 2 2" -f ${INPUT} -m 0.705 -w 6000 -z 300 -d 1e-7 -e 0.01
-fi
-
-if [ ! -e ${PREFIX}_${QUEUE}_32x${cpu}.dump ]
-then
-    bsub -oo ${PREFIX}_${QUEUE}_32x${CPN}.dump -q ${QUEUE} -P ${PROJECT} -x -W 02:00 -n 32 -R "span[ptile=1]" mpirun -np 32 ./swift_mpi -c $FINALTIME -t $CPN -g "4 4 2" -f ${INPUT} -m 0.705 -w 6000 -z 300 -d 1e-7 -e 0.01
-fi
-
-if [ ! -e ${PREFIX}_${QUEUE}_64x${cpu}.dump ]
-then
-    bsub -oo ${PREFIX}_${QUEUE}_64x${CPN}.dump -q ${QUEUE} -P ${PROJECT} -x -W 02:00 -n 64 -R "span[ptile=1]" mpirun -np 64 ./swift_mpi -c $FINALTIME -t $CPN -g "4 4 4" -f ${INPUT} -m 0.705 -w 6000 -z 300 -d 1e-7 -e 0.01
-fi
-
-if [ ! -e ${PREFIX}_${QUEUE}_128x${cpu}.dump ]
-then
-    bsub -oo ${PREFIX}_${QUEUE}_128x${CPN}.dump -q ${QUEUE} -P ${PROJECT} -x -W 02:00 -n 128 -R "span[ptile=1]" mpirun -np 128 ./swift_mpi -c $FINALTIME -t $CPN -g "8 4 4" -f ${INPUT} -m 0.705 -w 6000 -z 300 -d 1e-7 -e 0.01
-fi
-
diff --git a/format.sh b/format.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5bf78c5bb663d5cf22178bff868912a059c5c4fe
--- /dev/null
+++ b/format.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+clang-format-3.8 -style=file -i src/*.[ch] src/*/*.[ch] src/*/*/*.[ch] examples/main.c tests/*.[ch]
diff --git a/src/Makefile.am b/src/Makefile.am
index a96f35b3cf0d8a23aec4f8c0f8d16bec8638cbcd..21b5ef25ca21202caaa9107bfbf09e62aa66011d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,4 +1,3 @@
-
 # This file is part of SWIFT.
 # Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
 #                    Matthieu Schaller (matthieu.schaller@durham.ac.uk).
@@ -17,10 +16,10 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 # Add the debug flag to the whole thing
-AM_CFLAGS = -DTIMER -DCOUNTER $(HDF5_CPPFLAGS)
+AM_CFLAGS = -DTIMER $(HDF5_CPPFLAGS)
 
 # Assign a "safe" version number
-AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0 # -fsanitize=address
+AM_LDFLAGS = $(HDF5_LDFLAGS) -version-info 0:0:0
 
 # The git command, if available.
 GIT_CMD = @GIT_CMD@
@@ -35,17 +34,20 @@ endif
 # List required headers
 include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
     engine.h swift.h serial_io.h timers.h debug.h scheduler.h proxy.h parallel_io.h \
-    common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h
+    common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h \
+    physical_constants.h physical_constants_cgs.h potentials.h version.h hydro_properties.h
 
 # Common source files
 AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
     serial_io.c timers.c debug.c scheduler.c proxy.c parallel_io.c \
     units.c common_io.c single_io.c multipole.c version.c map.c \
-    kernel_hydro.c kernel_gravity.c tools.c part.c partition.c clocks.c parser.c
+    kernel_hydro.c kernel_gravity.c tools.c part.c partition.c clocks.c parser.c \
+    physical_constants.c potentials.c hydro_properties.c
 
 # Include files for distribution, not installation.
 nobase_noinst_HEADERS = approx_math.h atomic.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \
-		 vector.h runner_doiact.h runner_doiact_grav.h units.h intrinsics.h minmax.h \
+		 vector.h runner_doiact.h runner_doiact_grav.h units.h intrinsics.h minmax.h kick.h \
+		 timestep.h drift.h \
 		 gravity.h gravity_io.h \
 		 gravity/Default/gravity.h gravity/Default/gravity_iact.h gravity/Default/gravity_io.h \
 		 gravity/Default/gravity_debug.h gravity/Default/gravity_part.h  \
@@ -67,30 +69,31 @@ libswiftsim_la_SOURCES = $(AM_SOURCES)
 # Sources and flags for MPI library
 libswiftsim_mpi_la_SOURCES = $(AM_SOURCES)
 libswiftsim_mpi_la_CFLAGS = $(AM_CFLAGS) -DWITH_MPI $(METIS_INCS)
+libswiftsim_mpi_la_LDFLAGS = $(AM_LDFLAGS) $(METIS_LIBS)
 libswiftsim_mpi_la_SHORTNAME = mpi
 
 
-# Versioning. If any sources change then update the version.h file with
+# Versioning. If any sources change then update the version_string.h file with
 # the current git revision and package version.
-# May have a checkout without a version.h file and no git command (tar/zip
+# May have a checkout without a version_string.h file and no git command (tar/zip
 # download), allow that, but make sure we know it.
-version.h: version.h.in $(AM_SOURCES) $(include_HEADERS) $(noinst_HEADERS)
+version_string.h: version_string.h.in $(AM_SOURCES) $(include_HEADERS) $(noinst_HEADERS)
 	if test "X$(GIT_CMD)" != "X"; then \
 	    GIT_REVISION=`$(GIT_CMD) describe --abbrev=8  --always --tags --dirty`; \
 	    GIT_BRANCH=`$(GIT_CMD) branch | sed -n 's/^\* \(.*\)/\1/p'`; \
 	    sed -e "s,@PACKAGE_VERSION\@,$(PACKAGE_VERSION)," \
 	        -e "s,@GIT_REVISION\@,$${GIT_REVISION}," \
-	        -e "s|@GIT_BRANCH\@|$${GIT_BRANCH}|" version.h.in > version.h; \
+	        -e "s|@GIT_BRANCH\@|$${GIT_BRANCH}|" version_string.h.in > version_string.h; \
 	else \
-	    if test ! -f version.h; then \
+	    if test ! -f version_string.h; then \
 	        sed -e "s,@PACKAGE_VERSION\@,$(PACKAGE_VERSION)," \
 	            -e "s,@GIT_REVISION\@,unknown," \
-		    -e "s,@GIT_BRANCH\@,unknown," version.h.in > version.h; \
+		    -e "s,@GIT_BRANCH\@,unknown," version_string.h.in > version_string.h; \
 	    fi; \
 	fi
 
-#  Make sure version.h is built first.
-BUILT_SOURCES = version.h
+#  Make sure version_string.h is built first.
+BUILT_SOURCES = version_string.h
 
 #  And distribute the built files.
-EXTRA_DIST = version.h version.h.in
+EXTRA_DIST = version_string.h version_string.h.in
diff --git a/src/approx_math.h b/src/approx_math.h
index ef93ea63c383c74caa3eaff65446962872389a35..cbca602b3fafcc5044b0939b2207b8f9d50a7446 100644
--- a/src/approx_math.h
+++ b/src/approx_math.h
@@ -19,6 +19,8 @@
 #ifndef SWIFT_APPROX_MATH_H
 #define SWIFT_APPROX_MATH_H
 
+#include "inline.h"
+
 /**
  * @brief Approximate version of expf(x) using a 4th order Taylor expansion
  *
diff --git a/src/atomic.h b/src/atomic.h
index 818d210e60a7aacdf61d12b60623ce87e62c9ed2..0b87a0f77e17bafc64a2a59b3c70bda782fc14d4 100644
--- a/src/atomic.h
+++ b/src/atomic.h
@@ -26,5 +26,6 @@
 #define atomic_inc(v) atomic_add(v, 1)
 #define atomic_dec(v) atomic_add(v, -1)
 #define atomic_cas(v, o, n) __sync_val_compare_and_swap(v, o, n)
+#define atomic_swap(v, n) __sync_lock_test_and_set(v, n)
 
 #endif /* SWIFT_ATOMIC_H */
diff --git a/src/cell.c b/src/cell.c
index 31a632a5b40a7706eeef6accc385d57e27f0f247..c41d62f05a1e40daf52851e6556bfa8f11f16dd7 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -1,6 +1,10 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -47,6 +51,7 @@
 #include "error.h"
 #include "gravity.h"
 #include "hydro.h"
+#include "hydro_properties.h"
 #include "space.h"
 #include "timers.h"
 
@@ -114,7 +119,7 @@ int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) {
       if (k & 1) temp->loc[2] += temp->h[2];
       temp->depth = c->depth + 1;
       temp->split = 0;
-      temp->dx_max = 0.0;
+      temp->dx_max = 0.f;
       temp->nodeID = c->nodeID;
       temp->parent = c;
       c->progeny[k] = temp;
@@ -264,7 +269,7 @@ int cell_locktree(struct cell *c) {
     /* Undo the holds up to finger. */
     for (struct cell *finger2 = c->parent; finger2 != finger;
          finger2 = finger2->parent)
-      __sync_fetch_and_sub(&finger2->hold, 1);
+      atomic_dec(&finger2->hold);
 
     /* Unlock this cell. */
     if (lock_unlock(&c->lock) != 0) error("Failed to unlock cell.");
@@ -304,7 +309,7 @@ int cell_glocktree(struct cell *c) {
     if (lock_trylock(&finger->glock) != 0) break;
 
     /* Increment the hold. */
-    __sync_fetch_and_add(&finger->ghold, 1);
+    atomic_inc(&finger->ghold);
 
     /* Unlock the cell. */
     if (lock_unlock(&finger->glock) != 0) error("Failed to unlock cell.");
@@ -322,7 +327,7 @@ int cell_glocktree(struct cell *c) {
     /* Undo the holds up to finger. */
     for (struct cell *finger2 = c->parent; finger2 != finger;
          finger2 = finger2->parent)
-      __sync_fetch_and_sub(&finger2->ghold, 1);
+      atomic_dec(&finger2->ghold);
 
     /* Unlock this cell. */
     if (lock_unlock(&c->glock) != 0) error("Failed to unlock cell.");
@@ -348,7 +353,7 @@ void cell_unlocktree(struct cell *c) {
 
   /* Climb up the tree and unhold the parents. */
   for (struct cell *finger = c->parent; finger != NULL; finger = finger->parent)
-    __sync_fetch_and_sub(&finger->hold, 1);
+    atomic_dec(&finger->hold);
 
   TIMER_TOC(timer_locktree);
 }
@@ -362,7 +367,7 @@ void cell_gunlocktree(struct cell *c) {
 
   /* Climb up the tree and unhold the parents. */
   for (struct cell *finger = c->parent; finger != NULL; finger = finger->parent)
-    __sync_fetch_and_sub(&finger->ghold, 1);
+    atomic_dec(&finger->ghold);
 
   TIMER_TOC(timer_locktree);
 }
@@ -403,12 +408,14 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset) {
       xparts[j] = xtemp;
     }
   }
-  /* for ( k = 0 ; k <= j ; k++ )
-      if ( parts[k].x[0] > pivot[0] )
-          error( "cell_split: sorting failed." );
-  for ( k = i ; k < count ; k++ )
-      if ( parts[k].x[0] < pivot[0] )
-          error( "cell_split: sorting failed." ); */
+
+#ifdef SWIFT_DEBUG_CHECKS
+  for (int k = 0; k <= j; k++)
+    if (parts[k].x[0] > pivot[0]) error("cell_split: sorting failed.");
+  for (int k = i; k < count; k++)
+    if (parts[k].x[0] < pivot[0]) error("cell_split: sorting failed.");
+#endif
+
   left[1] = i;
   right[1] = count - 1;
   left[0] = 0;
@@ -430,14 +437,17 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset) {
         xparts[j] = xtemp;
       }
     }
-    /* for ( int kk = left[k] ; kk <= j ; kk++ )
-        if ( parts[kk].x[1] > pivot[1] ) {
-            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-            error( "sorting failed (left)." );
-            }
-    for ( int kk = i ; kk <= right[k] ; kk++ )
-        if ( parts[kk].x[1] < pivot[1] )
-            error( "sorting failed (right)." ); */
+
+#ifdef SWIFT_DEBUG_CHECKS
+    for (int kk = left[k]; kk <= j; kk++)
+      if (parts[kk].x[1] > pivot[1]) {
+        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
+        error("sorting failed (left).");
+      }
+    for (int kk = i; kk <= right[k]; kk++)
+      if (parts[kk].x[1] < pivot[1]) error("sorting failed (right).");
+#endif
+
     left[2 * k + 1] = i;
     right[2 * k + 1] = right[k];
     left[2 * k] = left[k];
@@ -460,16 +470,20 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset) {
         xparts[j] = xtemp;
       }
     }
-    /* for ( int kk = left[k] ; kk <= j ; kk++ )
-        if ( parts[kk].x[2] > pivot[2] ) {
-            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-            error( "sorting failed (left)." );
-            }
-    for ( int kk = i ; kk <= right[k] ; kk++ )
-        if ( parts[kk].x[2] < pivot[2] ) {
-            message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j );
-            error( "sorting failed (right)." );
-            } */
+
+#ifdef SWIFT_DEBUG_CHECKS
+    for (int kk = left[k]; kk <= j; kk++)
+      if (parts[kk].x[2] > pivot[2]) {
+        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
+        error("sorting failed (left).");
+      }
+    for (int kk = i; kk <= right[k]; kk++)
+      if (parts[kk].x[2] < pivot[2]) {
+        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
+        error("sorting failed (right).");
+      }
+#endif
+
     left[2 * k + 1] = i;
     right[2 * k + 1] = right[k];
     left[2 * k] = left[k];
@@ -486,32 +500,34 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset) {
   /* Re-link the gparts. */
   part_relink_gparts(parts, count, parts_offset);
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that _all_ the parts have been assigned to a cell. */
-  /* for ( k = 1 ; k < 8 ; k++ )
-      if ( &c->progeny[k-1]->parts[ c->progeny[k-1]->count ] !=
-  c->progeny[k]->parts )
-          error( "Particle sorting failed (internal consistency)." );
-  if ( c->progeny[0]->parts != c->parts )
-      error( "Particle sorting failed (left edge)." );
-  if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ count ] )
-      error( "Particle sorting failed (right edge)." ); */
+  for (int k = 1; k < 8; k++)
+    if (&c->progeny[k - 1]->parts[c->progeny[k - 1]->count] !=
+        c->progeny[k]->parts)
+      error("Particle sorting failed (internal consistency).");
+  if (c->progeny[0]->parts != c->parts)
+    error("Particle sorting failed (left edge).");
+  if (&c->progeny[7]->parts[c->progeny[7]->count] != &c->parts[count])
+    error("Particle sorting failed (right edge).");
 
   /* Verify a few sub-cells. */
-  /* for (int k = 0 ; k < c->progeny[0]->count ; k++ )
-      if ( c->progeny[0]->parts[k].x[0] > pivot[0] ||
-           c->progeny[0]->parts[k].x[1] > pivot[1] ||
-           c->progeny[0]->parts[k].x[2] > pivot[2] )
-          error( "Sorting failed (progeny=0)." );
-  for (int k = 0 ; k < c->progeny[1]->count ; k++ )
-      if ( c->progeny[1]->parts[k].x[0] > pivot[0] ||
-           c->progeny[1]->parts[k].x[1] > pivot[1] ||
-           c->progeny[1]->parts[k].x[2] <= pivot[2] )
-          error( "Sorting failed (progeny=1)." );
-  for (int k = 0 ; k < c->progeny[2]->count ; k++ )
-      if ( c->progeny[2]->parts[k].x[0] > pivot[0] ||
-           c->progeny[2]->parts[k].x[1] <= pivot[1] ||
-           c->progeny[2]->parts[k].x[2] > pivot[2] )
-          error( "Sorting failed (progeny=2)." ); */
+  for (int k = 0; k < c->progeny[0]->count; k++)
+    if (c->progeny[0]->parts[k].x[0] > pivot[0] ||
+        c->progeny[0]->parts[k].x[1] > pivot[1] ||
+        c->progeny[0]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=0).");
+  for (int k = 0; k < c->progeny[1]->count; k++)
+    if (c->progeny[1]->parts[k].x[0] > pivot[0] ||
+        c->progeny[1]->parts[k].x[1] > pivot[1] ||
+        c->progeny[1]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=1).");
+  for (int k = 0; k < c->progeny[2]->count; k++)
+    if (c->progeny[2]->parts[k].x[0] > pivot[0] ||
+        c->progeny[2]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[2]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=2).");
+#endif
 
   /* Now do the same song and dance for the gparts. */
 
diff --git a/src/cell.h b/src/cell.h
index 8b65fa1904a4aa407a15bc30954651dc5c4e29e5..d5e80b724f1fdea17ff04ca38d62f6c25f3c49f7 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -1,6 +1,10 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -26,8 +30,9 @@
 #include "lock.h"
 #include "multipole.h"
 #include "part.h"
+#include "task.h"
 
-/* Forward declaration of space, needed for cell_unpack. */
+/* Avoid cyclic inclusions */
 struct space;
 
 /* Max tag size set to 2^29 to take into account some MPI implementations
@@ -116,7 +121,7 @@ struct cell {
   struct link *density, *force, *grav;
   int nr_density, nr_force, nr_grav;
 
-  /* The ghost task to link density to interactions. */
+  /* The hierarchical tasks. */
   struct task *ghost, *init, *drift, *kick;
 
   /* Task receiving data. */
@@ -125,6 +130,9 @@ struct cell {
   /* Tasks for gravity tree. */
   struct task *grav_up, *grav_down;
 
+  /* Task for external gravity */
+  struct task *grav_external;
+
   /* Number of tasks that are associated with this cell. */
   int nr_tasks;
 
@@ -132,13 +140,13 @@ struct cell {
   int hold, ghold;
 
   /* Spin lock for various uses. */
-  lock_type lock, glock;
+  swift_lock_type lock, glock;
 
   /* ID of the previous owner, e.g. runner. */
   int owner;
 
   /* Momentum of particles in cell. */
-  float mom[3], ang[3];
+  double mom[3], ang_mom[3];
 
   /* Mass, potential, internal  and kinetic energy of particles in this cell. */
   double mass, e_pot, e_int, e_kin;
diff --git a/src/common_io.c b/src/common_io.c
index 8cdabfc5d6b8439de26a07cd7fb9d2ecfdb1dc0c..971fe6b01c682c489d3444ec1d47d6a902250bb8 100644
--- a/src/common_io.c
+++ b/src/common_io.c
@@ -43,6 +43,8 @@
 #include "const.h"
 #include "error.h"
 #include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 #include "version.h"
 
 const char* particle_type_names[NUM_PARTICLE_TYPES] = {
@@ -147,8 +149,8 @@ void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data) {
  *
  * Calls #error() if an error occurs.
  */
-void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data,
-                    int num) {
+void writeAttribute(hid_t grp, const char* name, enum DATA_TYPE type,
+                    void* data, int num) {
   hid_t h_space = 0, h_attr = 0, h_err = 0;
   hsize_t dim[1] = {num};
 
@@ -186,7 +188,8 @@ void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data,
  *
  * Calls #error() if an error occurs.
  */
-void writeStringAttribute(hid_t grp, char* name, const char* str, int length) {
+void writeStringAttribute(hid_t grp, const char* name, const char* str,
+                          int length) {
   hid_t h_space = 0, h_attr = 0, h_err = 0, h_type = 0;
 
   h_space = H5Screate(H5S_SCALAR);
@@ -225,7 +228,7 @@ void writeStringAttribute(hid_t grp, char* name, const char* str, int length) {
  * @param name The name of the attribute
  * @param data The value to write
  */
-void writeAttribute_d(hid_t grp, char* name, double data) {
+void writeAttribute_d(hid_t grp, const char* name, double data) {
   writeAttribute(grp, name, DOUBLE, &data, 1);
 }
 
@@ -235,7 +238,7 @@ void writeAttribute_d(hid_t grp, char* name, double data) {
  * @param name The name of the attribute
  * @param data The value to write
  */
-void writeAttribute_f(hid_t grp, char* name, float data) {
+void writeAttribute_f(hid_t grp, const char* name, float data) {
   writeAttribute(grp, name, FLOAT, &data, 1);
 }
 
@@ -246,7 +249,7 @@ void writeAttribute_f(hid_t grp, char* name, float data) {
  * @param data The value to write
  */
 
-void writeAttribute_i(hid_t grp, char* name, int data) {
+void writeAttribute_i(hid_t grp, const char* name, int data) {
   writeAttribute(grp, name, INT, &data, 1);
 }
 
@@ -256,7 +259,7 @@ void writeAttribute_i(hid_t grp, char* name, int data) {
  * @param name The name of the attribute
  * @param data The value to write
  */
-void writeAttribute_l(hid_t grp, char* name, long data) {
+void writeAttribute_l(hid_t grp, const char* name, long data) {
   writeAttribute(grp, name, LONG, &data, 1);
 }
 
@@ -266,7 +269,7 @@ void writeAttribute_l(hid_t grp, char* name, long data) {
  * @param name The name of the attribute
  * @param str The string to write
  */
-void writeAttribute_s(hid_t grp, char* name, const char* str) {
+void writeAttribute_s(hid_t grp, const char* name, const char* str) {
   writeStringAttribute(grp, name, str, strlen(str));
 }
 
@@ -335,11 +338,15 @@ void writeCodeDescription(hid_t h_file) {
  *
  * @todo Use a proper XML library to avoid stupid copies.
  */
-FILE* prepareXMFfile() {
+FILE* prepareXMFfile(const char* baseName) {
   char buffer[1024];
 
-  FILE* xmfFile = fopen("output.xmf", "r");
-  FILE* tempFile = fopen("output_temp.xmf", "w");
+  char fileName[FILENAME_BUFFER_SIZE];
+  char tempFileName[FILENAME_BUFFER_SIZE];
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "%s.xmf", baseName);
+  snprintf(tempFileName, FILENAME_BUFFER_SIZE, "%s_temp.xmf", baseName);
+  FILE* xmfFile = fopen(fileName, "r");
+  FILE* tempFile = fopen(tempFileName, "w");
 
   if (xmfFile == NULL) error("Unable to open current XMF file.");
 
@@ -355,8 +362,8 @@ FILE* prepareXMFfile() {
   fclose(xmfFile);
 
   /* We then copy the XMF file back up to the closing lines */
-  xmfFile = fopen("output.xmf", "w");
-  tempFile = fopen("output_temp.xmf", "r");
+  xmfFile = fopen(fileName, "w");
+  tempFile = fopen(tempFileName, "r");
 
   if (xmfFile == NULL) error("Unable to open current XMF file.");
 
@@ -369,7 +376,7 @@ FILE* prepareXMFfile() {
   }
   fprintf(xmfFile, "\n");
   fclose(tempFile);
-  remove("output_temp.xmf");
+  remove(tempFileName);
 
   return xmfFile;
 }
@@ -380,8 +387,11 @@ FILE* prepareXMFfile() {
  * @todo Exploit the XML nature of the XMF format to write a proper XML writer
  *and simplify all the XMF-related stuff.
  */
-void createXMFfile() {
-  FILE* xmfFile = fopen("output.xmf", "w");
+void createXMFfile(const char* baseName) {
+
+  char fileName[FILENAME_BUFFER_SIZE];
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "%s.xmf", baseName);
+  FILE* xmfFile = fopen(fileName, "w");
 
   fprintf(xmfFile, "<?xml version=\"1.0\" ?> \n");
   fprintf(xmfFile, "<!DOCTYPE Xdmf SYSTEM \"Xdmf.dtd\" []> \n");
@@ -508,7 +518,8 @@ void prepare_dm_gparts(struct gpart* const gparts, size_t Ndm) {
   for (size_t i = 0; i < Ndm; ++i) {
     /* 0 or negative ids are not allowed */
     if (gparts[i].id_or_neg_offset <= 0)
-      error("0 or negative ID for DM particle %zd: ID=%lld", i, gparts[i].id_or_neg_offset);
+      error("0 or negative ID for DM particle %zd: ID=%lld", i,
+            gparts[i].id_or_neg_offset);
   }
 }
 
diff --git a/src/common_io.h b/src/common_io.h
index b7f3a1a317d69937dde8692eead8f00c75649477..fa6811a26671a2ed85c12ada7bb380094f55795d 100644
--- a/src/common_io.h
+++ b/src/common_io.h
@@ -23,13 +23,11 @@
 /* Config parameters. */
 #include "../config.h"
 
-/* Includes. */
-#include "kernel_hydro.h"
+#if defined(HAVE_HDF5)
+
 #include "part.h"
 #include "units.h"
 
-#if defined(HAVE_HDF5)
-
 /**
  * @brief The different types of data used in the GADGET IC files.
  *
@@ -52,10 +50,7 @@ enum DATA_TYPE {
  *start a run or optional.
  *
  */
-enum DATA_IMPORTANCE {
-  COMPULSORY = 1,
-  OPTIONAL = 0
-};
+enum DATA_IMPORTANCE { COMPULSORY = 1, OPTIONAL = 0 };
 
 /**
  * @brief The different particle types present in a GADGET IC file
@@ -88,17 +83,17 @@ void duplicate_hydro_gparts(struct part* const parts,
 
 void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data);
 
-void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data,
-                    int num);
+void writeAttribute(hid_t grp, const char* name, enum DATA_TYPE type,
+                    void* data, int num);
 
-void writeAttribute_d(hid_t grp, char* name, double data);
-void writeAttribute_f(hid_t grp, char* name, float data);
-void writeAttribute_i(hid_t grp, char* name, int data);
-void writeAttribute_l(hid_t grp, char* name, long data);
-void writeAttribute_s(hid_t grp, char* name, const char* str);
+void writeAttribute_d(hid_t grp, const char* name, double data);
+void writeAttribute_f(hid_t grp, const char* name, float data);
+void writeAttribute_i(hid_t grp, const char* name, int data);
+void writeAttribute_l(hid_t grp, const char* name, long data);
+void writeAttribute_s(hid_t grp, const char* name, const char* str);
 
-void createXMFfile();
-FILE* prepareXMFfile();
+void createXMFfile(const char* baseName);
+FILE* prepareXMFfile(const char* baseName);
 void writeXMFoutputheader(FILE* xmfFile, char* hdfFileName, float time);
 void writeXMFoutputfooter(FILE* xmfFile, int outputCount, float time);
 void writeXMFgroupheader(FILE* xmfFile, char* hdfFileName, size_t N,
@@ -108,9 +103,8 @@ void writeXMFline(FILE* xmfFile, char* fileName, char* partTypeGroupName,
                   char* name, size_t N, int dim, enum DATA_TYPE type);
 
 void writeCodeDescription(hid_t h_file);
-void writeSPHflavour(hid_t h_file);
 void writeUnitSystem(hid_t h_file, struct UnitSystem* us);
 
-#endif
+#endif /* defined HDF5 */
 
 #endif /* SWIFT_COMMON_IO_H */
diff --git a/src/const.h b/src/const.h
index 6a52ec4796a4904629a57ffa8b32a3107bde263e..673aa3bf0093c1c426938a384fd017c1c1d18310 100644
--- a/src/const.h
+++ b/src/const.h
@@ -24,8 +24,7 @@
 #define const_hydro_gamma (5.0f / 3.0f)
 
 /* SPH Viscosity constants. */
-#define const_viscosity_alpha \
-  0.8f /* Used in the legacy gadget-2 SPH mode only */
+#define const_viscosity_alpha 0.8f
 #define const_viscosity_alpha_min \
   0.1f /* Values taken from (Price,2004), not used in legacy gadget mode */
 #define const_viscosity_alpha_max \
@@ -38,36 +37,30 @@
   1.f /* Value taken from (Price,2008), not used in legacy gadget mode */
 
 /* Time integration constants. */
-#define const_cfl 0.1f
-#define const_ln_max_h_change                                           \
-  0.231111721f /* Particle can't change volume by more than a factor of \
-                  2=1.26^3 over one time step */
 #define const_max_u_change 0.1f
 
-/* Neighbour search constants. */
-#define const_eta_kernel \
-  1.2349f /* Corresponds to 48 ngbs with the cubic spline kernel */
-#define const_delta_nwneigh 0.1f
-#define const_smoothing_max_iter 30
-#define CUBIC_SPLINE_KERNEL
-
 /* Gravity stuff. */
-#define const_theta_max                                   \
-  0.57735f /* Opening criteria, which is the ratio of the \
-              cell distance over the cell width. */
+#define const_theta_max 0.57735f
+#define const_G 6.672e-8f     /* Gravitational constant. */
+#define const_epsilon 0.0014f /* Gravity blending distance. */
 
-#define const_G 6.672e-8f             /* Gravitational constant. */
-#define const_epsilon 0.0014f         /* Gravity blending distance. */
-#define const_iepsilon 714.285714286f /* Inverse gravity blending distance. */
-#define const_iepsilon2 (const_iepsilon* const_iepsilon)
-#define const_iepsilon3 (const_iepsilon2* const_iepsilon)
-#define const_iepsilon4 (const_iepsilon2* const_iepsilon2)
-#define const_iepsilon5 (const_iepsilon3* const_iepsilon2)
-#define const_iepsilon6 (const_iepsilon3* const_iepsilon3)
+/* Kernel function to use */
+#define CUBIC_SPLINE_KERNEL
+//#define QUARTIC_SPLINE_KERNEL
+//#define QUINTIC_SPLINE_KERNEL
+//#define WENDLAND_C2_KERNEL
+//#define WENDLAND_C4_KERNEL
+//#define WENDLAND_C6_KERNEL
 
 /* SPH variant to use */
 //#define MINIMAL_SPH
 #define GADGET2_SPH
 //#define DEFAULT_SPH
 
+/* Gravity properties */
+#define EXTERNAL_POTENTIAL_POINTMASS
+
+/* Are we debugging ? */
+//#define SWIFT_DEBUG_CHECKS
+
 #endif /* SWIFT_CONST_H */
diff --git a/src/cycle.h b/src/cycle.h
index 1278c83e8b43324662bdeb0de75eec08faf4fd82..4925808f5a4c4ea7828bc1a6b7a9490d2d2ca255 100644
--- a/src/cycle.h
+++ b/src/cycle.h
@@ -334,7 +334,7 @@ typedef unsigned __int64 ticks;
 extern "C"
 #endif
     ticks
-        __getReg(int whichReg);
+    __getReg(int whichReg);
 #pragma intrinsic(__getReg)
 
 static __inline ticks getticks(void) {
@@ -481,9 +481,9 @@ INLINE_ELAPSED(inline)
 /* MIPS ZBus */
 #if HAVE_MIPS_ZBUS_TIMER
 #if defined(__mips__) && !defined(HAVE_TICK_COUNTER)
+#include <fcntl.h>
 #include <sys/mman.h>
 #include <unistd.h>
-#include <fcntl.h>
 
 typedef uint64_t ticks;
 
diff --git a/src/debug.c b/src/debug.c
index c4d7c5c62d3d1be268395a6c6deab5517cf7aaab..0e0bcdc65da21c5bb6b69ee2066bad538b316f6e 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -20,12 +20,20 @@
  *
  ******************************************************************************/
 
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <stdio.h>
 
+/* This object's header. */
+#include "debug.h"
+
+/* Local includes. */
 #include "config.h"
 #include "const.h"
+#include "inline.h"
 #include "part.h"
-#include "debug.h"
 
 /* Import the right hydro definition */
 #if defined(MINIMAL_SPH)
@@ -52,8 +60,8 @@
  * (Should be used for debugging only as it runs in O(N).)
  */
 
-void printParticle(const struct part *parts, struct xpart *xparts,
-                   long long int id, size_t N) {
+void printParticle(const struct part *parts, struct xpart *xparts, long long int id,
+                   size_t N) {
 
   int found = 0;
 
@@ -69,8 +77,18 @@ void printParticle(const struct part *parts, struct xpart *xparts,
   if (!found) printf("## Particles[???] id=%lld not found\n", id);
 }
 
-void printgParticle(const struct gpart *gparts, const struct part *parts,
-                    long long int id, size_t N) {
+/**
+ * @brief Looks for the g-particle with the given id and prints its information
+ * to the standard output.
+ *
+ * @param gparts The array of g-particles.
+ * @param parts The array of particles.
+ * @param id The id to look for.
+ * @param N The size of the array of g-particles.
+ *
+ * (Should be used for debugging only as it runs in O(N).)
+ */
+void printgParticle(const struct gpart *gparts, const struct part *parts, long long int id, size_t N) {
 
   int found = 0;
 
@@ -97,13 +115,25 @@ void printgParticle(const struct gpart *gparts, const struct part *parts,
  *
  * @param p The particle to print
  * @param xp The extended data ot the particle to print
- *
  */
 
 void printParticle_single(const struct part *p, const struct xpart *xp) {
 
   printf("## Particle: id=%lld", p->id);
   hydro_debug_particle(p, xp);
+  printf("\n");
+}
+
+/**
+ * @brief Prints the details of a given particle to stdout
+ *
+ * @param gp The g-particle to print
+ */
+void printgParticle_single(struct gpart *gp) {
+
+  printf("## g-Particle: id=%lld ", gp->id_or_neg_offset);
+  gravity_debug_particle(gp);
+  printf("\n");
 }
 
 #ifdef HAVE_METIS
diff --git a/src/debug.h b/src/debug.h
index 585bf4ceb999638a7b8f5edbd73a8961c3325dfb..367241201977d9b79a8c2913dbae5d08f1148529 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -28,6 +28,7 @@ void printParticle(const struct part *parts, struct xpart *xparts,
 void printgParticle(const struct gpart *gparts, const struct part *parts,
                     long long int id, size_t N);
 void printParticle_single(const struct part *p, const struct xpart *xp);
+void printgParticle_single(struct gpart *gp);
 
 #ifdef HAVE_METIS
 #include "metis.h"
diff --git a/src/drift.h b/src/drift.h
new file mode 100644
index 0000000000000000000000000000000000000000..05b09bb7910e8cddaf9fb24bb248f120f9db9eea
--- /dev/null
+++ b/src/drift.h
@@ -0,0 +1,101 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_DRIFT_H
+#define SWIFT_DRIFT_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Local headers. */
+#include "const.h"
+#include "debug.h"
+#include "hydro.h"
+
+/**
+ * @brief Drift a #gpart: advance x by v_full * dt and update its x_diff.
+ *
+ * @param gp The #gpart to drift (x and x_diff are modified in place).
+ * @param dt The drift time-step
+ * @param timeBase The minimal allowed time-step size (not used in this body).
+ * @param ti_old Integer start of time-step (not used in this body).
+ * @param ti_current Integer end of time-step (not used in this body).
+ */
+__attribute__((always_inline)) INLINE static void drift_gpart(
+    struct gpart* gp, float dt, double timeBase, int ti_old, int ti_current) {
+  /* Drift the position using v_full */
+  gp->x[0] += gp->v_full[0] * dt;
+  gp->x[1] += gp->v_full[1] * dt;
+  gp->x[2] += gp->v_full[2] * dt;
+
+  /* Compute offset since last cell construction (minus the displacement) */
+  gp->x_diff[0] -= gp->v_full[0] * dt;
+  gp->x_diff[1] -= gp->v_full[1] * dt;
+  gp->x_diff[2] -= gp->v_full[2] * dt;
+}
+
+/**
+ * @brief Drift a #part: advance x, predict v, h and rho, and update x_diff.
+ *
+ * @param p The #part to drift (x, v, h and rho are modified in place).
+ * @param xp The #xpart of the particle (supplies v_full, holds x_diff).
+ * @param dt The drift time-step
+ * @param timeBase Minimal allowed time-step size (for hydro_predict_extra).
+ * @param ti_old Integer start of time-step (for hydro_predict_extra).
+ * @param ti_current Integer end of time-step (for hydro_predict_extra).
+ */
+__attribute__((always_inline)) INLINE static void drift_part(
+    struct part* p, struct xpart* xp, float dt, double timeBase, int ti_old,
+    int ti_current) {
+  /* Inverse smoothing length, taken before h is updated below */
+  const float h_inv = 1.0f / p->h;
+
+  /* Drift the position using the v_full stored in the xpart */
+  p->x[0] += xp->v_full[0] * dt;
+  p->x[1] += xp->v_full[1] * dt;
+  p->x[2] += xp->v_full[2] * dt;
+
+  /* Predict velocities (for hydro terms) */
+  p->v[0] += p->a_hydro[0] * dt;
+  p->v[1] += p->a_hydro[1] * dt;
+  p->v[2] += p->a_hydro[2] * dt;
+
+  /* Predict smoothing length: h *= exp(w1) with w1 = h_dt * dt / h */
+  const float w1 = p->h_dt * h_inv * dt;
+  if (fabsf(w1) < 0.2f)
+    p->h *= approx_expf(w1); /* 4th order expansion of exp(w) */
+  else
+    p->h *= expf(w1);
+
+  /* Predict density: rho *= exp(w2) with w2 = -3 * w1 */
+  const float w2 = -3.0f * w1;
+  if (fabsf(w2) < 0.2f)
+    p->rho *= approx_expf(w2); /* 4th order expansion of exp(w) */
+  else
+    p->rho *= expf(w2);
+
+  /* Predict the values of the extra fields */
+  hydro_predict_extra(p, xp, ti_old, ti_current, timeBase);
+
+  /* Compute offset since last cell construction (minus the displacement) */
+  xp->x_diff[0] -= xp->v_full[0] * dt;
+  xp->x_diff[1] -= xp->v_full[1] * dt;
+  xp->x_diff[2] -= xp->v_full[2] * dt;
+}
+
+#endif /* SWIFT_DRIFT_H */
diff --git a/src/engine.c b/src/engine.c
index cd4f91b7477dca7dbc3eed5bac7357e34d0bc212..5100315199543b0896525f798d17085ced2c19a7 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -1,7 +1,11 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
  *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *                    Angus Lepper (angus.lepper@ed.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -25,11 +29,11 @@
 #include <float.h>
 #include <limits.h>
 #include <sched.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
-#include <stdbool.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
@@ -52,19 +56,38 @@
 #include "error.h"
 #include "hydro.h"
 #include "minmax.h"
+#include "parallel_io.h"
 #include "part.h"
 #include "partition.h"
+#include "proxy.h"
+#include "runner.h"
+#include "serial_io.h"
+#include "single_io.h"
 #include "timers.h"
-
-const char *engine_policy_names[13] = {
-    "none",                 "rand",   "steal",        "keep",
-    "block",                "fix_dt", "cpu_tight",    "mpi",
-    "numa_affinity",        "hydro",  "self_gravity", "external_gravity",
-    "cosmology_integration"};
+#include "units.h"
+
+const char *engine_policy_names[13] = {"none",
+                                       "rand",
+                                       "steal",
+                                       "keep",
+                                       "block",
+                                       "fix_dt",
+                                       "cpu_tight",
+                                       "mpi",
+                                       "numa_affinity",
+                                       "hydro",
+                                       "self_gravity",
+                                       "external_gravity",
+                                       "cosmology_integration"};
 
 /** The rank of the engine as a global variable (for messages). */
 int engine_rank;
 
+#ifdef HAVE_SETAFFINITY
+/** The initial affinity of the main thread (set by engine_pin()) */
+static cpu_set_t entry_affinity;
+#endif
+
 /**
  * @brief Link a density/force task to a cell.
  *
@@ -88,7 +111,8 @@ struct link *engine_addlink(struct engine *e, struct link *l, struct task *t) {
 }
 
 /**
- * @brief Generate the ghosts all the O(Npart) tasks for a hierarchy of cells.
+ * @brief Generate the hierarchical tasks for a hierarchy of cells - i.e. all
+ * the O(Npart) tasks.
  *
  * Tasks are only created here. The dependencies will be added later on.
  *
@@ -96,14 +120,17 @@ struct link *engine_addlink(struct engine *e, struct link *l, struct task *t) {
  * @param c The #cell.
  * @param super The super #cell.
  */
-
-void engine_make_ghost_tasks(struct engine *e, struct cell *c,
-                             struct cell *super) {
+void engine_make_hierarchical_tasks(struct engine *e, struct cell *c,
+                                    struct cell *super) {
 
   struct scheduler *s = &e->sched;
+  const int is_with_external_gravity =
+      (e->policy & engine_policy_external_gravity) ==
+      engine_policy_external_gravity;
+  const int is_fixdt = (e->policy & engine_policy_fixdt) == engine_policy_fixdt;
 
   /* Am I the super-cell? */
-  if (super == NULL && c->nr_tasks > 0) {
+  if (super == NULL && (c->count > 0 || c->gcount > 0)) {
 
     /* Remember me. */
     super = c;
@@ -111,18 +138,37 @@ void engine_make_ghost_tasks(struct engine *e, struct cell *c,
     /* Local tasks only... */
     if (c->nodeID == e->nodeID) {
 
-      /* Generate the ghost task. */
-      c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, 0,
-                                   c, NULL, 0);
-      /* Add the drift task. */
-      c->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0, 0,
-                                   c, NULL, 0);
       /* Add the init task. */
       c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0, c,
                                   NULL, 0);
-      /* Add the kick task. */
-      c->kick = scheduler_addtask(s, task_type_kick, task_subtype_none, 0, 0, c,
-                                  NULL, 0);
+
+      /* Add the drift task. */
+      c->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0, 0,
+                                   c, NULL, 0);
+
+      /* Add the kick task that matches the policy. */
+      if (is_fixdt) {
+        c->kick = scheduler_addtask(s, task_type_kick_fixdt, task_subtype_none,
+                                    0, 0, c, NULL, 0);
+      } else {
+        c->kick = scheduler_addtask(s, task_type_kick, task_subtype_none, 0, 0,
+                                    c, NULL, 0);
+      }
+
+      if (c->count > 0) {
+
+        /* Generate the ghost task. */
+        c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
+                                     0, c, NULL, 0);
+      }
+
+      if (c->gcount > 0) {
+
+        /* Add the external gravity tasks */
+        if (is_with_external_gravity)
+          c->grav_external = scheduler_addtask(
+              s, task_type_grav_external, task_subtype_none, 0, 0, c, NULL, 0);
+      }
     }
   }
 
@@ -133,7 +179,7 @@ void engine_make_ghost_tasks(struct engine *e, struct cell *c,
   if (c->split)
     for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL)
-        engine_make_ghost_tasks(e, c->progeny[k], super);
+        engine_make_hierarchical_tasks(e, c->progeny[k], super);
 }
 
 /**
@@ -179,7 +225,7 @@ void engine_redistribute(struct engine *e) {
   bzero(counts, sizeof(int) * nr_nodes * nr_nodes);
   bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);
 
-  // Allocate the destination index arrays.
+  /* Allocate the destination index arrays. */
   int *dest, *g_dest;
   if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
     error("Failed to allocate dest temporary buffer.");
@@ -198,9 +244,12 @@ void engine_redistribute(struct engine *e) {
     }
     const int cid = cell_getid(cdim, parts[k].x[0] * ih[0],
                                parts[k].x[1] * ih[1], parts[k].x[2] * ih[2]);
-    /* if (cid < 0 || cid >= s->nr_cells)
-       error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].",
-             cid, k, parts[k].x[0], parts[k].x[1], parts[k].x[2]); */
+#ifdef SWIFT_DEBUG_CHECKS
+    if (cid < 0 || cid >= s->nr_cells)
+      error("Bad cell id %i for part %zi at [%.3e,%.3e,%.3e].", cid, k,
+            parts[k].x[0], parts[k].x[1], parts[k].x[2]);
+#endif
+
     dest[k] = cells[cid].nodeID;
 
     /* The counts array is indexed as count[from * nr_nodes + to]. */
@@ -211,27 +260,30 @@ void engine_redistribute(struct engine *e) {
   space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1, e->verbose);
 
   /* We need to re-link the gpart partners of parts. */
-  int current_dest = dest[0];
-  size_t count_this_dest = 0;
-  for (size_t k = 0; k < s->nr_parts; ++k) {
-    if (s->parts[k].gpart != NULL) {
-
-      /* As the addresses will be invalidated by the communications, we will */
-      /* instead store the absolute index from the start of the sub-array */
-      /* of particles to be sent to a given node. */
-      /* Recall that gparts without partners have a negative id. */
-      /* We will restore the pointers on the receiving node later on. */
-      if (dest[k] != current_dest) {
-        current_dest = dest[k];
-        count_this_dest = 0;
-      }
+  if (s->nr_parts > 0) {
+    int current_dest = dest[0];
+    size_t count_this_dest = 0;
+    for (size_t k = 0; k < s->nr_parts; ++k) {
+      if (s->parts[k].gpart != NULL) {
+
+        /* As the addresses will be invalidated by the communications, we will
+         * instead store the absolute index from the start of the sub-array of
+         * particles to be sent to a given node.
+         * Recall that gparts without partners have a negative id.
+         * We will restore the pointers on the receiving node later on. */
+        if (dest[k] != current_dest) {
+          current_dest = dest[k];
+          count_this_dest = 0;
+        }
 
-      /* Debug */
-      /* if(s->parts[k].gpart->id < 0) */
-      /* 	error("Trying to link a partnerless gpart !"); */
+#ifdef SWIFT_DEBUG_CHECKS
+        if (s->parts[k].gpart->id < 0)
+          error("Trying to link a partnerless gpart !");
+#endif
 
-      s->parts[k].gpart->id_or_neg_offset = -count_this_dest;
-      count_this_dest++;
+        s->parts[k].gpart->id_or_neg_offset = -count_this_dest;
+        count_this_dest++;
+      }
     }
   }
 
@@ -247,9 +299,12 @@ void engine_redistribute(struct engine *e) {
     }
     const int cid = cell_getid(cdim, gparts[k].x[0] * ih[0],
                                gparts[k].x[1] * ih[1], gparts[k].x[2] * ih[2]);
-    /* if (cid < 0 || cid >= s->nr_cells)
-       error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].",
-             cid, k, g_parts[k].x[0], g_parts[k].x[1], g_parts[k].x[2]); */
+#ifdef SWIFT_DEBUG_CHECKS
+    if (cid < 0 || cid >= s->nr_cells)
+      error("Bad cell id %i for part %zi at [%.3e,%.3e,%.3e].", cid, k,
+            gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]);
+#endif
+
     g_dest[k] = cells[cid].nodeID;
 
     /* The counts array is indexed as count[from * nr_nodes + to]. */
@@ -416,7 +471,7 @@ void engine_redistribute(struct engine *e) {
 
         /* Re-link */
         gparts_new[k].id_or_neg_offset = -partner_index;
-        parts_new[partner_index].gpart = &gparts_new[k];
+        parts_new[partner_index].gpart = &gparts_new[k];
       }
     }
 
@@ -424,29 +479,27 @@ void engine_redistribute(struct engine *e) {
     offset_gparts += count_gparts;
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that all parts are in the right place. */
-  /* for ( int k = 0 ; k < nr_parts ; k++ ) {
-      int cid = cell_getid( cdim , parts_new[k].x[0]*ih[0],
-    parts_new[k].x[1]*ih[1], parts_new[k].x[2]*ih[2] );
-      if ( cells[ cid ].nodeID != nodeID )
-          error( "Received particle (%i) that does not belong here
-    (nodeID=%i).", k , cells[ cid ].nodeID );
-    } */
+  for (int k = 0; k < nr_parts; k++) {
+    int cid = cell_getid(cdim, parts_new[k].x[0] * ih[0],
+                         parts_new[k].x[1] * ih[1], parts_new[k].x[2] * ih[2]);
+    if (cells[cid].nodeID != nodeID)
+      error("Received particle (%i) that does not belong here (nodeID=%i).", k,
+            cells[cid].nodeID);
+  }
 
   /* Verify that the links are correct */
-  /* MATTHIEU: To be commented out once we are happy */
   for (size_t k = 0; k < nr_gparts; ++k) {
 
     if (gparts_new[k].id_or_neg_offset <= 0) {
-    
+
       struct part *part = &parts_new[-gparts_new[k].id_or_neg_offset];
 
-      if (part->gpart != &gparts_new[k])
-        error("Linking problem !");
+      if (part->gpart != &gparts_new[k]) error("Linking problem !");
 
       if (gparts_new[k].x[0] != part->x[0] ||
-          gparts_new[k].x[1] != part->x[1] ||
-          gparts_new[k].x[2] != part->x[2])
+          gparts_new[k].x[1] != part->x[1] || gparts_new[k].x[2] != part->x[2])
         error("Linked particles are not at the same position !");
     }
   }
@@ -454,9 +507,10 @@ void engine_redistribute(struct engine *e) {
 
     if (parts_new[k].gpart != NULL &&
         parts_new[k].gpart->id_or_neg_offset != -k) {
-        error("Linking problem !");
+      error("Linking problem !");
     }
   }
+#endif
 
   /* Set the new part data, free the old. */
   free(parts);
@@ -821,7 +875,10 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
   ticks tic = getticks();
 
   /* Re-set the proxies. */
-  for (int k = 0; k < e->nr_proxies; k++) e->proxies[k].nr_parts_out = 0;
+  for (int k = 0; k < e->nr_proxies; k++) {
+    e->proxies[k].nr_parts_out = 0;
+    e->proxies[k].nr_gparts_out = 0;
+  }
 
   /* Put the parts and gparts into the corresponding proxies. */
   for (size_t k = 0; k < *Npart; k++) {
@@ -841,7 +898,8 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
 
     /* Re-link the associated gpart with the buffer offset of the part. */
     if (s->parts[offset_parts + k].gpart != NULL) {
-      s->parts[offset_parts + k].gpart->id_or_neg_offset = -e->proxies[pid].nr_parts_in;
+      s->parts[offset_parts + k].gpart->id_or_neg_offset =
+          -e->proxies[pid].nr_parts_out;
     }
 
     /* Load the part and xpart into the proxy. */
@@ -900,6 +958,7 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
             count_parts_in, count_gparts_in);
   }
   if (offset_parts + count_parts_in > s->size_parts) {
+    message("re-allocating parts array.");
     s->size_parts = (offset_parts + count_parts_in) * engine_parts_size_grow;
     struct part *parts_new = NULL;
     struct xpart *xparts_new = NULL;
@@ -914,8 +973,14 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
     free(s->xparts);
     s->parts = parts_new;
     s->xparts = xparts_new;
+    for (size_t k = 0; k < offset_parts; k++) {
+      if (s->parts[k].gpart != NULL) {
+        s->parts[k].gpart->id_or_neg_offset = -k;
+      }
+    }
   }
   if (offset_gparts + count_gparts_in > s->size_gparts) {
+    message("re-allocating gparts array.");
     s->size_gparts = (offset_gparts + count_gparts_in) * engine_parts_size_grow;
     struct gpart *gparts_new = NULL;
     if (posix_memalign((void **)&gparts_new, gpart_align,
@@ -924,6 +989,11 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
     memcpy(gparts_new, s->gparts, sizeof(struct gpart) * offset_gparts);
     free(s->gparts);
     s->gparts = gparts_new;
+    for (size_t k = 0; k < offset_gparts; k++) {
+      if (s->gparts[k].id_or_neg_offset < 0) {
+        s->parts[-s->gparts[k].id_or_neg_offset].gpart = &s->gparts[k];
+      }
+    }
   }
 
   /* Collect the requests for the particle data from the proxies. */
@@ -978,7 +1048,7 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
         reqs_in[pid + 1] == MPI_REQUEST_NULL &&
         reqs_in[pid + 2] == MPI_REQUEST_NULL) {
       /* Copy the particle data to the part/xpart/gpart arrays. */
-      struct proxy *p = &e->proxies[pid >> 1];
+      struct proxy *p = &e->proxies[pid / 3];
       memcpy(&s->parts[offset_parts + count_parts], p->parts_in,
              sizeof(struct part) * p->nr_parts_in);
       memcpy(&s->xparts[offset_parts + count_parts], p->xparts_in,
@@ -995,8 +1065,10 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
       for (int k = 0; k < p->nr_gparts_in; k++) {
         struct gpart *gp = &s->gparts[offset_gparts + count_gparts + k];
         if (gp->id_or_neg_offset <= 0) {
-          s->parts[offset_gparts + count_parts - gp->id_or_neg_offset].gpart = gp;
-          gp->id_or_neg_offset = -(offset_gparts + count_parts - gp->id_or_neg_offset);
+          struct part *part =
+              &s->parts[offset_parts + count_parts - gp->id_or_neg_offset];
+          gp->id_or_neg_offset = s->parts - part; /* i.e. -(index of part) */
+          part->gpart = gp;
         }
       }
 
@@ -1140,33 +1212,45 @@ void engine_count_and_link_tasks(struct engine *e) {
         t->cj->density = engine_addlink(e, t->cj->density, t);
         atomic_inc(&t->cj->nr_density);
       }
-    } else if (t->type == task_type_sub) {
+    } else if (t->type == task_type_sub_self) {
       atomic_inc(&t->ci->nr_tasks);
-      if (t->cj != NULL) atomic_inc(&t->cj->nr_tasks);
       if (t->subtype == task_subtype_density) {
         t->ci->density = engine_addlink(e, t->ci->density, t);
         atomic_inc(&t->ci->nr_density);
-        if (t->cj != NULL) {
-          t->cj->density = engine_addlink(e, t->cj->density, t);
-          atomic_inc(&t->cj->nr_density);
-        }
+      }
+    } else if (t->type == task_type_sub_pair) {
+      atomic_inc(&t->ci->nr_tasks);
+      atomic_inc(&t->cj->nr_tasks);
+      if (t->subtype == task_subtype_density) {
+        t->ci->density = engine_addlink(e, t->ci->density, t);
+        atomic_inc(&t->ci->nr_density);
+        t->cj->density = engine_addlink(e, t->cj->density, t);
+        atomic_inc(&t->cj->nr_density);
       }
     }
-
-    /* /\* Link gravity multipole tasks to the up/down tasks. *\/ */
-    /* if (t->type == task_type_grav_mm || */
-    /*     (t->type == task_type_sub && t->subtype == task_subtype_grav)) { */
-    /*   atomic_inc(&t->ci->nr_tasks); */
-    /*   scheduler_addunlock(sched, t->ci->grav_up, t); */
-    /*   scheduler_addunlock(sched, t, t->ci->grav_down); */
-    /*   if (t->cj != NULL && t->ci->grav_up != t->cj->grav_up) { */
-    /*     scheduler_addunlock(sched, t->cj->grav_up, t); */
-    /*     scheduler_addunlock(sched, t, t->cj->grav_down); */
-    /*   } */
-    /* } */
   }
 }
 
+/**
+ * @brief Creates the dependency network for the hydro tasks of a given cell.
+ *
+ * @param sched The #scheduler.
+ * @param density The density task to link.
+ * @param force The force task to link.
+ * @param c The cell; the init, ghost and kick tasks of c->super are linked.
+ */
+static inline void engine_make_hydro_loops_dependencies(struct scheduler *sched,
+                                                        struct task *density,
+                                                        struct task *force,
+                                                        struct cell *c) {
+
+  /* init --> density loop --> ghost --> force loop --> kick */
+  scheduler_addunlock(sched, c->super->init, density);
+  scheduler_addunlock(sched, density, c->super->ghost);
+  scheduler_addunlock(sched, c->super->ghost, force);
+  scheduler_addunlock(sched, force, c->super->kick);
+}
+
 /**
  * @brief Duplicates the first hydro loop and construct all the
  * dependencies for the hydro part
@@ -1205,11 +1289,7 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
       atomic_inc(&t->ci->nr_force);
 
       /* Now, build all the dependencies for the hydro */
-      /* init --> t (density loop) --> ghost --> t2 (force loop) --> kick */
-      scheduler_addunlock(sched, t->ci->super->init, t);
-      scheduler_addunlock(sched, t, t->ci->super->ghost);
-      scheduler_addunlock(sched, t->ci->super->ghost, t2);
-      scheduler_addunlock(sched, t2, t->ci->super->kick);
+      engine_make_hydro_loops_dependencies(sched, t, t2, t->ci);
     }
 
     /* Otherwise, pair interaction? */
@@ -1227,56 +1307,64 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
 
       /* Now, build all the dependencies for the hydro for the cells */
       /* that are local and are not descendant of the same super-cells */
-      /* init --> t (density loop) --> ghost --> t2 (force loop) --> kick */
       if (t->ci->nodeID == nodeID) {
-        scheduler_addunlock(sched, t->ci->super->init, t);
-        scheduler_addunlock(sched, t, t->ci->super->ghost);
-        scheduler_addunlock(sched, t->ci->super->ghost, t2);
-        scheduler_addunlock(sched, t2, t->ci->super->kick);
+        engine_make_hydro_loops_dependencies(sched, t, t2, t->ci);
       }
       if (t->cj->nodeID == nodeID && t->ci->super != t->cj->super) {
-        scheduler_addunlock(sched, t->cj->super->init, t);
-        scheduler_addunlock(sched, t, t->cj->super->ghost);
-        scheduler_addunlock(sched, t->cj->super->ghost, t2);
-        scheduler_addunlock(sched, t2, t->cj->super->kick);
+        engine_make_hydro_loops_dependencies(sched, t, t2, t->cj);
       }
     }
 
-    /* Otherwise, sub interaction? */
-    else if (t->type == task_type_sub && t->subtype == task_subtype_density) {
+    /* Otherwise, sub-self interaction? */
+    else if (t->type == task_type_sub_self &&
+             t->subtype == task_subtype_density) {
 
       /* Start by constructing the task for the second hydro loop */
       struct task *t2 =
-          scheduler_addtask(sched, task_type_sub, task_subtype_force, t->flags,
-                            0, t->ci, t->cj, 0);
+          scheduler_addtask(sched, task_type_sub_self, task_subtype_force,
+                            t->flags, 0, t->ci, t->cj, 0);
 
-      /* Add the link between the new loop and both cells */
+      /* Add the link between the new loop and the cell */
       t->ci->force = engine_addlink(e, t->ci->force, t2);
       atomic_inc(&t->ci->nr_force);
-      if (t->cj != NULL) {
-        t->cj->force = engine_addlink(e, t->cj->force, t2);
-        atomic_inc(&t->cj->nr_force);
+
+      /* Now, build all the dependencies for the hydro for the cells */
+      /* that are local and are not descendant of the same super-cells */
+      if (t->ci->nodeID == nodeID) {
+        engine_make_hydro_loops_dependencies(sched, t, t2, t->ci);
       }
+    }
+
+    /* Otherwise, sub-pair interaction? */
+    else if (t->type == task_type_sub_pair &&
+             t->subtype == task_subtype_density) {
+
+      /* Start by constructing the task for the second hydro loop */
+      struct task *t2 =
+          scheduler_addtask(sched, task_type_sub_pair, task_subtype_force,
+                            t->flags, 0, t->ci, t->cj, 0);
+
+      /* Add the link between the new loop and both cells */
+      t->ci->force = engine_addlink(e, t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
+      t->cj->force = engine_addlink(e, t->cj->force, t2);
+      atomic_inc(&t->cj->nr_force);
 
       /* Now, build all the dependencies for the hydro for the cells */
       /* that are local and are not descendant of the same super-cells */
-      /* init --> t (density loop) --> ghost --> t2 (force loop) --> kick */
       if (t->ci->nodeID == nodeID) {
-        scheduler_addunlock(sched, t, t->ci->super->ghost);
-        scheduler_addunlock(sched, t->ci->super->ghost, t2);
-        scheduler_addunlock(sched, t2, t->ci->super->kick);
+        engine_make_hydro_loops_dependencies(sched, t, t2, t->ci);
       }
-      if (t->cj != NULL && t->cj->nodeID == nodeID &&
-          t->ci->super != t->cj->super) {
-        scheduler_addunlock(sched, t, t->cj->super->ghost);
-        scheduler_addunlock(sched, t->cj->super->ghost, t2);
-        scheduler_addunlock(sched, t2, t->cj->super->kick);
+      if (t->cj->nodeID == nodeID && t->ci->super != t->cj->super) {
+        engine_make_hydro_loops_dependencies(sched, t, t2, t->cj);
       }
     }
 
-    /* /\* Kick tasks should rely on the grav_down tasks of their cell. *\/ */
-    /* else if (t->type == task_type_kick && t->ci->grav_down != NULL) */
-    /*   scheduler_addunlock(sched, t->ci->grav_down, t); */
+    /* External gravity tasks should depend on init and unlock the kick */
+    else if (t->type == task_type_grav_external) {
+      scheduler_addunlock(sched, t->ci->init, t);
+      scheduler_addunlock(sched, t, t->ci->kick);
+    }
   }
 }
 
@@ -1378,7 +1466,7 @@ void engine_maketasks(struct engine *e) {
   engine_make_hydroloop_tasks(e);
 
   /* Add the gravity mm tasks. */
-  if ((e->policy & engine_policy_self_gravity) == engine_policy_self_gravity)
+  if (e->policy & engine_policy_self_gravity)
     engine_make_gravityinteraction_tasks(e);
 
   /* Split the tasks. */
@@ -1394,7 +1482,7 @@ void engine_maketasks(struct engine *e) {
   e->nr_links = 0;
 
   /* Add the gravity up/down tasks at the top-level cells and push them down. */
-  if ((e->policy & engine_policy_self_gravity) == engine_policy_self_gravity)
+  if (e->policy & engine_policy_self_gravity)
     engine_make_gravityrecursive_tasks(e);
 
   /* Count the number of tasks associated with each cell and
@@ -1402,10 +1490,9 @@ void engine_maketasks(struct engine *e) {
      depend on the sorts of its progeny. */
   engine_count_and_link_tasks(e);
 
-  /* Append a ghost task to each cell, and add kick tasks to the
-     super cells. */
+  /* Append hierarchical tasks to each cell */
   for (int k = 0; k < nr_cells; k++)
-    engine_make_ghost_tasks(e, &cells[k], NULL);
+    engine_make_hierarchical_tasks(e, &cells[k], NULL);
 
   /* Run through the tasks and make force tasks for each density task.
      Each force task depends on the cell ghosts and unlocks the kick task
@@ -1413,7 +1500,7 @@ void engine_maketasks(struct engine *e) {
   engine_make_extra_hydroloop_tasks(e);
 
   /* Add the communication tasks if MPI is being used. */
-  if ((e->policy & engine_policy_mpi) == engine_policy_mpi) {
+  if (e->policy & engine_policy_mpi) {
 
     /* Loop over the proxies. */
     for (int pid = 0; pid < e->nr_proxies; pid++) {
@@ -1466,7 +1553,7 @@ int engine_marktasks(struct engine *e) {
   const ticks tic = getticks();
 
   /* Much less to do here if we're on a fixed time-step. */
-  if ((e->policy & engine_policy_fixdt) == engine_policy_fixdt) {
+  if (e->policy & engine_policy_fixdt) {
 
     /* Run through the tasks and mark as skip or not. */
     for (int k = 0; k < nr_tasks; k++) {
@@ -1475,8 +1562,7 @@ int engine_marktasks(struct engine *e) {
       struct task *t = &tasks[ind[k]];
 
       /* Pair? */
-      if (t->type == task_type_pair ||
-          (t->type == task_type_sub && t->cj != NULL)) {
+      if (t->type == task_type_pair || t->type == task_type_sub_pair) {
 
         /* Local pointers. */
         const struct cell *ci = t->ci;
@@ -1520,15 +1606,14 @@ int engine_marktasks(struct engine *e) {
 
       /* Single-cell task? */
       else if (t->type == task_type_self || t->type == task_type_ghost ||
-               (t->type == task_type_sub && t->cj == NULL)) {
+               t->type == task_type_sub_self) {
 
         /* Set this task's skip. */
         t->skip = (t->ci->ti_end_min > ti_end);
       }
 
       /* Pair? */
-      else if (t->type == task_type_pair ||
-               (t->type == task_type_sub && t->cj != NULL)) {
+      else if (t->type == task_type_pair || t->type == task_type_sub_pair) {
 
         /* Local pointers. */
         const struct cell *ci = t->ci;
@@ -1741,6 +1826,8 @@ void engine_barrier(struct engine *e, int tid) {
 
 /**
  * @brief Mapping function to collect the data from the kick.
+ *
+ * @param c A super-cell.
  */
 
 void engine_collect_kick(struct cell *c) {
@@ -1750,15 +1837,13 @@ void engine_collect_kick(struct cell *c) {
 
   /* Counters for the different quantities. */
   int updated = 0, g_updated = 0;
-  double e_kin = 0.0, e_int = 0.0, e_pot = 0.0;
-  float mom[3] = {0.0f, 0.0f, 0.0f}, ang[3] = {0.0f, 0.0f, 0.0f};
-  int ti_end_min = max_nr_timesteps, ti_end_max = 0;
+  int ti_end_min = max_nr_timesteps;
 
   /* Only do something is the cell is non-empty */
-  if (c->count != 0) {
+  if (c->count != 0 || c->gcount != 0) {
 
     /* If this cell is not split, I'm in trouble. */
-    if (!c->split) error("Cell has no super-cell.");
+    if (!c->split) error("Cell is not split.");
 
     /* Collect the values from the progeny. */
     for (int k = 0; k < 8; k++) {
@@ -1770,36 +1855,201 @@ void engine_collect_kick(struct cell *c) {
 
         /* And update */
         ti_end_min = min(ti_end_min, cp->ti_end_min);
-        ti_end_max = max(ti_end_max, cp->ti_end_max);
         updated += cp->updated;
         g_updated += cp->g_updated;
+      }
+    }
+  }
+
+  /* Store the collected values in the cell. */
+  c->ti_end_min = ti_end_min;
+  c->updated = updated;
+  c->g_updated = g_updated;
+}
+
+/**
+ * @brief Collects ti_end_min and the update counts from the local top-level
+ * cells, reduces them over all MPI ranks and stores the result in the engine.
+ *
+ * @param e The #engine.
+ */
+void engine_collect_timestep(struct engine *e) {
+  /* Wide counters: the sum over all ranks may not fit in an int. */
+  long long updates = 0, g_updates = 0;
+  int ti_end_min = max_nr_timesteps;
+  const struct space *s = e->s;
+
+  /* Collect the cell data. */
+  for (int k = 0; k < s->nr_cells; k++)
+    if (s->cells[k].nodeID == e->nodeID) {
+      struct cell *c = &s->cells[k];
+
+      /* Make the top-cells recurse */
+      engine_collect_kick(c);
+
+      /* And aggregate */
+      ti_end_min = min(ti_end_min, c->ti_end_min);
+      updates += c->updated;
+      g_updates += c->g_updated;
+    }
+
+/* Aggregate the data from the different nodes. */
+#ifdef WITH_MPI
+  {
+    int in_i[1], out_i[1];
+    in_i[0] = 0;
+    out_i[0] = ti_end_min;
+    if (MPI_Allreduce(out_i, in_i, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD) !=
+        MPI_SUCCESS)
+      error("Failed to aggregate t_end_min.");
+    ti_end_min = in_i[0];
+  }
+  {
+    long long in_ll[2], out_ll[2]; /* signed, to match MPI_LONG_LONG_INT */
+    out_ll[0] = updates;
+    out_ll[1] = g_updates;
+    if (MPI_Allreduce(out_ll, in_ll, 2, MPI_LONG_LONG_INT, MPI_SUM,
+                      MPI_COMM_WORLD) != MPI_SUCCESS)
+      error("Failed to aggregate the update counts.");
+    updates = in_ll[0];
+    g_updates = in_ll[1];
+  }
+#endif
+
+  e->ti_end_min = ti_end_min;
+  e->updates = updates;
+  e->g_updates = g_updates;
+}
+
+/**
+ * @brief Mapping function to collect the data from the drift.
+ *
+ * @param c The #cell to recurse from (cells owning a drift task are skipped).
+ */
+void engine_collect_drift(struct cell *c) {
+
+  /* Skip super-cells (Their values are already set) */
+  if (c->drift != NULL) return;
+
+  /* Counters for the different quantities. */
+  double e_kin = 0.0, e_int = 0.0, e_pot = 0.0, mass = 0.0;
+  double mom[3] = {0.0, 0.0, 0.0}, ang_mom[3] = {0.0, 0.0, 0.0};
+
+  /* Only do something if the cell is non-empty */
+  if (c->count != 0 || c->gcount != 0) {
+
+    /* If this cell is not split, I'm in trouble. */
+    if (!c->split) error("Cell is not split.");
+
+    /* Collect the values from the progeny. */
+    for (int k = 0; k < 8; k++) {
+      struct cell *cp = c->progeny[k];
+      if (cp != NULL) {
+
+        /* Recurse */
+        engine_collect_drift(cp);
+
+        /* And update */
+        mass += cp->mass;
         e_kin += cp->e_kin;
         e_int += cp->e_int;
         e_pot += cp->e_pot;
         mom[0] += cp->mom[0];
         mom[1] += cp->mom[1];
         mom[2] += cp->mom[2];
-        ang[0] += cp->ang[0];
-        ang[1] += cp->ang[1];
-        ang[2] += cp->ang[2];
+        ang_mom[0] += cp->ang_mom[0];
+        ang_mom[1] += cp->ang_mom[1];
+        ang_mom[2] += cp->ang_mom[2];
       }
     }
   }
 
   /* Store the collected values in the cell. */
-  c->ti_end_min = ti_end_min;
-  c->ti_end_max = ti_end_max;
-  c->updated = updated;
-  c->g_updated = g_updated;
+  c->mass = mass;
   c->e_kin = e_kin;
   c->e_int = e_int;
   c->e_pot = e_pot;
   c->mom[0] = mom[0];
   c->mom[1] = mom[1];
   c->mom[2] = mom[2];
-  c->ang[0] = ang[0];
-  c->ang[1] = ang[1];
-  c->ang[2] = ang[2];
+  c->ang_mom[0] = ang_mom[0];
+  c->ang_mom[1] = ang_mom[1];
+  c->ang_mom[2] = ang_mom[2];
+}
+/**
+ * @brief Sums the conserved quantities over the local top-level cells and all
+ * MPI ranks, then writes one line of statistics to e->file_stats on node 0.
+ * @param e The #engine.
+ */
+void engine_print_stats(struct engine *e) {
+
+  const struct space *s = e->s;
+
+  double e_kin = 0.0, e_int = 0.0, e_pot = 0.0, mass = 0.0;
+  double mom[3] = {0.0, 0.0, 0.0}, ang_mom[3] = {0.0, 0.0, 0.0};
+
+  /* Collect the cell data. */
+  for (int k = 0; k < s->nr_cells; k++)
+    if (s->cells[k].nodeID == e->nodeID) {
+      struct cell *c = &s->cells[k];
+
+      /* Make the top-cells recurse */
+      engine_collect_drift(c);
+
+      /* And aggregate */
+      mass += c->mass;
+      e_kin += c->e_kin;
+      e_int += c->e_int;
+      e_pot += c->e_pot;
+      mom[0] += c->mom[0];
+      mom[1] += c->mom[1];
+      mom[2] += c->mom[2];
+      ang_mom[0] += c->ang_mom[0];
+      ang_mom[1] += c->ang_mom[1];
+      ang_mom[2] += c->ang_mom[2];
+    }
+
+/* Sum over all ranks; the totals arrive in the receive buffer `in`. */
+#ifdef WITH_MPI
+  {
+    double in[10] = {0., 0., 0., 0., 0., 0., 0., 0., 0., 0.};
+    double out[10];
+    out[0] = e_kin;
+    out[1] = e_int;
+    out[2] = e_pot;
+    out[3] = mom[0];
+    out[4] = mom[1];
+    out[5] = mom[2];
+    out[6] = ang_mom[0];
+    out[7] = ang_mom[1];
+    out[8] = ang_mom[2];
+    out[9] = mass;
+    if (MPI_Allreduce(out, in, 10, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) !=
+        MPI_SUCCESS)
+      error("Failed to aggregate stats.");
+    e_kin = in[0];
+    e_int = in[1];
+    e_pot = in[2];
+    mom[0] = in[3];
+    mom[1] = in[4];
+    mom[2] = in[5];
+    ang_mom[0] = in[6];
+    ang_mom[1] = in[7];
+    ang_mom[2] = in[8];
+    mass = in[9];
+  }
+#endif
+
+  const double e_tot = e_kin + e_int + e_pot;
+
+  /* Print info */
+  if (e->nodeID == 0) {
+    fprintf(e->file_stats,
+            " %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e\n",
+            e->time, mass, e_tot, e_kin, e_int, e_pot, mom[0], mom[1], mom[2],
+            ang_mom[0], ang_mom[1], ang_mom[2]);
+    fflush(e->file_stats);
+  }
 }
 
 /**
@@ -1850,63 +2100,58 @@ void engine_init_particles(struct engine *e) {
 
   struct space *s = e->s;
 
+  struct clocks_time time1, time2;
+  clocks_gettime(&time1);
+
   if (e->nodeID == 0) message("Initialising particles");
 
   /* Make sure all particles are ready to go */
   /* i.e. clean-up any stupid state in the ICs */
-  if ((e->policy & engine_policy_hydro) == engine_policy_hydro) {
-    space_map_cells_pre(s, 1, cell_init_parts, NULL);
+  if (e->policy & engine_policy_hydro) {
+    space_map_cells_pre(s, 0, cell_init_parts, NULL);
   }
-  if (((e->policy & engine_policy_self_gravity) ==
-       engine_policy_self_gravity) ||
-      ((e->policy & engine_policy_external_gravity) ==
-       engine_policy_external_gravity)) {
-    space_map_cells_pre(s, 1, cell_init_gparts, NULL);
+  if ((e->policy & engine_policy_self_gravity) ||
+      (e->policy & engine_policy_external_gravity)) {
+    space_map_cells_pre(s, 0, cell_init_gparts, NULL);
   }
 
   engine_prepare(e);
 
   engine_marktasks(e);
-
-  // printParticle(e->s->parts, 1000, e->s->nr_parts);
-  // printParticle(e->s->parts, 515050, e->s->nr_parts);
-
-  // message("\n0th DENSITY CALC\n");
-
   /* Build the masks corresponding to the policy */
   unsigned int mask = 0;
   unsigned int submask = 0;
 
   /* We always have sort tasks */
   mask |= 1 << task_type_sort;
+  mask |= 1 << task_type_init;
 
   /* Add the tasks corresponding to hydro operations to the masks */
-  if ((e->policy & engine_policy_hydro) == engine_policy_hydro) {
+  if (e->policy & engine_policy_hydro) {
 
-    mask |= 1 << task_type_init;
     mask |= 1 << task_type_self;
     mask |= 1 << task_type_pair;
-    mask |= 1 << task_type_sub;
+    mask |= 1 << task_type_sub_self;
+    mask |= 1 << task_type_sub_pair;
     mask |= 1 << task_type_ghost;
 
     submask |= 1 << task_subtype_density;
   }
 
   /* Add the tasks corresponding to self-gravity to the masks */
-  if ((e->policy & engine_policy_self_gravity) == engine_policy_self_gravity) {
+  if (e->policy & engine_policy_self_gravity) {
 
     /* Nothing here for now */
   }
 
-  /* Add the tasks corresponding to self-gravity to the masks */
-  if ((e->policy & engine_policy_external_gravity) ==
-      engine_policy_external_gravity) {
+  /* Add the tasks corresponding to external gravity to the masks */
+  if (e->policy & engine_policy_external_gravity) {
 
-    /* Nothing here for now */
+    mask |= 1 << task_type_grav_external;
   }
 
   /* Add MPI tasks if need be */
-  if ((e->policy & engine_policy_mpi) == engine_policy_mpi) {
+  if (e->policy & engine_policy_mpi) {
 
     mask |= 1 << task_type_send;
     mask |= 1 << task_type_recv;
@@ -1917,16 +2162,14 @@ void engine_init_particles(struct engine *e) {
   engine_launch(e, e->nr_threads, mask, submask);
   TIMER_TOC(timer_runners);
 
-  // message("\n0th ENTROPY CONVERSION\n")
-
   /* Apply some conversions (e.g. internal energy -> entropy) */
-  space_map_cells_pre(s, 1, cell_convert_hydro, NULL);
+  space_map_cells_pre(s, 0, cell_convert_hydro, NULL);
 
-  // printParticle(e->s->parts, e->s->xparts,1000, e->s->nr_parts);
-  // printParticle(e->s->parts, e->s->xparts,515050, e->s->nr_parts);
+  clocks_gettime(&time2);
 
   /* Ready to go */
   e->step = -1;
+  e->wallclock_time = (float)clocks_diff(&time1, &time2);
 }
 
 /**
@@ -1936,109 +2179,62 @@ void engine_init_particles(struct engine *e) {
  */
 void engine_step(struct engine *e) {
 
-  int updates = 0, g_updates = 0;
-  int ti_end_min = max_nr_timesteps, ti_end_max = 0;
-  double e_pot = 0.0, e_int = 0.0, e_kin = 0.0;
-  float mom[3] = {0.0, 0.0, 0.0};
-  float ang[3] = {0.0, 0.0, 0.0};
-  struct space *s = e->s;
+  double snapshot_drift_time = 0.;
 
   TIMER_TIC2;
 
   struct clocks_time time1, time2;
   clocks_gettime(&time1);
 
-  /* Collect the cell data. */
-  for (int k = 0; k < s->nr_cells; k++)
-    if (s->cells[k].nodeID == e->nodeID) {
-      struct cell *c = &s->cells[k];
+  e->tic_step = getticks();
 
-      /* Recurse */
-      engine_collect_kick(c);
+  /* Recover the (integer) end of the next time-step */
+  engine_collect_timestep(e);
 
-      /* And aggregate */
-      ti_end_min = min(ti_end_min, c->ti_end_min);
-      ti_end_max = max(ti_end_max, c->ti_end_max);
-      e_kin += c->e_kin;
-      e_int += c->e_int;
-      e_pot += c->e_pot;
-      updates += c->updated;
-      g_updates += c->g_updated;
-      mom[0] += c->mom[0];
-      mom[1] += c->mom[1];
-      mom[2] += c->mom[2];
-      ang[0] += c->ang[0];
-      ang[1] += c->ang[1];
-      ang[2] += c->ang[2];
-    }
+  /* Check for output */
+  while (e->ti_end_min >= e->ti_nextSnapshot && e->ti_nextSnapshot > 0) {
 
-/* Aggregate the data from the different nodes. */
-#ifdef WITH_MPI
-  {
-    int in_i[1], out_i[1];
-    in_i[0] = 0;
-    out_i[0] = ti_end_min;
-    if (MPI_Allreduce(out_i, in_i, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD) !=
-        MPI_SUCCESS)
-      error("Failed to aggregate t_end_min.");
-    ti_end_min = in_i[0];
-    out_i[0] = ti_end_max;
-    if (MPI_Allreduce(out_i, in_i, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD) !=
-        MPI_SUCCESS)
-      error("Failed to aggregate t_end_max.");
-    ti_end_max = in_i[0];
-  }
-  {
-    double in_d[5], out_d[5];
-    out_d[0] = updates;
-    out_d[1] = g_updates;
-    out_d[2] = e_kin;
-    out_d[3] = e_int;
-    out_d[4] = e_pot;
-    if (MPI_Allreduce(out_d, in_d, 5, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) !=
-        MPI_SUCCESS)
-      error("Failed to aggregate energies.");
-    updates = in_d[0];
-    g_updates = in_d[1];
-    e_kin = in_d[2];
-    e_int = in_d[3];
-    e_pot = in_d[4];
-  }
-#endif
+    e->ti_old = e->ti_current;
+    e->ti_current = e->ti_nextSnapshot;
+    e->time = e->ti_current * e->timeBase + e->timeBegin;
+    e->timeOld = e->ti_old * e->timeBase + e->timeBegin;
+    e->timeStep = (e->ti_current - e->ti_old) * e->timeBase;
+    snapshot_drift_time = e->timeStep;
 
-  // message("\nDRIFT\n");
+    /* Drift everybody to the snapshot position */
+    engine_launch(e, e->nr_threads, 1 << task_type_drift, 0);
+
+    /* Dump... */
+    engine_dump_snapshot(e);
+
+    /* ... and find the next output time */
+    engine_compute_next_snapshot_time(e);
+  }
 
   /* Move forward in time */
   e->ti_old = e->ti_current;
-  e->ti_current = ti_end_min;
+  e->ti_current = e->ti_end_min;
   e->step += 1;
   e->time = e->ti_current * e->timeBase + e->timeBegin;
   e->timeOld = e->ti_old * e->timeBase + e->timeBegin;
-  e->timeStep = (e->ti_current - e->ti_old) * e->timeBase;
+  e->timeStep = (e->ti_current - e->ti_old) * e->timeBase + snapshot_drift_time;
 
   /* Drift everybody */
   engine_launch(e, e->nr_threads, 1 << task_type_drift, 0);
 
-  // printParticle(e->s->parts, e->s->xparts, 1000, e->s->nr_parts);
-  // printParticle(e->s->parts, e->s->xparts, 515050, e->s->nr_parts);
-
-  // if(e->step == 2)   exit(0);
-
   if (e->nodeID == 0) {
 
     /* Print some information to the screen */
-    printf("  %6d %14e %14e %10d %10d %21.3f\n", e->step, e->time, e->timeStep,
-           updates, g_updates, e->wallclock_time);
+    printf("  %6d %14e %14e %10zd %10zd %21.3f\n", e->step, e->time,
+           e->timeStep, e->updates, e->g_updates, e->wallclock_time);
     fflush(stdout);
-
-    /* Write some energy statistics */
-    fprintf(e->file_stats, "%d %f %f %f %f %f %f %f %f %f %f %f\n", e->step,
-            e->time, e_kin, e_int, e_pot, e_kin + e_int + e_pot, mom[0], mom[1],
-            mom[2], ang[0], ang[1], ang[2]);
-    fflush(e->file_stats);
   }
 
-  // message("\nACCELERATION AND KICK\n");
+  /* Save some statistics */
+  if (e->time - e->timeLastStatistics >= e->deltaTimeStatistics) {
+    engine_print_stats(e);
+    e->timeLastStatistics += e->deltaTimeStatistics;
+  }
 
   /* Re-distribute the particles amongst the nodes? */
   if (e->forcerepart != REPART_NONE) engine_repartition(e);
@@ -2049,17 +2245,24 @@ void engine_step(struct engine *e) {
   /* Build the masks corresponding to the policy */
   unsigned int mask = 0, submask = 0;
 
-  /* We always have sort tasks and kick tasks */
+  /* We always have sort tasks and init tasks */
   mask |= 1 << task_type_sort;
-  mask |= 1 << task_type_kick;
+  mask |= 1 << task_type_init;
+
+  /* Add the correct kick task */
+  if (e->policy & engine_policy_fixdt) {
+    mask |= 1 << task_type_kick_fixdt;
+  } else {
+    mask |= 1 << task_type_kick;
+  }
 
   /* Add the tasks corresponding to hydro operations to the masks */
-  if ((e->policy & engine_policy_hydro) == engine_policy_hydro) {
+  if (e->policy & engine_policy_hydro) {
 
-    mask |= 1 << task_type_init;
     mask |= 1 << task_type_self;
     mask |= 1 << task_type_pair;
-    mask |= 1 << task_type_sub;
+    mask |= 1 << task_type_sub_self;
+    mask |= 1 << task_type_sub_pair;
     mask |= 1 << task_type_ghost;
 
     submask |= 1 << task_subtype_density;
@@ -2067,20 +2270,18 @@ void engine_step(struct engine *e) {
   }
 
   /* Add the tasks corresponding to self-gravity to the masks */
-  if ((e->policy & engine_policy_self_gravity) == engine_policy_self_gravity) {
+  if (e->policy & engine_policy_self_gravity) {
 
     /* Nothing here for now */
   }
 
-  /* Add the tasks corresponding to self-gravity to the masks */
-  if ((e->policy & engine_policy_external_gravity) ==
-      engine_policy_external_gravity) {
-
-    /* Nothing here for now */
+  /* Add the tasks corresponding to external gravity to the masks */
+  if (e->policy & engine_policy_external_gravity) {
+    mask |= 1 << task_type_grav_external;
   }
 
   /* Add MPI tasks if need be */
-  if ((e->policy & engine_policy_mpi) == engine_policy_mpi) {
+  if (e->policy & engine_policy_mpi) {
 
     mask |= 1 << task_type_send;
     mask |= 1 << task_type_recv;
@@ -2096,8 +2297,7 @@ void engine_step(struct engine *e) {
   clocks_gettime(&time2);
 
   e->wallclock_time = (float)clocks_diff(&time1, &time2);
-  // printParticle(e->s->parts, e->s->xparts,1000, e->s->nr_parts);
-  // printParticle(e->s->parts, e->s->xparts,515050, e->s->nr_parts);
+  e->toc_step = getticks();
 }
 
 /**
@@ -2269,24 +2469,23 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   part_relink_parts(s->gparts, s->nr_gparts, s->parts);
 
   /* Verify that the links are correct */
-  /* MATTHIEU: To be commented out once we are happy */
   for (size_t k = 0; k < s->nr_gparts; ++k) {
 
     if (s->gparts[k].id_or_neg_offset <= 0) {
-    
+
       struct part *part = &s->parts[-s->gparts[k].id_or_neg_offset];
 
       if (part->gpart != &s->gparts[k]) error("Linking problem !");
 
-      if (s->gparts[k].x[0] != part->x[0] ||
-          s->gparts[k].x[1] != part->x[1] ||
+      if (s->gparts[k].x[0] != part->x[0] || s->gparts[k].x[1] != part->x[1] ||
           s->gparts[k].x[2] != part->x[2])
         error("Linked particles are not at the same position !");
     }
   }
   for (size_t k = 0; k < s->nr_parts; ++k) {
 
-    if (s->parts[k].gpart != NULL && s->parts[k].gpart->id_or_neg_offset != -k) error("Linking problem !");
+    if (s->parts[k].gpart != NULL && s->parts[k].gpart->id_or_neg_offset != -k)
+      error("Linking problem !");
   }
 
 #else
@@ -2294,23 +2493,89 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
 #endif
 }
 
-#if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
-static bool hyperthreads_present(void) {
-#ifdef __linux__
-  FILE *f =
-      fopen("/sys/devices/system/cpu/cpu0/topology/thread_siblings_list", "r");
+/**
+ * @brief Writes a snapshot with the current state of the engine and, when
+ * e->verbose is set, reports how long the write took.
+ *
+ * @param e The #engine.
+ */
+void engine_dump_snapshot(struct engine *e) {
 
-  int c;
-  while ((c = fgetc(f)) != EOF && c != ',')
+  struct clocks_time time1, time2;
+  clocks_gettime(&time1);
+
+  if (e->verbose) message("writing snapshot at t=%f.", e->time);
+
+/* Dump with the writer matching the build (parallel, serial or single). */
+#if defined(WITH_MPI)
+#if defined(HAVE_PARALLEL_HDF5)
+  write_output_parallel(e, e->snapshotBaseName, e->snapshotUnits, e->nodeID,
+                        e->nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL);
+#else
+  write_output_serial(e, e->snapshotBaseName, e->snapshotUnits, e->nodeID,
+                      e->nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL);
+#endif
+#else
+  write_output_single(e, e->snapshotBaseName, e->snapshotUnits);
+#endif
+
+  clocks_gettime(&time2);
+  if (e->verbose)
+    message("writing particle properties took %.3f %s.",
+            (float)clocks_diff(&time1, &time2), clocks_getunit());
+}
+
+#ifdef HAVE_SETAFFINITY
+/**
+ * @brief Returns the affinity mask the calling thread had on the first call.
+ */
+static cpu_set_t *engine_entry_affinity() {
+  /* Set once the mask has been read into entry_affinity (declared outside). */
+  static int use_entry_affinity = 0;
+  /* Query the thread's mask only once; later calls return the stored copy. */
+  if (!use_entry_affinity) {
+    pthread_t engine = pthread_self();
+    pthread_getaffinity_np(engine, sizeof(entry_affinity), &entry_affinity);
+    use_entry_affinity = 1;
+  }
+
+  return &entry_affinity;
+}
+#endif
+
+/**
+ * @brief Pins the calling thread to the first CPU of its entry affinity mask so
+ *  the first-touch NUMA node can't change before engine_init allocates workers.
+ */
+void engine_pin() {
+  /* Find the lowest-numbered CPU present in the entry affinity mask. */
+#ifdef HAVE_SETAFFINITY
+  cpu_set_t *entry_affinity = engine_entry_affinity();
+  int pin;
+  for (pin = 0; pin < CPU_SETSIZE && !CPU_ISSET(pin, entry_affinity); ++pin)
     ;
-  fclose(f);
 
-  return c == ',';
+  cpu_set_t affinity;
+  CPU_ZERO(&affinity);
+  CPU_SET(pin, &affinity);
+  if (sched_setaffinity(0, sizeof(affinity), &affinity) != 0) {
+    error("failed to set engine's affinity");
+  }
 #else
-  return true;  // just guess
+  error("SWIFT was not compiled with support for pinning.");
 #endif
 }
+
+/**
+ * @brief Restores the calling thread's affinity to the mask it had at entry.
+ */
+void engine_unpin() {
+#ifdef HAVE_SETAFFINITY
+  /* Go through the accessor so the entry mask is always initialised first. */
+  pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), engine_entry_affinity());
+#else
+  error("SWIFT was not compiled with support for pinning.");
 #endif
+}
 
 /**
  * @brief init an engine with the given number of threads, queues, and
@@ -2322,13 +2587,20 @@ static bool hyperthreads_present(void) {
  * @param nr_nodes The number of MPI ranks.
  * @param nodeID The MPI rank of this node.
  * @param nr_threads The number of threads per MPI rank.
+ * @param with_aff use processor affinity, if supported.
  * @param policy The queuing policy to use.
  * @param verbose Is this #engine talkative ?
+ * @param physical_constants The #phys_const used for this run.
+ * @param hydro The #hydro_props used for this run.
+ * @param potential The properties of the external potential.
  */
 
 void engine_init(struct engine *e, struct space *s,
                  const struct swift_params *params, int nr_nodes, int nodeID,
-                 int nr_threads, int policy, int verbose) {
+                 int nr_threads, int with_aff, int policy, int verbose,
+                 const struct phys_const *physical_constants,
+                 const struct hydro_props *hydro,
+                 const struct external_potential *potential) {
 
   /* Clean-up everything */
   bzero(e, sizeof(struct engine));
@@ -2353,90 +2625,153 @@ void engine_init(struct engine *e, struct space *s,
   e->ti_old = 0;
   e->ti_current = 0;
   e->timeStep = 0.;
+  e->timeBase = 0.;
+  e->timeBase_inv = 0.;
+  e->timeFirstSnapshot =
+      parser_get_param_double(params, "Snapshots:time_first");
+  e->deltaTimeSnapshot =
+      parser_get_param_double(params, "Snapshots:delta_time");
+  e->ti_nextSnapshot = 0;
+  parser_get_param_string(params, "Snapshots:basename", e->snapshotBaseName);
+  e->snapshotUnits = malloc(sizeof(struct UnitSystem));
+  units_init(e->snapshotUnits, params, "Snapshots");
   e->dt_min = parser_get_param_double(params, "TimeIntegration:dt_min");
   e->dt_max = parser_get_param_double(params, "TimeIntegration:dt_max");
   e->file_stats = NULL;
+  e->deltaTimeStatistics =
+      parser_get_param_double(params, "Statistics:delta_time");
+  e->timeLastStatistics = e->timeBegin - e->deltaTimeStatistics;
   e->verbose = verbose;
   e->count_step = 0;
   e->wallclock_time = 0.f;
+  e->physical_constants = physical_constants;
+  e->hydro_properties = hydro;
+  e->external_potential = potential;
+  e->parameter_file = params;
   engine_rank = nodeID;
 
   /* Make the space link back to the engine. */
   s->e = e;
 
   /* Get the number of queues */
-  int nr_queues = parser_get_param_int(params, "Scheduler:nr_queues");
+  int nr_queues =
+      parser_get_opt_param_int(params, "Scheduler:nr_queues", nr_threads);
   if (nr_queues <= 0) nr_queues = e->nr_threads;
+  if (nr_queues != nr_threads)
+    message("Number of task queues set to %d", nr_queues);
   s->nr_queues = nr_queues;
 
+/* Deal with affinity. For now, just figure out the number of cores. */
 #if defined(HAVE_SETAFFINITY)
   const int nr_cores = sysconf(_SC_NPROCESSORS_ONLN);
-  int cpuid[nr_cores];
+  cpu_set_t *entry_affinity = engine_entry_affinity();
+  const int nr_affinity_cores = CPU_COUNT(entry_affinity);
+
+  if (nr_cores > CPU_SETSIZE) /* Unlikely, except on e.g. SGI UV. */
+    error("must allocate dynamic cpu_set_t (too many cores per node)");
+
+  char *buf = malloc((nr_cores + 1) * sizeof(char));
+  buf[nr_cores] = '\0';
+  for (int j = 0; j < nr_cores; ++j) {
+    /* Reversed bit order from convention, but same as e.g. Intel MPI's
+     * I_MPI_PIN_DOMAIN explicit mask: left-to-right, LSB-to-MSB. */
+    buf[j] = CPU_ISSET(j, entry_affinity) ? '1' : '0';
+  }
+
+  if (verbose && with_aff) message("Affinity at entry: %s", buf);
+
+  int *cpuid = malloc(nr_affinity_cores * sizeof(int));
   cpu_set_t cpuset;
-  if ((policy & engine_policy_cputight) == engine_policy_cputight) {
-    for (int k = 0; k < nr_cores; k++) cpuid[k] = k;
-  } else {
-    /*  Get next highest power of 2. */
-    int maxint = 1;
-    while (maxint < nr_cores) maxint *= 2;
 
-    cpuid[0] = 0;
-    int k = 1;
-    for (int i = 1; i < maxint; i *= 2)
-      for (int j = maxint / i / 2; j < maxint; j += maxint / i)
-        if (j < nr_cores && j != 0) cpuid[k++] = j;
+  int skip = 0;
+  for (int k = 0; k < nr_affinity_cores; k++) {
+    int c;
+    for (c = skip; c < CPU_SETSIZE && !CPU_ISSET(c, entry_affinity); ++c)
+      ;
+    cpuid[k] = c;
+    skip = c + 1;
+  }
+
+  if (with_aff) {
 
 #if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
-    /* Ascending NUMA distance. Bubblesort(!) for stable equidistant CPUs. */
-    if (numa_available() >= 0) {
-      if (nodeID == 0) message("prefer NUMA-local CPUs");
-
-      const int home = numa_node_of_cpu(sched_getcpu());
-      const int half = nr_cores / 2;
-      const bool swap_hyperthreads = hyperthreads_present();
-      bool done = false;
-      if (swap_hyperthreads && nodeID == 0)
-        message("prefer physical cores to hyperthreads");
-
-      while (!done) {
-        done = true;
-        for (int i = 1; i < nr_cores; i++) {
-          const int node_a = numa_node_of_cpu(cpuid[i - 1]);
-          const int node_b = numa_node_of_cpu(cpuid[i]);
-
-          /* Avoid using local hyperthreads over unused remote physical cores.
-           * Assume two hyperthreads, and that cpuid >= half partitions them.
-           */
-          const int thread_a = swap_hyperthreads && cpuid[i - 1] >= half;
-          const int thread_b = swap_hyperthreads && cpuid[i] >= half;
-
-          bool swap = thread_a > thread_b;
-          if (thread_a == thread_b)
-            swap = numa_distance(home, node_a) > numa_distance(home, node_b);
-
-          if (swap) {
-            const int t = cpuid[i - 1];
-            cpuid[i - 1] = cpuid[i];
-            cpuid[i] = t;
-            done = false;
+    if ((policy & engine_policy_cputight) != engine_policy_cputight) {
+
+      if (numa_available() >= 0) {
+        if (nodeID == 0) message("prefer NUMA-distant CPUs");
+
+        /* Get list of numa nodes of all available cores. */
+        int *nodes = malloc(nr_affinity_cores * sizeof(int));
+        int nnodes = 0;
+        for (int i = 0; i < nr_affinity_cores; i++) {
+          nodes[i] = numa_node_of_cpu(cpuid[i]);
+          if (nodes[i] > nnodes) nnodes = nodes[i];
+        }
+        nnodes += 1;
+
+        /* Count cores per node. */
+        int *core_counts = malloc(nnodes * sizeof(int));
+        for (int i = 0; i < nr_affinity_cores; i++) {
+          core_counts[nodes[i]] = 0;
+        }
+        for (int i = 0; i < nr_affinity_cores; i++) {
+          core_counts[nodes[i]] += 1;
+        }
+
+        /* Index cores within each node. */
+        int *core_indices = malloc(nr_affinity_cores * sizeof(int));
+        for (int i = nr_affinity_cores - 1; i >= 0; i--) {
+          core_indices[i] = core_counts[nodes[i]];
+          core_counts[nodes[i]] -= 1;
+        }
+
+        /* Now sort so that we pick adjacent cpuids from different nodes
+         * by sorting internal node core indices. */
+        int done = 0;
+        while (!done) {
+          done = 1;
+          for (int i = 1; i < nr_affinity_cores; i++) {
+            if (core_indices[i] < core_indices[i - 1]) {
+              int t = cpuid[i - 1];
+              cpuid[i - 1] = cpuid[i];
+              cpuid[i] = t;
+
+              t = core_indices[i - 1];
+              core_indices[i - 1] = core_indices[i];
+              core_indices[i] = t;
+              done = 0;
+            }
           }
         }
+
+        free(nodes);
+        free(core_counts);
+        free(core_indices);
       }
     }
 #endif
+  } else {
+    if (nodeID == 0) message("no processor affinity used");
+
+  } /* with_aff */
+
+  /* Avoid (unexpected) interference between engine and runner threads. We can
+   * do this once we've made at least one call to engine_entry_affinity and
+   * maybe numa_node_of_cpu(sched_getcpu()), even if the engine isn't already
+   * pinned. Also unpin this when asked to not pin at all (!with_aff). */
+  engine_unpin();
+#endif
 
-    if (nodeID == 0) {
+  if (with_aff) {
 #ifdef WITH_MPI
-      printf("[%04i] %s engine_init: cpu map is [ ", nodeID,
-             clocks_get_timesincestart());
+    printf("[%04i] %s engine_init: cpu map is [ ", nodeID,
+           clocks_get_timesincestart());
 #else
-      printf("%s engine_init: cpu map is [ ", clocks_get_timesincestart());
+    printf("%s engine_init: cpu map is [ ", clocks_get_timesincestart());
 #endif
-      for (int i = 0; i < nr_cores; i++) printf("%i ", cpuid[i]);
-      printf("].\n");
-    }
+    for (int i = 0; i < nr_affinity_cores; i++) printf("%i ", cpuid[i]);
+    printf("].\n");
   }
-#endif
 
   /* Are we doing stuff in parallel? */
   if (nr_nodes > 1) {
@@ -2456,20 +2791,18 @@ void engine_init(struct engine *e, struct space *s,
   if (e->nodeID == 0) {
     e->file_stats = fopen("energy.txt", "w");
     fprintf(e->file_stats,
-            "# Step Time E_kin E_int E_pot E_tot "
-            "p_x p_y p_z ang_x ang_y ang_z\n");
+            "# %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s\n",
+            "Time", "Mass", "E_tot", "E_kin", "E_int", "E_pot", "p_x", "p_y",
+            "p_z", "ang_x", "ang_y", "ang_z");
+    fflush(e->file_stats);
   }
 
   /* Print policy */
   engine_print_policy(e);
 
   /* Print information about the hydro scheme */
-  if ((e->policy & engine_policy_hydro) == engine_policy_hydro) {
-    if (e->nodeID == 0) message("Hydrodynamic scheme: %s.", SPH_IMPLEMENTATION);
-    if (e->nodeID == 0)
-      message("Hydrodynamic kernel: %s with %.2f +/- %.2f neighbours.",
-              kernel_name, kernel_nwneigh, const_delta_nwneigh);
-  }
+  if (e->policy & engine_policy_hydro)
+    if (e->nodeID == 0) hydro_props_print(e->hydro_properties);
 
   /* Check we have sensible time bounds */
   if (e->timeBegin >= e->timeEnd)
@@ -2487,10 +2820,11 @@ void engine_init(struct engine *e, struct space *s,
 
   /* Deal with timestep */
   e->timeBase = (e->timeEnd - e->timeBegin) / max_nr_timesteps;
+  e->timeBase_inv = 1.0 / e->timeBase;
   e->ti_current = 0;
 
   /* Fixed time-step case */
-  if ((e->policy & engine_policy_fixdt) == engine_policy_fixdt) {
+  if (e->policy & engine_policy_fixdt) {
     e->dt_min = e->dt_max;
 
     /* Find timestep on the timeline */
@@ -2527,6 +2861,19 @@ void engine_init(struct engine *e, struct space *s,
     error("Maximal time-step size larger than the simulation run time t=%e",
           e->timeEnd - e->timeBegin);
 
+  /* Deal with outputs */
+  if (e->deltaTimeSnapshot < 0.)
+    error("Time between snapshots (%e) must be positive.",
+          e->deltaTimeSnapshot);
+
+  if (e->timeFirstSnapshot < e->timeBegin)
+    error(
+        "Time of first snapshot (%e) must be after the simulation start t=%e.",
+        e->timeFirstSnapshot, e->timeBegin);
+
+  /* Find the time of the first output */
+  engine_compute_next_snapshot_time(e);
+
 /* Construct types for MPI communications */
 #ifdef WITH_MPI
   part_create_mpi_types();
@@ -2570,19 +2917,24 @@ void engine_init(struct engine *e, struct space *s,
     if (pthread_create(&e->runners[k].thread, NULL, &runner_main,
                        &e->runners[k]) != 0)
       error("Failed to create runner thread.");
-    if ((e->policy & engine_policy_setaffinity) == engine_policy_setaffinity) {
+
+    /* Try to pin the runner to a given core */
+    if (with_aff &&
+        (e->policy & engine_policy_setaffinity) == engine_policy_setaffinity) {
 #if defined(HAVE_SETAFFINITY)
 
       /* Set a reasonable queue ID. */
-      e->runners[k].cpuid = cpuid[k % nr_cores];
+      int coreid = k % nr_affinity_cores;
+      e->runners[k].cpuid = cpuid[coreid];
+
       if (nr_queues < e->nr_threads)
-        e->runners[k].qid = cpuid[k % nr_cores] * nr_queues / nr_cores;
+        e->runners[k].qid = cpuid[coreid] * nr_queues / nr_affinity_cores;
       else
         e->runners[k].qid = k;
 
       /* Set the cpu mask to zero | e->id. */
       CPU_ZERO(&cpuset);
-      CPU_SET(cpuid[k % nr_cores], &cpuset);
+      CPU_SET(cpuid[coreid], &cpuset);
 
       /* Apply this mask to the runner's pthread. */
       if (pthread_setaffinity_np(e->runners[k].thread, sizeof(cpu_set_t),
@@ -2596,10 +2948,24 @@ void engine_init(struct engine *e, struct space *s,
       e->runners[k].cpuid = k;
       e->runners[k].qid = k * nr_queues / e->nr_threads;
     }
-    // message( "runner %i on cpuid=%i with qid=%i." , e->runners[k].id ,
-    // e->runners[k].cpuid , e->runners[k].qid );
+    if (verbose) {
+      if (with_aff)
+        message("runner %i on cpuid=%i with qid=%i.", e->runners[k].id,
+                e->runners[k].cpuid, e->runners[k].qid);
+      else
+        message("runner %i using qid=%i no cpuid.", e->runners[k].id,
+                e->runners[k].qid);
+    }
   }
 
+/* Free the affinity stuff */
+#if defined(HAVE_SETAFFINITY)
+  if (with_aff) {
+    free(cpuid);
+    free(buf);
+  }
+#endif
+
   /* Wait for the runner threads to be in place. */
   while (e->barrier_running || e->barrier_launch)
     if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0)
@@ -2631,3 +2997,33 @@ void engine_print_policy(struct engine *e) {
   fflush(stdout);
 #endif
 }
+
+/**
+ * @brief Computes the next snapshot time on the integer time line and stores
+ * it in e->ti_nextSnapshot (-1 if no output is left before the run ends).
+ *
+ * @param e The #engine.
+ */
+void engine_compute_next_snapshot_time(struct engine *e) {
+  /* A non-positive step would never advance the search loop below */
+  if (e->deltaTimeSnapshot <= 0.) error("Snapshot interval must be positive.");
+
+  /* Assume nothing is left, in case the loop has no candidate to look at */
+  e->ti_nextSnapshot = max_nr_timesteps;
+  for (double time = e->timeFirstSnapshot;
+       time < e->timeEnd + e->deltaTimeSnapshot; time += e->deltaTimeSnapshot) {
+    /* Output time on the integer timeline, clamped to stay inside an int */
+    const double ti = (time - e->timeBegin) / e->timeBase;
+    e->ti_nextSnapshot = (ti < max_nr_timesteps) ? (int)ti : max_nr_timesteps;
+    if (e->ti_nextSnapshot > e->ti_current) break;
+  }
+
+  /* Deal with last snapshot */
+  if (e->ti_nextSnapshot >= max_nr_timesteps) {
+    e->ti_nextSnapshot = -1;
+    if (e->verbose) message("No further output time.");
+  } else if (e->verbose) {
+    /* Be nice, talk... (in double: a float would round the printed time) */
+    message("Next output time set to t=%f.",
+            e->ti_nextSnapshot * e->timeBase + e->timeBegin);
+  }
+}
diff --git a/src/engine.h b/src/engine.h
index c8b9d7a46f8c788237e3b9ba7cb9d7d0a6e1370d..d1bfecc568355a9cf5aa57590ccba1df81b05b8a 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -1,6 +1,11 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *                    Angus Lepper (angus.lepper@ed.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -32,14 +37,15 @@
 #include <stdio.h>
 
 /* Includes. */
-#include "lock.h"
-#include "proxy.h"
+#include "clocks.h"
+#include "parser.h"
+#include "partition.h"
+#include "potentials.h"
 #include "runner.h"
 #include "scheduler.h"
 #include "space.h"
 #include "task.h"
-#include "parser.h"
-#include "partition.h"
+#include "units.h"
 
 /* Some constants. */
 enum engine_policy {
@@ -123,9 +129,25 @@ struct engine {
 
   /* Time base */
   double timeBase;
+  double timeBase_inv;
+
+  /* Minimal ti_end for the next time-step */
+  int ti_end_min;
+
+  /* Number of particles updated */
+  size_t updates, g_updates;
 
-  /* File for statistics */
+  /* Snapshot information */
+  double timeFirstSnapshot;
+  double deltaTimeSnapshot;
+  int ti_nextSnapshot;
+  char snapshotBaseName[200];
+  struct UnitSystem *snapshotUnits;
+
+  /* Statistics information */
   FILE *file_stats;
+  double timeLastStatistics;
+  double deltaTimeStatistics;
 
   /* The current step number. */
   int step;
@@ -145,8 +167,8 @@ struct engine {
   struct proxy *proxies;
   int nr_proxies, *proxy_ind;
 
-  /* Tic at the start of a step. */
-  ticks tic_step;
+  /* Tic/toc at the start/end of a step. */
+  ticks tic_step, toc_step;
 
   /* Wallclock time of the last time-step */
   float wallclock_time;
@@ -164,13 +186,30 @@ struct engine {
 
   /* Are we talkative ? */
   int verbose;
+
+  /* Physical constants definition */
+  const struct phys_const *physical_constants;
+
+  /* Properties of the hydro scheme */
+  const struct hydro_props *hydro_properties;
+
+  /* Properties of external gravitational potential */
+  const struct external_potential *external_potential;
+
+  /* The (parsed) parameter file */
+  const struct swift_params *parameter_file;
 };
 
 /* Function prototypes. */
 void engine_barrier(struct engine *e, int tid);
+void engine_compute_next_snapshot_time(struct engine *e);
+void engine_dump_snapshot(struct engine *e);
 void engine_init(struct engine *e, struct space *s,
                  const struct swift_params *params, int nr_nodes, int nodeID,
-                 int nr_threads, int policy, int verbose);
+                 int nr_threads, int with_aff, int policy, int verbose,
+                 const struct phys_const *physical_constants,
+                 const struct hydro_props *hydro,
+                 const struct external_potential *potential);
 void engine_launch(struct engine *e, int nr_runners, unsigned int mask,
                    unsigned int submask);
 void engine_prepare(struct engine *e);
@@ -189,5 +228,7 @@ void engine_redistribute(struct engine *e);
 struct link *engine_addlink(struct engine *e, struct link *l, struct task *t);
 void engine_print_policy(struct engine *e);
 int engine_is_done(struct engine *e);
+void engine_pin(void);
+void engine_unpin(void);
 
 #endif /* SWIFT_ENGINE_H */
diff --git a/src/gravity/Default/gravity.h b/src/gravity/Default/gravity.h
index 92a9f64c1f84a9e949f4c0e9485f892b5c808cdc..0f62511eced181fbf3b5b781a50314dd08f7c0ef 100644
--- a/src/gravity/Default/gravity.h
+++ b/src/gravity/Default/gravity.h
@@ -1,6 +1,7 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Coypright (c) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2016 Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -18,19 +19,48 @@
  ******************************************************************************/
 
 #include <float.h>
+#include "potentials.h"
 
 /**
- * @brief Computes the gravity time-step of a given particle
+ * @brief Computes the gravity time-step of a given particle due to an external
+ * potential.
  *
- * @param gp Pointer to the g-particle data
+ * Starts at FLT_MAX and takes the minimum over the potentials compiled in.
  *
+ * @param potential The properties of the external potential.
+ * @param phys_const The physical constants in internal units.
+ * @param gp Pointer to the g-particle data.
  */
+__attribute__((always_inline)) INLINE static float
+gravity_compute_timestep_external(const struct external_potential* potential,
+                                  const struct phys_const* const phys_const,
+                                  const struct gpart* const gp) {
 
-__attribute__((always_inline))
-    INLINE static float gravity_compute_timestep(struct gpart* gp) {
+  float dt = FLT_MAX;
 
-  /* Currently no limit is imposed */
-  return FLT_MAX;
+#ifdef EXTERNAL_POTENTIAL_POINTMASS
+  dt =
+      fminf(dt, external_gravity_pointmass_timestep(potential, phys_const, gp));
+#endif
+
+  return dt;
+}
+
+/**
+ * @brief Computes the gravity time-step of a given particle due to self-gravity
+ *
+ * No limit is currently imposed: the function always returns FLT_MAX.
+ *
+ * @param phys_const The physical constants in internal units (unused).
+ * @param gp Pointer to the g-particle data (unused).
+ */
+__attribute__((always_inline)) INLINE static float
+gravity_compute_timestep_self(const struct phys_const* const phys_const,
+                              const struct gpart* const gp) {
+
+  float dt = FLT_MAX;
+
+  return dt;
 }
 
 /**
@@ -41,8 +71,8 @@ __attribute__((always_inline))
  *
  * @param gp The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void gravity_first_init_gpart(struct gpart* gp) {}
+__attribute__((always_inline)) INLINE static void gravity_first_init_gpart(
+    struct gpart* gp) {}
 
 /**
  * @brief Prepares a g-particle for the gravity calculation
@@ -52,8 +82,8 @@ __attribute__((always_inline))
  *
  * @param gp The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void gravity_init_part(struct gpart* gp) {
+__attribute__((always_inline)) INLINE static void gravity_init_part(
+    struct gpart* gp) {
 
   /* Zero the acceleration */
   gp->a_grav[0] = 0.f;
@@ -68,8 +98,26 @@ __attribute__((always_inline))
  *
  * @param gp The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void gravity_end_force(struct gpart* gp) {}
+__attribute__((always_inline)) INLINE static void gravity_end_force(
+    struct gpart* gp) {}
+
+/**
+ * @brief Computes the gravitational acceleration induced by external potentials
+ *
+ * Forwards to the potential enabled at compile time; otherwise does nothing.
+ *
+ * @param potential The properties of the external potential.
+ * @param phys_const The physical constants in internal units.
+ * @param gp The particle to act upon.
+ */
+__attribute__((always_inline)) INLINE static void external_gravity(
+    const struct external_potential* potential,
+    const struct phys_const* const phys_const, struct gpart* gp) {
+
+#ifdef EXTERNAL_POTENTIAL_POINTMASS
+  external_gravity_pointmass(potential, phys_const, gp);
+#endif
+}
 
 /**
  * @brief Kick the additional variables
diff --git a/src/gravity/Default/gravity_debug.h b/src/gravity/Default/gravity_debug.h
index 531afffa5c2958eea49fe49171cde81fa8350fcf..7cf375a1fdf7bccc4131dc415ab2d4acbbf2d3bc 100644
--- a/src/gravity/Default/gravity_debug.h
+++ b/src/gravity/Default/gravity_debug.h
@@ -17,12 +17,13 @@
  *
  ******************************************************************************/
 
-__attribute__((always_inline))
-    INLINE static void gravity_debug_particle(const struct gpart* p) {
+__attribute__((always_inline)) INLINE static void gravity_debug_particle(
+    const struct gpart* p) {
   printf(
       "x=[%.3e,%.3e,%.3e], "
       "v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e],\n "
       "mass=%.3e t_begin=%d, t_end=%d\n",
       p->x[0], p->x[1], p->x[2], p->v_full[0], p->v_full[1], p->v_full[2],
-      p->a_grav[0], p->a_grav[1], p->a_grav[2], p->mass, p->ti_begin, p->ti_end);
+      p->a_grav[0], p->a_grav[1], p->a_grav[2], p->mass, p->ti_begin,
+      p->ti_end);
 }
diff --git a/src/gravity/Default/gravity_part.h b/src/gravity/Default/gravity_part.h
index d36ceea650a54e1fdd0ff1fcf162a830dc5ed7cb..b6c9e62559207cb25323c8a108e4bffc87ea0fcf 100644
--- a/src/gravity/Default/gravity_part.h
+++ b/src/gravity/Default/gravity_part.h
@@ -25,6 +25,9 @@ struct gpart {
   /* Particle position. */
   double x[3];
 
+  /* Offset between current position and position at last tree rebuild. */
+  float x_diff[3];
+
   /* Particle velocity. */
   float v_full[3];
 
@@ -40,6 +43,10 @@ struct gpart {
   /* Particle time of end of time-step. */
   int ti_end;
 
+  /* /\* current time of x, and of v_full *\/ */
+  /* float tx; */
+  /* float tv; */
+
   /* Particle ID. If negative, it is the negative offset of the #part with
      which this gpart is linked. */
   long long id_or_neg_offset;
diff --git a/src/hydro.h b/src/hydro.h
index aacbb6ac1d16b38133ee573ee2b7ad95918fc9e5..b2ae9d57c399ecea818e9f3dc7db238e01487a9a 100644
--- a/src/hydro.h
+++ b/src/hydro.h
@@ -19,20 +19,24 @@
 #ifndef SWIFT_HYDRO_H
 #define SWIFT_HYDRO_H
 
-#include "./const.h"
+/* Includes. */
+#include "const.h"
+#include "hydro_properties.h"
+#include "kernel_hydro.h"
+#include "part.h"
 
 /* Import the right functions */
 #if defined(MINIMAL_SPH)
-#include "./hydro/Minimal/hydro_iact.h"
 #include "./hydro/Minimal/hydro.h"
+#include "./hydro/Minimal/hydro_iact.h"
 #define SPH_IMPLEMENTATION "Minimal version of SPH (e.g. Price 2010)"
 #elif defined(GADGET2_SPH)
-#include "./hydro/Gadget2/hydro_iact.h"
 #include "./hydro/Gadget2/hydro.h"
+#include "./hydro/Gadget2/hydro_iact.h"
 #define SPH_IMPLEMENTATION "Gadget-2 version of SPH (Springel 2005)"
 #elif defined(DEFAULT_SPH)
-#include "./hydro/Default/hydro_iact.h"
 #include "./hydro/Default/hydro.h"
+#include "./hydro/Default/hydro_iact.h"
 #define SPH_IMPLEMENTATION "Default version of SPH"
 #else
 #error "Invalid choice of SPH variant"
diff --git a/src/hydro/Default/hydro.h b/src/hydro/Default/hydro.h
index 03953b07ad4e172d96b6e3382814e036a538e2bd..4a6b1900374eb6b422e6fa7422901293b36fd5eb 100644
--- a/src/hydro/Default/hydro.h
+++ b/src/hydro/Default/hydro.h
@@ -27,10 +27,14 @@
  *
  */
 __attribute__((always_inline)) INLINE static float hydro_compute_timestep(
-    struct part* p, struct xpart* xp) {
+    const struct part* p, const struct xpart* xp,
+    const struct hydro_props* hydro_properties) {
+
+  const float CFL_condition = hydro_properties->CFL_condition;
 
   /* CFL condition */
-  const float dt_cfl = 2.f * const_cfl * kernel_gamma * p->h / p->force.v_sig;
+  const float dt_cfl =
+      2.f * kernel_gamma * CFL_condition * p->h / p->force.v_sig;
 
   /* Limit change in u */
   const float dt_u_change =
@@ -49,9 +53,8 @@ __attribute__((always_inline)) INLINE static float hydro_compute_timestep(
  * @param p The particle to act upon
  * @param xp The extended particle data to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_first_init_part(struct part* p, struct xpart* xp) {
-}
+__attribute__((always_inline)) INLINE static void hydro_first_init_part(
+    struct part* p, struct xpart* xp) {}
 
 /**
  * @brief Prepares a particle for the density calculation.
@@ -61,8 +64,8 @@ __attribute__((always_inline))
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_init_part(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_init_part(
+    struct part* p) {
   p->density.wcount = 0.f;
   p->density.wcount_dh = 0.f;
   p->rho = 0.f;
@@ -82,8 +85,8 @@ __attribute__((always_inline))
  * @param p The particle to act upon
  * @param time The current time
  */
-__attribute__((always_inline))
-    INLINE static void hydro_end_density(struct part* p, float time) {
+__attribute__((always_inline)) INLINE static void hydro_end_density(
+    struct part* p, float time) {
 
   /* Some smoothing length multiples. */
   const float h = p->h;
@@ -93,7 +96,7 @@ __attribute__((always_inline))
 
   /* Final operation on the density (add self-contribution). */
   p->rho += p->mass * kernel_root;
-  p->rho_dh -= 3.0f * p->mass * kernel_root * kernel_igamma;
+  p->rho_dh -= 3.0f * p->mass * kernel_root;
   p->density.wcount += kernel_root;
 
   /* Finish the calculation by inserting the missing h-factors */
@@ -101,6 +104,11 @@ __attribute__((always_inline))
   p->rho_dh *= ih4;
   p->density.wcount *= (4.0f / 3.0f * M_PI * kernel_gamma3);
   p->density.wcount_dh *= ih * (4.0f / 3.0f * M_PI * kernel_gamma4);
+
+  const float irho = 1.f / p->rho;
+
+  /* Compute the derivative term */
+  p->rho_dh = 1.f / (1.f + 0.33333333f * p->h * p->rho_dh * irho);
 }
 
 /**
@@ -162,8 +170,8 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force(
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_reset_acceleration(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_reset_acceleration(
+    struct part* p) {
 
   /* Reset the acceleration. */
   p->a_hydro[0] = 0.0f;
@@ -209,8 +217,8 @@ __attribute__((always_inline)) INLINE static void hydro_predict_extra(
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_end_force(struct part* p) {}
+__attribute__((always_inline)) INLINE static void hydro_end_force(
+    struct part* p) {}
 
 /**
  * @brief Kick the additional variables
@@ -230,16 +238,17 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra(
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_convert_quantities(struct part* p) {}
+__attribute__((always_inline)) INLINE static void hydro_convert_quantities(
+    struct part* p) {}
 
 /**
  * @brief Returns the internal energy of a particle
  *
  * @param p The particle of interest
+ * @param dt Time since the last kick (unused here, p->u is returned as is)
  */
-__attribute__((always_inline))
-    INLINE static float hydro_get_internal_energy(struct part* p) {
+__attribute__((always_inline)) INLINE static float hydro_get_internal_energy(
+    const struct part* p, float dt) {
 
   return p->u;
 }
diff --git a/src/hydro/Default/hydro_debug.h b/src/hydro/Default/hydro_debug.h
index 2e7c3d683aa18eee7a550ac89517e3bd01e42107..79ee392d46ca75a2c097bf045b2d82c9f3dc96c0 100644
--- a/src/hydro/Default/hydro_debug.h
+++ b/src/hydro/Default/hydro_debug.h
@@ -17,8 +17,8 @@
  *
  ******************************************************************************/
 
-__attribute__((always_inline))
-    INLINE static void hydro_debug_particle(struct part* p, struct xpart* xp) {
+__attribute__((always_inline)) INLINE static void hydro_debug_particle(
+    struct part* p, struct xpart* xp) {
   printf(
       "x=[%.3e,%.3e,%.3e], "
       "v=[%.3e,%.3e,%.3e],v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e],\n "
diff --git a/src/hydro/Default/hydro_iact.h b/src/hydro/Default/hydro_iact.h
index 4f85299b9d61b3a66389bac3527a63068ab96db9..0a577931b5e0ca67ab07dfe414a548da66e82cdd 100644
--- a/src/hydro/Default/hydro_iact.h
+++ b/src/hydro/Default/hydro_iact.h
@@ -20,12 +20,6 @@
 #ifndef SWIFT_RUNNER_IACT_H
 #define SWIFT_RUNNER_IACT_H
 
-/* Includes. */
-#include "const.h"
-#include "kernel_hydro.h"
-#include "part.h"
-#include "vector.h"
-
 /**
  * @brief SPH interaction functions following the Gadget-2 version of SPH.
  *
@@ -44,7 +38,6 @@
 /**
  * @brief Density loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -218,7 +211,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density(
 /**
  * @brief Density loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -267,12 +259,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 /**
  * @brief Density loop (non-symmetric vectorized version)
  */
-
-__attribute__((always_inline))
-    INLINE static void runner_iact_nonsym_vec_density(float *R2, float *Dx,
-                                                      float *Hi, float *Hj,
-                                                      struct part **pi,
-                                                      struct part **pj) {
+__attribute__((always_inline)) INLINE static void
+runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
+                               struct part **pi, struct part **pj) {
 
 #ifdef VECTORIZE
 
@@ -362,7 +351,6 @@ __attribute__((always_inline))
 /**
  * @brief Force loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -458,7 +446,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
 /**
  * @brief Force loop (Vectorized version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
@@ -677,7 +664,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
 /**
  * @brief Force loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -768,7 +754,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
 /**
  * @brief Force loop (Vectorized non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
     float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
     struct part **pj) {
diff --git a/src/hydro/Default/hydro_io.h b/src/hydro/Default/hydro_io.h
index 0e9ad46ddc1d4e8c8d3ffdbf3e81262ec49a7092..de71963b05149fa0b2df6222424478a6ad9b1f44 100644
--- a/src/hydro/Default/hydro_io.h
+++ b/src/hydro/Default/hydro_io.h
@@ -104,10 +104,6 @@ void writeSPHflavour(hid_t h_grpsph) {
 
   /* Kernel function description */
   writeAttribute_s(h_grpsph, "Kernel", kernel_name);
-  writeAttribute_f(h_grpsph, "Kernel eta", const_eta_kernel);
-  writeAttribute_f(h_grpsph, "Weighted N_ngb", kernel_nwneigh);
-  writeAttribute_f(h_grpsph, "Delta N_ngb", const_delta_nwneigh);
-  writeAttribute_f(h_grpsph, "Hydro gamma", const_hydro_gamma);
 
   /* Viscosity and thermal conduction */
   writeAttribute_s(h_grpsph, "Thermal Conductivity Model",
@@ -123,11 +119,6 @@ void writeSPHflavour(hid_t h_grpsph) {
   writeAttribute_f(h_grpsph, "Viscosity decay length", const_viscosity_length);
 
   /* Time integration properties */
-  writeAttribute_f(h_grpsph, "CFL parameter", const_cfl);
-  writeAttribute_f(h_grpsph, "Maximal ln(Delta h) change over dt",
-                   const_ln_max_h_change);
-  writeAttribute_f(h_grpsph, "Maximal Delta h change over dt",
-                   exp(const_ln_max_h_change));
   writeAttribute_f(h_grpsph, "Maximal Delta u change over dt",
                    const_max_u_change);
 }
diff --git a/src/hydro/Default/hydro_part.h b/src/hydro/Default/hydro_part.h
index 60453d0c7995f7af2a3166502a24aa590873a043..a4096d5c30d525307d5327559ef6b007c6931486 100644
--- a/src/hydro/Default/hydro_part.h
+++ b/src/hydro/Default/hydro_part.h
@@ -20,8 +20,8 @@
 /* Extra particle data not needed during the SPH loops over neighbours. */
 struct xpart {
 
-  /* Old position, at last tree rebuild. */
-  double x_old[3];
+  /* Offset between current position and position at last tree rebuild. */
+  float x_diff[3];
 
   /* Velocity at the last full step. */
   float v_full[3];
diff --git a/src/hydro/Gadget2/hydro.h b/src/hydro/Gadget2/hydro.h
index 22c5734ed5762400285521b30f9aa60795c45325..0973acb0fb46411c778f2551fbe91621825f0278 100644
--- a/src/hydro/Gadget2/hydro.h
+++ b/src/hydro/Gadget2/hydro.h
@@ -25,20 +25,16 @@
  *
  */
 __attribute__((always_inline)) INLINE static float hydro_compute_timestep(
-    struct part* p, struct xpart* xp) {
+    const struct part* p, const struct xpart* xp,
+    const struct hydro_props* hydro_properties) {
 
-  /* Acceleration */
-  float ac =
-      sqrtf(p->a_hydro[0] * p->a_hydro[0] + p->a_hydro[1] * p->a_hydro[1] +
-            p->a_hydro[2] * p->a_hydro[2]);
-  ac = fmaxf(ac, 1e-30);
-
-  const float dt_accel = sqrtf(2.f);  // MATTHIEU
+  const float CFL_condition = hydro_properties->CFL_condition;
 
   /* CFL condition */
-  const float dt_cfl = 2.f * const_cfl * kernel_gamma * p->h / p->force.v_sig;
+  const float dt_cfl =
+      2.f * kernel_gamma * CFL_condition * p->h / p->force.v_sig;
 
-  return fminf(dt_cfl, dt_accel);
+  return dt_cfl;
 }
 
 /**
@@ -50,9 +46,8 @@ __attribute__((always_inline)) INLINE static float hydro_compute_timestep(
  * @param p The particle to act upon
  * @param xp The extended particle data to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_first_init_part(struct part* p, struct xpart* xp) {
-}
+__attribute__((always_inline)) INLINE static void hydro_first_init_part(
+    struct part* p, struct xpart* xp) {}
 
 /**
  * @brief Prepares a particle for the density calculation.
@@ -62,8 +57,8 @@ __attribute__((always_inline))
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_init_part(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_init_part(
+    struct part* p) {
   p->density.wcount = 0.f;
   p->density.wcount_dh = 0.f;
   p->rho = 0.f;
@@ -83,8 +78,8 @@ __attribute__((always_inline))
  * @param p The particle to act upon
  * @param ti_current The current time (on the integer timeline)
  */
-__attribute__((always_inline))
-    INLINE static void hydro_end_density(struct part* p, int ti_current) {
+__attribute__((always_inline)) INLINE static void hydro_end_density(
+    struct part* p, int ti_current) {
 
   /* Some smoothing length multiples. */
   const float h = p->h;
@@ -94,7 +89,7 @@ __attribute__((always_inline))
 
   /* Final operation on the density (add self-contribution). */
   p->rho += p->mass * kernel_root;
-  p->rho_dh -= 3.0f * p->mass * kernel_root * kernel_igamma;
+  p->rho_dh -= 3.0f * p->mass * kernel_root;
   p->density.wcount += kernel_root;
 
   /* Finish the calculation by inserting the missing h-factors */
@@ -152,8 +147,8 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force(
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_reset_acceleration(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_reset_acceleration(
+    struct part* p) {
 
   /* Reset the acceleration. */
   p->a_hydro[0] = 0.0f;
@@ -197,8 +192,8 @@ __attribute__((always_inline)) INLINE static void hydro_predict_extra(
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_end_force(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_end_force(
+    struct part* p) {
 
   p->entropy_dt *=
       (const_hydro_gamma - 1.f) * powf(p->rho, -(const_hydro_gamma - 1.f));
@@ -234,8 +229,8 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra(
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_convert_quantities(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_convert_quantities(
+    struct part* p) {
 
   p->entropy = (const_hydro_gamma - 1.f) * p->entropy *
                powf(p->rho, -(const_hydro_gamma - 1.f));
@@ -245,10 +240,13 @@ __attribute__((always_inline))
  * @brief Returns the internal energy of a particle
  *
  * @param p The particle of interest
+ * @param dt Time since the last kick, used to extrapolate the entropy
  */
-__attribute__((always_inline))
-    INLINE static float hydro_get_internal_energy(struct part* p) {
+__attribute__((always_inline)) INLINE static float hydro_get_internal_energy(
+    const struct part* p, float dt) {
+
+  const float entropy = p->entropy + p->entropy_dt * dt;
 
-  return p->entropy * powf(p->rho, const_hydro_gamma - 1.f) *
+  return entropy * powf(p->rho, const_hydro_gamma - 1.f) *
          (1.f / (const_hydro_gamma - 1.f));
 }
diff --git a/src/hydro/Gadget2/hydro_debug.h b/src/hydro/Gadget2/hydro_debug.h
index f96630774fa343076ea26182a1a607f3e7897e77..b67e79182ccaaee7c0421c57a91ec9fa2adae65c 100644
--- a/src/hydro/Gadget2/hydro_debug.h
+++ b/src/hydro/Gadget2/hydro_debug.h
@@ -23,14 +23,15 @@ __attribute__((always_inline)) INLINE static void hydro_debug_particle(
       "x=[%.3e,%.3e,%.3e], "
       "v=[%.3e,%.3e,%.3e],v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e],\n "
       "h=%.3e, "
-      "wcount=%d, wcount_dh=%.3e, m=%.3e, dh_drho=%.3e, rho=%.3e, P=%.3e, S=%.3e, "
+      "wcount=%d, wcount_dh=%.3e, m=%.3e, dh_drho=%.3e, rho=%.3e, P=%.3e, "
+      "S=%.3e, "
       "dS/dt=%.3e, c=%.3e\n"
       "divV=%.3e, curlV=%.3e, rotV=[%.3e,%.3e,%.3e]  \n "
       "v_sig=%e dh/dt=%.3e t_begin=%d, t_end=%d\n",
       p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], xp->v_full[0],
       xp->v_full[1], xp->v_full[2], p->a_hydro[0], p->a_hydro[1], p->a_hydro[2],
-      p->h, (int)p->density.wcount, p->density.wcount_dh, p->mass, p->rho_dh, p->rho,
-      p->force.pressure, p->entropy, p->entropy_dt, p->force.soundspeed,
+      p->h, (int)p->density.wcount, p->density.wcount_dh, p->mass, p->rho_dh,
+      p->rho, p->force.pressure, p->entropy, p->entropy_dt, p->force.soundspeed,
       p->div_v, p->force.curl_v, p->density.rot_v[0], p->density.rot_v[1],
       p->density.rot_v[2], p->force.v_sig, p->h_dt, p->ti_begin, p->ti_end);
 }
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index d988c678affcf4ca722a965a7e52a7c120b4a924..8738b4be09931df4c938f1dff3adeed11468dcfc 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -20,12 +20,6 @@
 #ifndef SWIFT_RUNNER_IACT_LEGACY_H
 #define SWIFT_RUNNER_IACT_LEGACY_H
 
-/* Includes. */
-#include "const.h"
-#include "kernel_hydro.h"
-#include "part.h"
-#include "vector.h"
-
 /**
  * @brief SPH interaction functions following the Gadget-2 version of SPH.
  *
@@ -42,7 +36,6 @@
 /**
  * @brief Density loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -65,7 +58,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 
   /* Compute contribution to the density */
   pi->rho += mj * wi;
-  pi->rho_dh -= mj * kernel_igamma * (3.f * wi + ui * wi_dx);
+  pi->rho_dh -= mj * (3.f * wi + ui * wi_dx);
 
   /* Compute contribution to the number of neighbours */
   pi->density.wcount += wi;
@@ -78,7 +71,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 
   /* Compute contribution to the density */
   pj->rho += mi * wj;
-  pj->rho_dh -= mi * kernel_igamma * (3.f * wj + uj * wj_dx);
+  pj->rho_dh -= mi * (3.f * wj + uj * wj_dx);
 
   /* Compute contribution to the number of neighbours */
   pj->density.wcount += wj;
@@ -113,7 +106,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 /**
  * @brief Density loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -134,7 +126,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 
   /* Compute contribution to the density */
   pi->rho += mj * wi;
-  pi->rho_dh -= mj * kernel_igamma * (3.f * wi + u * wi_dx);
+  pi->rho_dh -= mj * (3.f * wi + u * wi_dx);
 
   /* Compute contribution to the number of neighbours */
   pi->density.wcount += wi;
@@ -162,7 +154,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 /**
  * @brief Force loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -260,7 +251,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
 /**
  * @brief Force loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
diff --git a/src/hydro/Gadget2/hydro_io.h b/src/hydro/Gadget2/hydro_io.h
index c1c59dfa4980a2843e7e13bee4c964c9b254cae6..b977f25386fab0787c925635297a48fa85a8df24 100644
--- a/src/hydro/Gadget2/hydro_io.h
+++ b/src/hydro/Gadget2/hydro_io.h
@@ -83,8 +83,8 @@ __attribute__((always_inline)) INLINE static void hydro_write_particles(
   writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength",
              FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us,
              UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy",
-             FLOAT, N, 1, parts, N_total, mpi_rank, offset, entropy, us,
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Entropy", FLOAT, N,
+             1, parts, N_total, mpi_rank, offset, entropy, us,
              UNIT_CONV_ENTROPY_PER_UNIT_MASS);
   writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
              ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us,
@@ -104,10 +104,6 @@ void writeSPHflavour(hid_t h_grpsph) {
 
   /* Kernel function description */
   writeAttribute_s(h_grpsph, "Kernel", kernel_name);
-  writeAttribute_f(h_grpsph, "Kernel eta", const_eta_kernel);
-  writeAttribute_f(h_grpsph, "Weighted N_ngb", kernel_nwneigh);
-  writeAttribute_f(h_grpsph, "Delta N_ngb", const_delta_nwneigh);
-  writeAttribute_f(h_grpsph, "Hydro gamma", const_hydro_gamma);
 
   /* Viscosity and thermal conduction */
   writeAttribute_s(h_grpsph, "Thermal Conductivity Model",
@@ -116,11 +112,4 @@ void writeSPHflavour(hid_t h_grpsph) {
                    "Legacy Gadget-2 as in Springel (2005)");
   writeAttribute_f(h_grpsph, "Viscosity alpha", const_viscosity_alpha);
   writeAttribute_f(h_grpsph, "Viscosity beta", 3.f);
-
-  /* Time integration properties */
-  writeAttribute_f(h_grpsph, "CFL parameter", const_cfl);
-  writeAttribute_f(h_grpsph, "Maximal ln(Delta h) change over dt",
-                   const_ln_max_h_change);
-  writeAttribute_f(h_grpsph, "Maximal Delta h change over dt",
-                   exp(const_ln_max_h_change));
 }
diff --git a/src/hydro/Gadget2/hydro_part.h b/src/hydro/Gadget2/hydro_part.h
index 05754d07dd70bed071e99c86b95eb17eb2194012..863bdbefde4543c1f1f6b0415dc6c229f3d58012 100644
--- a/src/hydro/Gadget2/hydro_part.h
+++ b/src/hydro/Gadget2/hydro_part.h
@@ -20,8 +20,8 @@
 /* Extra particle data not needed during the SPH loops over neighbours. */
 struct xpart {
 
-  /* Old position, at last tree rebuild. */
-  double x_old[3];
+  /* Offset between current position and position at last tree rebuild. */
+  float x_diff[3];
 
   /* Velocity at the last full step. */
   float v_full[3];
diff --git a/src/hydro/Gizmo/hydro.h b/src/hydro/Gizmo/hydro.h
index f3553a009c22a2f6353796e8a278f0db7d66d294..f69dc3f1798f014e895c4a63760805b1739cec94 100644
--- a/src/hydro/Gizmo/hydro.h
+++ b/src/hydro/Gizmo/hydro.h
@@ -39,17 +39,16 @@ __attribute__((always_inline)) INLINE static float hydro_compute_timestep(
  * @param p The particle to act upon
  * @param xp The extended particle data to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_first_init_part(struct part* p, struct xpart* xp) {
-}
+__attribute__((always_inline)) INLINE static void hydro_first_init_part(
+    struct part* p, struct xpart* xp) {}
 
 /**
  * @brief Prepares a particle for the volume calculation.
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_init_part(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_init_part(
+    struct part* p) {
 
 #ifdef SPH_GRADIENTS
   /* use the old volumes to estimate new primitive variables to be used for the
@@ -127,8 +126,8 @@ __attribute__((always_inline))
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_end_volume(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_end_volume(
+    struct part* p) {
 
   /* Some smoothing length multiples. */
   const float h = p->h;
@@ -387,8 +386,8 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_gradient(
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_end_gradient(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_end_gradient(
+    struct part* p) {
 
 #ifndef SPH_GRADIENTS
   float h, ih, ih2, ih3;
@@ -531,8 +530,8 @@ __attribute__((always_inline))
  * @param p The particle to act upon
  * @param xp The extended particle data to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_prepare_fluxes(struct part* p, struct xpart* xp) {
+__attribute__((always_inline)) INLINE static void hydro_prepare_fluxes(
+    struct part* p, struct xpart* xp) {
 
   /* initialize variables used for timestep calculation */
   p->timestepvars.vmax = 0.0f;
@@ -546,8 +545,8 @@ __attribute__((always_inline))
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_reset_acceleration(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_reset_acceleration(
+    struct part* p) {
 
   /* figure out what to put here */
 }
@@ -559,8 +558,8 @@ __attribute__((always_inline))
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_end_fluxes(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_end_fluxes(
+    struct part* p) {
 
   /* do nothing */
 }
@@ -572,8 +571,8 @@ __attribute__((always_inline))
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_convert_quantities(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_convert_quantities(
+    struct part* p) {
 
   float volume;
   GFLOAT m;
@@ -605,17 +604,17 @@ __attribute__((always_inline))
 }
 
 // MATTHIEU
-__attribute__((always_inline))
-    INLINE static void hydro_end_density(struct part* p, float time) {}
+__attribute__((always_inline)) INLINE static void hydro_end_density(
+    struct part* p, float time) {}
 __attribute__((always_inline)) INLINE static void hydro_prepare_force(
     struct part* p, struct xpart* xp, int ti_current, double timeBase) {}
 __attribute__((always_inline)) INLINE static void hydro_predict_extra(
     struct part* p, struct xpart* xp, int t0, int t1, double timeBase) {}
-__attribute__((always_inline))
-    INLINE static void hydro_end_force(struct part* p) {}
+__attribute__((always_inline)) INLINE static void hydro_end_force(
+    struct part* p) {}
 __attribute__((always_inline)) INLINE static void hydro_kick_extra(
     struct part* p, struct xpart* xp, float dt, float half_dt) {}
-__attribute__((always_inline))
-    INLINE static float hydro_get_internal_energy(struct part* p) {
+__attribute__((always_inline)) INLINE static float hydro_get_internal_energy(
+    struct part* p) {
   return 0.f;
 }
diff --git a/src/hydro/Gizmo/hydro_debug.h b/src/hydro/Gizmo/hydro_debug.h
index 2cc957ed883436ce57e9d53d00a073693c9495df..365d85a2f651cf98b0713e8d82f11ae70fa9beaa 100644
--- a/src/hydro/Gizmo/hydro_debug.h
+++ b/src/hydro/Gizmo/hydro_debug.h
@@ -17,8 +17,8 @@
  *
  ******************************************************************************/
 
-__attribute__((always_inline))
-    INLINE static void hydro_debug_particle(struct part* p, struct xpart* xp) {
+__attribute__((always_inline)) INLINE static void hydro_debug_particle(
+    struct part* p, struct xpart* xp) {
   printf(
       "x=[%.16e,%.16e,%.16e], "
       "v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], volume=%.3e\n",
diff --git a/src/hydro/Gizmo/hydro_iact.h b/src/hydro/Gizmo/hydro_iact.h
index 4fe875d3d07315051ef8b3051665a9ea0ef261b8..30a8d6cbebc851b44a5ee2339950aec9e15057c0 100644
--- a/src/hydro/Gizmo/hydro_iact.h
+++ b/src/hydro/Gizmo/hydro_iact.h
@@ -194,11 +194,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_hydro_loop1(
 }
 
 /* this corresponds to task_subtype_hydro_loop1 */
-__attribute__((always_inline))
-    INLINE static void runner_iact_nonsym_hydro_loop1(float r2, float *dx,
-                                                      float hi, float hj,
-                                                      struct part *pi,
-                                                      struct part *pj) {
+__attribute__((always_inline)) INLINE static void
+runner_iact_nonsym_hydro_loop1(float r2, float *dx, float hi, float hj,
+                               struct part *pi, struct part *pj) {
 
   float r;
   float xi;
@@ -487,11 +485,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_hydro_loop2(
 #endif
 }
 
-__attribute__((always_inline))
-    INLINE static void runner_iact_nonsym_hydro_loop2(float r2, float *dx,
-                                                      float hi, float hj,
-                                                      struct part *pi,
-                                                      struct part *pj) {
+__attribute__((always_inline)) INLINE static void
+runner_iact_nonsym_hydro_loop2(float r2, float *dx, float hi, float hj,
+                               struct part *pi, struct part *pj) {
 
 #ifndef SPH_GRADIENTS
 
@@ -1025,11 +1021,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_hydro_loop3(
 }
 
 /* this corresponds to task_subtype_fluxes */
-__attribute__((always_inline))
-    INLINE static void runner_iact_nonsym_hydro_loop3(float r2, float *dx,
-                                                      float hi, float hj,
-                                                      struct part *pi,
-                                                      struct part *pj) {
+__attribute__((always_inline)) INLINE static void
+runner_iact_nonsym_hydro_loop3(float r2, float *dx, float hi, float hj,
+                               struct part *pi, struct part *pj) {
 
   runner_iact_fluxes_common(r2, dx, hi, hj, pi, pj, 0);
 }
diff --git a/src/hydro/Minimal/hydro.h b/src/hydro/Minimal/hydro.h
index 7db3c275ce7e3389610e8297c287cbd5301c6c64..4222daafe82e7dc977cd87f57a5b9a235d505f00 100644
--- a/src/hydro/Minimal/hydro.h
+++ b/src/hydro/Minimal/hydro.h
@@ -27,13 +27,18 @@
  *
  * @param p Pointer to the particle data
  * @param xp Pointer to the extended particle data
+ * @param hydro_properties The SPH parameters
  *
  */
 __attribute__((always_inline)) INLINE static float hydro_compute_timestep(
-    struct part* p, struct xpart* xp) {
+    const struct part* p, const struct xpart* xp,
+    const struct hydro_props* hydro_properties) {
+
+  const float CFL_condition = hydro_properties->CFL_condition;
 
   /* CFL condition */
-  const float dt_cfl = 2.f * const_cfl * kernel_gamma * p->h / p->force.v_sig;
+  const float dt_cfl =
+      2.f * kernel_gamma * CFL_condition * p->h / p->force.v_sig;
 
   return dt_cfl;
 }
@@ -48,8 +53,8 @@ __attribute__((always_inline)) INLINE static float hydro_compute_timestep(
  * @param p The particle to act upon
  * @param xp The extended particle data to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_first_init_part(struct part* p, struct xpart* xp) {
+__attribute__((always_inline)) INLINE static void hydro_first_init_part(
+    struct part* p, struct xpart* xp) {
 
   xp->u_full = p->u;
 }
@@ -63,8 +68,8 @@ __attribute__((always_inline))
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_init_part(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_init_part(
+    struct part* p) {
   p->density.wcount = 0.f;
   p->density.wcount_dh = 0.f;
   p->rho = 0.f;
@@ -83,8 +88,8 @@ __attribute__((always_inline))
  * @param p The particle to act upon
  * @param time The current time
  */
-__attribute__((always_inline))
-    INLINE static void hydro_end_density(struct part* p, float time) {
+__attribute__((always_inline)) INLINE static void hydro_end_density(
+    struct part* p, float time) {
 
   /* Some smoothing length multiples. */
   const float h = p->h;
@@ -94,7 +99,7 @@ __attribute__((always_inline))
 
   /* Final operation on the density (add self-contribution). */
   p->rho += p->mass * kernel_root;
-  p->rho_dh -= 3.0f * p->mass * kernel_root * kernel_igamma;
+  p->rho_dh -= 3.0f * p->mass * kernel_root;
   p->density.wcount += kernel_root;
 
   /* Finish the calculation by inserting the missing h-factors */
@@ -138,8 +143,8 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force(
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_reset_acceleration(struct part* p) {
+__attribute__((always_inline)) INLINE static void hydro_reset_acceleration(
+    struct part* p) {
 
   /* Reset the acceleration. */
   p->a_hydro[0] = 0.0f;
@@ -167,14 +172,7 @@ __attribute__((always_inline))
 __attribute__((always_inline)) INLINE static void hydro_predict_extra(
     struct part* p, struct xpart* xp, int t0, int t1, double timeBase) {
 
-  const float dt = t1 - t0;
-
-  /* Predict internal energy */
-  const float w = p->u_dt / p->u * dt;
-  if (fabsf(w) < 0.2f)
-    p->u *= approx_expf(w); /* 4th order expansion of exp(w) */
-  else
-    p->u *= expf(w);
+  p->u = xp->u_full;
 
   /* Need to recompute the pressure as well */
   p->force.pressure = p->rho * p->u * (const_hydro_gamma - 1.f);
@@ -189,8 +187,8 @@ __attribute__((always_inline)) INLINE static void hydro_predict_extra(
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_end_force(struct part* p) {}
+__attribute__((always_inline)) INLINE static void hydro_end_force(
+    struct part* p) {}
 
 /**
  * @brief Kick the additional variables
@@ -223,8 +221,8 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra(
  *
  * @param p The particle to act upon
  */
-__attribute__((always_inline))
-    INLINE static void hydro_convert_quantities(struct part* p) {}
+__attribute__((always_inline)) INLINE static void hydro_convert_quantities(
+    struct part* p) {}
 
 /**
  * @brief Returns the internal energy of a particle
@@ -234,9 +232,10 @@ __attribute__((always_inline))
  * energy from the thermodynamic variable.
  *
  * @param p The particle of interest
+ * @param dt Time since the last kick
  */
-__attribute__((always_inline))
-    INLINE static float hydro_get_internal_energy(struct part* p) {
+__attribute__((always_inline)) INLINE static float hydro_get_internal_energy(
+    const struct part* p, float dt) {
 
   return p->u;
 }
diff --git a/src/hydro/Minimal/hydro_debug.h b/src/hydro/Minimal/hydro_debug.h
index 85fdbac0a7893d575e52629600a5407c1bf77fcc..127ba75e99418b6a5dc197a44ccdc77de3cdef15 100644
--- a/src/hydro/Minimal/hydro_debug.h
+++ b/src/hydro/Minimal/hydro_debug.h
@@ -17,8 +17,8 @@
  *
  ******************************************************************************/
 
-__attribute__((always_inline))
-    INLINE static void hydro_debug_particle(struct part* p, struct xpart* xp) {
+__attribute__((always_inline)) INLINE static void hydro_debug_particle(
+    struct part* p, struct xpart* xp) {
   printf(
       "x=[%.3e,%.3e,%.3e], "
       "v=[%.3e,%.3e,%.3e],v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e], "
diff --git a/src/hydro/Minimal/hydro_iact.h b/src/hydro/Minimal/hydro_iact.h
index 3427ec538613842f8fbcf0d8ba5f9ba5a0b8d540..c9da185b8a29eafe2a58420ae5de3a05ff043225 100644
--- a/src/hydro/Minimal/hydro_iact.h
+++ b/src/hydro/Minimal/hydro_iact.h
@@ -19,12 +19,6 @@
 #ifndef SWIFT_RUNNER_IACT_MINIMAL_H
 #define SWIFT_RUNNER_IACT_MINIMAL_H
 
-/* Includes. */
-#include "const.h"
-#include "kernel_hydro.h"
-#include "part.h"
-#include "vector.h"
-
 /**
  * @brief Minimal conservative implementation of SPH
  *
@@ -70,7 +64,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 /**
  * @brief Density loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -95,7 +88,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
 /**
  * @brief Force loop
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -131,18 +123,14 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
   const float P_over_rho_j = pressurej / (rhoj * rhoj) * pj->rho_dh;
 
   /* Compute dv dot r. */
-  float dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] +
-               (pi->v[2] - pj->v[2]) * dx[2];
-  dvdr *= r_inv;
-
-  /* Compute the relative velocity. (This is 0 if the particles move away from
-   * each other and negative otherwise) */
-  const float omega_ij = fminf(dvdr, 0.f);
+  const float dvdr = (pi->v[0] - pj->v[0]) * dx[0] +
+                     (pi->v[1] - pj->v[1]) * dx[1] +
+                     (pi->v[2] - pj->v[2]) * dx[2];
 
   /* Compute sound speeds */
   const float ci = sqrtf(const_hydro_gamma * pressurei / rhoi);
   const float cj = sqrtf(const_hydro_gamma * pressurej / rhoj);
-  const float v_sig = ci + cj + 3.f * omega_ij;
+  const float v_sig = ci + cj;
 
   /* SPH acceleration term */
   const float sph_term = (P_over_rho_i * wi_dr + P_over_rho_j * wj_dr) * r_inv;
@@ -157,8 +145,8 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
   pj->a_hydro[2] += mi * sph_term * dx[2];
 
   /* Get the time derivative for u. */
-  pi->u_dt += P_over_rho_i * mj * dvdr * wi_dr;
-  pj->u_dt += P_over_rho_j * mi * dvdr * wj_dr;
+  pi->u_dt += P_over_rho_i * mj * dvdr * r_inv * wi_dr;
+  pj->u_dt += P_over_rho_j * mi * dvdr * r_inv * wj_dr;
 
   /* Get the time derivative for h. */
   pi->h_dt -= mj * dvdr * r_inv / rhoj * wi_dr;
@@ -172,7 +160,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
 /**
  * @brief Force loop (non-symmetric version)
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
@@ -208,18 +195,14 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
   const float P_over_rho_j = pressurej / (rhoj * rhoj) * pj->rho_dh;
 
   /* Compute dv dot r. */
-  float dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] +
-               (pi->v[2] - pj->v[2]) * dx[2];
-  dvdr *= r_inv;
-
-  /* Compute the relative velocity. (This is 0 if the particles move away from
-   * each other and negative otherwise) */
-  const float omega_ij = fminf(dvdr, 0.f);
+  const float dvdr = (pi->v[0] - pj->v[0]) * dx[0] +
+                     (pi->v[1] - pj->v[1]) * dx[1] +
+                     (pi->v[2] - pj->v[2]) * dx[2];
 
   /* Compute sound speeds */
   const float ci = sqrtf(const_hydro_gamma * pressurei / rhoi);
   const float cj = sqrtf(const_hydro_gamma * pressurej / rhoj);
-  const float v_sig = ci + cj + 3.f * omega_ij;
+  const float v_sig = ci + cj;
 
   /* SPH acceleration term */
   const float sph_term = (P_over_rho_i * wi_dr + P_over_rho_j * wj_dr) * r_inv;
@@ -230,7 +213,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
   pi->a_hydro[2] -= mj * sph_term * dx[2];
 
   /* Get the time derivative for u. */
-  pi->u_dt += P_over_rho_i * mj * dvdr * wi_dr;
+  pi->u_dt += P_over_rho_i * mj * dvdr * r_inv * wi_dr;
 
   /* Get the time derivative for h. */
   pi->h_dt -= mj * dvdr * r_inv / rhoj * wi_dr;
diff --git a/src/hydro/Minimal/hydro_io.h b/src/hydro/Minimal/hydro_io.h
index afe5de83f423e43b4d2480cca1ac3e84d6c549de..1bbfe1065358017e0b27e2131ff717848221fe9c 100644
--- a/src/hydro/Minimal/hydro_io.h
+++ b/src/hydro/Minimal/hydro_io.h
@@ -104,9 +104,6 @@ void writeSPHflavour(hid_t h_grpsph) {
 
   /* Kernel function description */
   writeAttribute_s(h_grpsph, "Kernel", kernel_name);
-  writeAttribute_f(h_grpsph, "Kernel eta", const_eta_kernel);
-  writeAttribute_f(h_grpsph, "Weighted N_ngb", kernel_nwneigh);
-  writeAttribute_f(h_grpsph, "Delta N_ngb", const_delta_nwneigh);
   writeAttribute_f(h_grpsph, "Hydro gamma", const_hydro_gamma);
 
   /* Viscosity and thermal conduction */
@@ -115,11 +112,6 @@ void writeSPHflavour(hid_t h_grpsph) {
   writeAttribute_s(h_grpsph, "Viscosity Model", "No model");
 
   /* Time integration properties */
-  writeAttribute_f(h_grpsph, "CFL parameter", const_cfl);
-  writeAttribute_f(h_grpsph, "Maximal ln(Delta h) change over dt",
-                   const_ln_max_h_change);
-  writeAttribute_f(h_grpsph, "Maximal Delta h change over dt",
-                   exp(const_ln_max_h_change));
   writeAttribute_f(h_grpsph, "Maximal Delta u change over dt",
                    const_max_u_change);
 }
diff --git a/src/hydro/Minimal/hydro_part.h b/src/hydro/Minimal/hydro_part.h
index 173397ef2c72ee99f4d10742f3645afd1e706218..2580ef2a94eabda5a34de9d4e4b48227bc0e5146 100644
--- a/src/hydro/Minimal/hydro_part.h
+++ b/src/hydro/Minimal/hydro_part.h
@@ -26,7 +26,8 @@
  */
 struct xpart {
 
-  double x_old[3]; /*!< Old position, at last tree rebuild. */
+  float x_diff[3]; /*!< Offset between current position and position at last
+                      tree rebuild. */
 
   float v_full[3]; /*!< Velocity at the last full step. */
 
diff --git a/src/hydro_properties.c b/src/hydro_properties.c
new file mode 100644
index 0000000000000000000000000000000000000000..16216f81a5b505fc3a887e86ca4898bc4179e4d5
--- /dev/null
+++ b/src/hydro_properties.c
@@ -0,0 +1,70 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* This object's header. */
+#include "hydro_properties.h"
+
+/* Standard headers */
+#include <float.h>
+#include <math.h>
+
+/* Local headers. */
+#include "error.h"
+#include "hydro.h"
+#include "kernel_hydro.h"
+
+#define hydro_props_default_max_iterations 30
+#define hydro_props_default_volume_change 2.0f
+
+void hydro_props_init(struct hydro_props *p,
+                      const struct swift_params *params) {
+  /* Populate p from the "SPH:*" entries of params (opt reads take defaults). */
+  /* Kernel properties: target_neighbours = 4 pi / 3 * kernel_gamma3 * eta^3 */
+  p->eta_neighbours = parser_get_param_float(params, "SPH:resolution_eta");
+  const float eta3 = p->eta_neighbours * p->eta_neighbours * p->eta_neighbours;
+  p->target_neighbours = 4.0 * M_PI * kernel_gamma3 * eta3 / 3.0;
+  p->delta_neighbours = parser_get_param_float(params, "SPH:delta_neighbours");
+
+  /* Ghost task: optional cap on the number of iterations */
+  p->max_smoothing_iterations = parser_get_opt_param_int(
+      params, "SPH:max_ghost_iterations", hydro_props_default_max_iterations);
+
+  /* Time integration: log_max_h_change = log(max_volume_change^(1/3)) */
+  p->CFL_condition = parser_get_param_float(params, "SPH:CFL_condition");
+  const float max_volume_change = parser_get_opt_param_float(
+      params, "SPH:max_volume_change", hydro_props_default_volume_change);
+  p->log_max_h_change = logf(powf(max_volume_change, 0.33333333333f));
+}
+
+void hydro_props_print(const struct hydro_props *p) {
+  /* Report the scheme, kernel and time-integration settings via message(). */
+  message("Hydrodynamic scheme: %s.", SPH_IMPLEMENTATION);
+  message("Hydrodynamic kernel: %s with %.2f +/- %.2f neighbours (eta=%f).",
+          kernel_name, p->target_neighbours, p->delta_neighbours,
+          p->eta_neighbours);
+  message("Hydrodynamic integration: CFL parameter: %.4f.", p->CFL_condition);
+  message(
+      "Hydrodynamic integration: Max change of volume: %.2f "
+      "(max|dlog(h)/dt|=%f).",
+      powf(expf(p->log_max_h_change), 3.f), p->log_max_h_change);
+  /* Only mention the ghost iteration cap when it differs from the default. */
+  if (p->max_smoothing_iterations != hydro_props_default_max_iterations)
+    message("Maximal iterations in ghost task set to %d (default is %d)",
+            p->max_smoothing_iterations, hydro_props_default_max_iterations);
+}
diff --git a/src/hydro_properties.h b/src/hydro_properties.h
new file mode 100644
index 0000000000000000000000000000000000000000..c84252a1dc12f0e5591a7e512fdf4e246f4ab048
--- /dev/null
+++ b/src/hydro_properties.h
@@ -0,0 +1,56 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+#ifndef SWIFT_HYDRO_PROPERTIES
+#define SWIFT_HYDRO_PROPERTIES
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Local includes. */
+#include "const.h"
+#include "parser.h"
+
+/**
+ * @brief Contains all the constants and parameters of the hydro scheme
+ */
+struct hydro_props {
+
+  /* Neighbour selection: eta, target neighbour number and its tolerance */
+  float eta_neighbours;
+  float target_neighbours;
+  float delta_neighbours;
+
+  /* Ghost task: maximal number of iterations */
+  int max_smoothing_iterations;
+
+  /* Time integration: CFL parameter and limit on |dlog(h)/dt| */
+  float CFL_condition;
+  float log_max_h_change;
+
+/* Viscosity parameters (only present when GADGET_SPH is defined) */
+#ifdef GADGET_SPH
+  float const_viscosity_alpha;
+#endif
+};
+
+void hydro_props_print(const struct hydro_props *p);
+void hydro_props_init(struct hydro_props *p, const struct swift_params *params);
+
+#endif /* SWIFT_HYDRO_PROPERTIES */
diff --git a/src/intrinsics.h b/src/intrinsics.h
index 21b8c8e68bc45d8799db496ff30ac0cfb289acea..27e0bcc729b58493aed8c7eae7dfcdfc8f0855aa 100644
--- a/src/intrinsics.h
+++ b/src/intrinsics.h
@@ -26,8 +26,8 @@
  * This is a wrapper for the GCC intrinsics with an implementation (from
  * Hacker's Delight) if the compiler intrinsics are not available.
  */
-__attribute__((always_inline))
-    INLINE static int intrinsics_clz(unsigned int x) {
+__attribute__((always_inline)) INLINE static int intrinsics_clz(
+    unsigned int x) {
 
 #ifdef __GNUC__
   /* Use GCC intrinsics if possible */
@@ -66,8 +66,8 @@ __attribute__((always_inline))
  * This is a wrapper for the GCC intrinsics with an implementation (from
  * Hacker's Delight) if the compiler intrinsics are not available.
  */
-__attribute__((always_inline))
-    INLINE static int intrinsics_popcount(unsigned int x) {
+__attribute__((always_inline)) INLINE static int intrinsics_popcount(
+    unsigned int x) {
 
 #ifdef __GNUC__
   /* Use GCC intrinsics if possible */
diff --git a/src/kernel_gravity.h b/src/kernel_gravity.h
index 7fd4b061a7e94be01a11b06ad23d9113f579ebb8..fedc046eed3bc64ed09120537cf6863179b171a6 100644
--- a/src/kernel_gravity.h
+++ b/src/kernel_gravity.h
@@ -25,9 +25,12 @@
 #include "inline.h"
 #include "vector.h"
 
-/* Gravity kernel stuff
- * -----------------------------------------------------------------------------------------------
- */
+#define const_iepsilon (1. / const_epsilon)
+#define const_iepsilon2 (const_iepsilon * const_iepsilon)
+#define const_iepsilon3 (const_iepsilon2 * const_iepsilon)
+#define const_iepsilon4 (const_iepsilon2 * const_iepsilon2)
+#define const_iepsilon5 (const_iepsilon3 * const_iepsilon2)
+#define const_iepsilon6 (const_iepsilon3 * const_iepsilon3)
 
 /* The gravity kernel is defined as a degree 6 polynomial in the distance
    r. The resulting value should be post-multiplied with r^-3, resulting
@@ -39,18 +42,28 @@
 #define kernel_grav_degree 6
 #define kernel_grav_ivals 2
 #define kernel_grav_scale (2 * const_iepsilon)
-static float kernel_grav_coeffs
-    [(kernel_grav_degree + 1) * (kernel_grav_ivals + 1)] = {
-        32.0f * const_iepsilon6,         -192.0f / 5.0f * const_iepsilon5,
-        0.0f,                            32.0f / 3.0f * const_iepsilon3,
-        0.0f,                            0.0f,
-        0.0f,                            -32.0f / 3.0f * const_iepsilon6,
-        192.0f / 5.0f * const_iepsilon5, -48.0f * const_iepsilon4,
-        64.0f / 3.0f * const_iepsilon3,  0.0f,
-        0.0f,                            -1.0f / 15.0f,
-        0.0f,                            0.0f,
-        0.0f,                            0.0f,
-        0.0f,                            0.0f,
+static float
+    kernel_grav_coeffs[(kernel_grav_degree + 1) * (kernel_grav_ivals + 1)] = {
+        32.0f * const_iepsilon6,
+        -192.0f / 5.0f * const_iepsilon5,
+        0.0f,
+        32.0f / 3.0f * const_iepsilon3,
+        0.0f,
+        0.0f,
+        0.0f,
+        -32.0f / 3.0f * const_iepsilon6,
+        192.0f / 5.0f * const_iepsilon5,
+        -48.0f * const_iepsilon4,
+        64.0f / 3.0f * const_iepsilon3,
+        0.0f,
+        0.0f,
+        -1.0f / 15.0f,
+        0.0f,
+        0.0f,
+        0.0f,
+        0.0f,
+        0.0f,
+        0.0f,
         1.0f};
 
 /**
@@ -73,8 +86,8 @@ __attribute__((always_inline)) INLINE static void kernel_grav_eval(float x,
  * version).
  */
 
-__attribute__((always_inline))
-    INLINE static void kernel_grav_eval_vec(vector *x, vector *w) {
+__attribute__((always_inline)) INLINE static void kernel_grav_eval_vec(
+    vector *x, vector *w) {
 
   vector ind, c[kernel_grav_degree + 1];
   int j, k;
@@ -176,8 +189,8 @@ __attribute__((always_inline)) INLINE static void blender_eval_vec(vector *x,
  * distance x (Vectorized version). Gives a sensible answer only if x<2.
  */
 
-__attribute__((always_inline))
-    INLINE static void blender_deval_vec(vector *x, vector *w, vector *dw_dx) {
+__attribute__((always_inline)) INLINE static void blender_deval_vec(
+    vector *x, vector *w, vector *dw_dx) {
 
   vector ind, c[blender_degree + 1];
   int j, k;
diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h
index 66f51391fb9504ba30363b1980aaad1fcc9174b7..b1774d8f35b7eddb6c2fdb0c341fa6299de74582 100644
--- a/src/kernel_hydro.h
+++ b/src/kernel_hydro.h
@@ -20,6 +20,8 @@
 #ifndef SWIFT_KERNEL_HYDRO_H
 #define SWIFT_KERNEL_HYDRO_H
 
+#include <math.h>
+
 /* Includes. */
 #include "const.h"
 #include "error.h"
@@ -33,8 +35,8 @@
 #define kernel_name "Cubic spline (M4)"
 #define kernel_degree 3 /* Degree of the polynomial */
 #define kernel_ivals 2  /* Number of branches */
-#define kernel_gamma 1.825742
-#define kernel_constant 16. * M_1_PI
+#define kernel_gamma ((float)(1.825742))
+#define kernel_constant ((float)(16. * M_1_PI))
 static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
     __attribute__((aligned(16))) = {3.f,  -3.f, 0.f,  0.5f, /* 0 < u < 0.5 */
                                     -1.f, 3.f,  -3.f, 1.f,  /* 0.5 < u < 1 */
@@ -47,8 +49,8 @@ static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
 #define kernel_name "Quartic spline (M5)"
 #define kernel_degree 4
 #define kernel_ivals 5
-#define kernel_gamma 2.018932
-#define kernel_constant 15625. * M_1_PI / 512.
+#define kernel_gamma ((float)(2.018932))
+#define kernel_constant ((float)(15625. * M_1_PI / 512.))
 static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
     __attribute__((aligned(16))) = {
         6.f,  0.f,  -2.4f, 0.f,   0.368f, /* 0 < u < 0.2 */
@@ -65,8 +67,8 @@ static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
 #define kernel_name "Quintic spline (M6)"
 #define kernel_degree 5
 #define kernel_ivals 3
-#define kernel_gamma 2.195775
-#define kernel_constant 2187. * M_1_PI / 40.
+#define kernel_gamma ((float)(2.195775))
+#define kernel_constant ((float)(2187. * M_1_PI / 40.))
 static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
     __attribute__((aligned(16))) = {
         -10.f,        10.f,      0.f,
@@ -85,8 +87,8 @@ static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
 #define kernel_name "Wendland C2"
 #define kernel_degree 5
 #define kernel_ivals 1
-#define kernel_gamma 1.936492
-#define kernel_constant 21. * M_1_PI / 2.
+#define kernel_gamma ((float)(1.936492))
+#define kernel_constant ((float)(21. * M_1_PI / 2.))
 static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
     __attribute__((aligned(16))) = {
         4.f, -15.f, 20.f, -10.f, 0.f, 1.f,  /* 0 < u < 1 */
@@ -99,8 +101,8 @@ static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
 #define kernel_name "Wendland C4"
 #define kernel_degree 8
 #define kernel_ivals 1
-#define kernel_gamma 2.207940
-#define kernel_constant 495. * M_1_PI / 32.
+#define kernel_gamma ((float)(2.207940))
+#define kernel_constant ((float)(495. * M_1_PI / 32.))
 static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
     __attribute__((aligned(16))) = {
         11.666667f, -64.f,       140.f, -149.333333f, 70.f,
@@ -115,8 +117,8 @@ static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
 #define kernel_name "Wendland C6"
 #define kernel_degree 11
 #define kernel_ivals 1
-#define kernel_gamma 2.449490
-#define kernel_constant 1365. * M_1_PI / 64.
+#define kernel_gamma ((float)(2.449490))
+#define kernel_constant ((float)(1365. * M_1_PI / 64.))
 static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
     __attribute__((aligned(16))) = {
         32.f, -231.f, 704.f, -1155.f, 1056.f, -462.f,
@@ -135,23 +137,22 @@ static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
 /* Ok, now comes the real deal. */
 
 /* First some powers of gamma = H/h */
-#define kernel_gamma2 kernel_gamma *kernel_gamma
-#define kernel_gamma3 kernel_gamma2 *kernel_gamma
-#define kernel_gamma4 kernel_gamma3 *kernel_gamma
-#define kernel_igamma 1. / kernel_gamma
-#define kernel_igamma2 kernel_igamma *kernel_igamma
-#define kernel_igamma3 kernel_igamma2 *kernel_igamma
-#define kernel_igamma4 kernel_igamma3 *kernel_igamma
-
-/* Some powers of eta */
-#define kernel_eta3 const_eta_kernel *const_eta_kernel *const_eta_kernel
-
-/* The number of neighbours (i.e. N_ngb) */
-#define kernel_nwneigh 4.0 * M_PI *kernel_gamma3 *kernel_eta3 / 3.0
+#define kernel_gamma2 ((float)(kernel_gamma * kernel_gamma))
+#define kernel_gamma3 ((float)(kernel_gamma * kernel_gamma * kernel_gamma))
+#define kernel_gamma4 \
+  ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))
+#define kernel_igamma ((float)(1. / kernel_gamma))
+#define kernel_igamma2 ((float)(kernel_igamma * kernel_igamma))
+#define kernel_igamma3 ((float)(kernel_igamma * kernel_igamma * kernel_igamma))
+#define kernel_igamma4 \
+  ((float)(kernel_igamma * kernel_igamma * kernel_igamma * kernel_igamma))
+
+/* The number of branches */
+#define kernel_ivals_f ((float)(kernel_ivals))
 
 /* Kernel self contribution (i.e. W(0,h)) */
 #define kernel_root \
-  (kernel_coeffs[kernel_degree]) * kernel_constant *kernel_igamma3
+  ((float)(kernel_coeffs[kernel_degree]) * kernel_constant * kernel_igamma3)
 
 /**
  * @brief Computes the kernel function and its derivative.
@@ -163,14 +164,20 @@ static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
  * @param dW_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
  */
 __attribute__((always_inline)) INLINE static void kernel_deval(
-    float u, float *const W, float *const dW_dx) {
+    float u, float *restrict W, float *restrict dW_dx) {
 
   /* Go to the range [0,1[ from [0,H[ */
-  const float x = u * (float)kernel_igamma;
+  const float x = u * kernel_igamma;
 
+#if kernel_ivals == 1
+  /* Only one branch in this case */
+  const float *const coeffs = &kernel_coeffs[0];
+#else
   /* Pick the correct branch of the kernel */
-  const int ind = (int)fminf(x * (float)kernel_ivals, kernel_ivals);
+  const int temp = (int)(x * kernel_ivals_f);
+  const int ind = temp > kernel_ivals ? kernel_ivals : temp;
   const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+#endif
 
   /* First two terms of the polynomial ... */
   float w = coeffs[0] * x + coeffs[1];
@@ -183,8 +190,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval(
   }
 
   /* Return everything */
-  *W = w * (float)kernel_constant * (float)kernel_igamma3;
-  *dW_dx = dw_dx * (float)kernel_constant * (float)kernel_igamma4;
+  *W = w * kernel_constant * kernel_igamma3;
+  *dW_dx = dw_dx * kernel_constant * kernel_igamma4;
 }
 
 /**
@@ -193,14 +200,20 @@ __attribute__((always_inline)) INLINE static void kernel_deval(
  * @param u The ratio of the distance to the smoothing length $u = x/h$.
  * @param W (return) The value of the kernel function $W(x,h)$.
  */
-__attribute__((always_inline)) INLINE static void kernel_eval(float u,
-                                                              float *const W) {
+__attribute__((always_inline)) INLINE static void kernel_eval(
+    float u, float *restrict W) {
   /* Go to the range [0,1[ from [0,H[ */
-  const float x = u * (float)kernel_igamma;
+  const float x = u * kernel_igamma;
 
+#if kernel_ivals == 1
+  /* Only one branch in this case */
+  const float *const coeffs = &kernel_coeffs[0];
+#else
   /* Pick the correct branch of the kernel */
-  const int ind = (int)fminf(x * (float)kernel_ivals, kernel_ivals);
+  const int temp = (int)(x * kernel_ivals_f);
+  const int ind = temp > kernel_ivals ? kernel_ivals : temp;
   const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+#endif
 
   /* First two terms of the polynomial ... */
   float w = coeffs[0] * x + coeffs[1];
@@ -209,9 +222,64 @@ __attribute__((always_inline)) INLINE static void kernel_eval(float u,
   for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k];
 
   /* Return everything */
-  *W = w * (float)kernel_constant * (float)kernel_igamma3;
+  *W = w * kernel_constant * kernel_igamma3;
 }
 
+#ifdef VECTORIZE
+/* Vector counterparts of the scalar kernel constants, built with FILL_VEC. */
+static const vector kernel_igamma_vec = FILL_VEC((float)kernel_igamma);
+
+static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
+
+static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
+
+static const vector kernel_igamma3_vec = FILL_VEC((float)kernel_igamma3);
+
+static const vector kernel_igamma4_vec = FILL_VEC((float)kernel_igamma4);
+
+/**
+ * @brief Computes the kernel function and its derivative (Vectorised version).
+ *
+ * Return 0 if $u > \\gamma = H/h$
+ *
+ * @param u The ratio of the distance to the smoothing length $u = x/h$.
+ * @param w (return) The value of the kernel function $W(x,h)$.
+ * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
+ */
+__attribute__((always_inline)) INLINE static void kernel_deval_vec(
+    vector *u, vector *w, vector *dw_dx) {
+
+  /* Rescale u by 1/gamma: go to the range [0,1[ from [0,H[ */
+  vector x;
+  x.v = u->v * kernel_igamma_vec.v;
+
+  /* Branch index of each entry, capped at kernel_ivals (last coeff row). */
+  vector ind;
+  ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v));
+
+  /* Gather element by element: c[j].f[k] is coefficient j of entry k's row. */
+  vector c[kernel_degree + 1];
+  for (int k = 0; k < VEC_SIZE; k++)
+    for (int j = 0; j < kernel_degree + 1; j++)
+      c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
+
+  /* Init the iteration for Horner's scheme (first two polynomial terms). */
+  w->v = (c[0].v * x.v) + c[1].v;
+  dw_dx->v = c[0].v;
+
+  /* Horner step: update the derivative first, it needs the previous w. */
+  for (int k = 2; k <= kernel_degree; k++) {
+    dw_dx->v = (dw_dx->v * x.v) + w->v;
+    w->v = (x.v * w->v) + c[k].v;
+  }
+
+  /* Apply the normalisation: constant / gamma^3 for W, / gamma^4 for dW/dx */
+  w->v = w->v * kernel_constant_vec.v * kernel_igamma3_vec.v;
+  dw_dx->v = dw_dx->v * kernel_constant_vec.v * kernel_igamma4_vec.v;
+}
+
+#endif
+
 /* Some cross-check functions */
 void hydro_kernel_dump(int N);
 
diff --git a/src/kick.h b/src/kick.h
new file mode 100644
index 0000000000000000000000000000000000000000..df3bd4cc6ce9819d0b65640db51d2ed20a7d59fe
--- /dev/null
+++ b/src/kick.h
@@ -0,0 +1,114 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_KICK_H
+#define SWIFT_KICK_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Local headers. */
+#include "const.h"
+#include "debug.h"
+
+/**
+ * @brief Perform the 'kick' operation on a #gpart
+ *
+ * @param gp The #gpart to kick.
+ * @param new_dti Length (in integer time units) of the step being entered.
+ * @param timeBase Factor converting integer time units into a time interval.
+ */
+__attribute__((always_inline)) INLINE static void kick_gpart(struct gpart* gp,
+                                                             int new_dti,
+                                                             double timeBase) {
+
+  /* Kick interval: from the middle of the old step to the middle of the new */
+  const int ti_start = (gp->ti_begin + gp->ti_end) / 2;
+  const int ti_end = gp->ti_end + new_dti / 2;
+  const double dt = (ti_end - ti_start) * timeBase;
+  const double half_dt = (ti_end - gp->ti_end) * timeBase;
+
+  /* The new step begins where the old one ended and lasts new_dti */
+  gp->ti_begin = gp->ti_end;
+  gp->ti_end = gp->ti_begin + new_dti;
+
+  /* Kick particles in momentum space: v_full += a_grav * dt */
+  gp->v_full[0] += gp->a_grav[0] * dt;
+  gp->v_full[1] += gp->a_grav[1] * dt;
+  gp->v_full[2] += gp->a_grav[2] * dt;
+
+  /* Extra kick work, delegated to gravity_kick_extra() (defined elsewhere) */
+  gravity_kick_extra(gp, dt, half_dt);
+}
+
+/**
+ * @brief Perform the 'kick' operation on a #part
+ *
+ * @param p The #part to kick.
+ * @param xp The #xpart of the particle.
+ * @param new_dti Length (in integer time units) of the step being entered.
+ * @param timeBase Factor converting integer time units into a time interval.
+ */
+__attribute__((always_inline)) INLINE static void kick_part(struct part* p,
+                                                            struct xpart* xp,
+                                                            int new_dti,
+                                                            double timeBase) {
+
+  /* Kick interval: from the middle of the old step to the middle of the new */
+  const int ti_start = (p->ti_begin + p->ti_end) / 2;
+  const int ti_end = p->ti_end + new_dti / 2;
+  const double dt = (ti_end - ti_start) * timeBase;
+  const double half_dt = (ti_end - p->ti_end) * timeBase;
+
+  /* Move particle forward in time, keeping a linked gpart in step with it */
+  p->ti_begin = p->ti_end;
+  p->ti_end = p->ti_begin + new_dti;
+  if (p->gpart != NULL) {
+    p->gpart->ti_begin = p->ti_begin;
+    p->gpart->ti_end = p->ti_end;
+  }
+
+  /* Total acceleration: hydro, plus gravity when a gpart is linked */
+  float a_tot[3] = {p->a_hydro[0], p->a_hydro[1], p->a_hydro[2]};
+  if (p->gpart != NULL) {
+    a_tot[0] += p->gpart->a_grav[0];
+    a_tot[1] += p->gpart->a_grav[1];
+    a_tot[2] += p->gpart->a_grav[2];
+  }
+
+  /* Kick particles in momentum space and mirror the result into the gpart */
+  xp->v_full[0] += a_tot[0] * dt;
+  xp->v_full[1] += a_tot[1] * dt;
+  xp->v_full[2] += a_tot[2] * dt;
+  if (p->gpart != NULL) {
+    p->gpart->v_full[0] = xp->v_full[0];
+    p->gpart->v_full[1] = xp->v_full[1];
+    p->gpart->v_full[2] = xp->v_full[2];
+  }
+
+  /* Go back by half-step for the hydro velocity */
+  p->v[0] = xp->v_full[0] - half_dt * a_tot[0];
+  p->v[1] = xp->v_full[1] - half_dt * a_tot[1];
+  p->v[2] = xp->v_full[2] - half_dt * a_tot[2];
+
+  /* Extra kick work, delegated to the scheme-specific hooks */
+  hydro_kick_extra(p, xp, dt, half_dt);
+  if (p->gpart != NULL) gravity_kick_extra(p->gpart, dt, half_dt);
+}
+
+#endif /* SWIFT_KICK_H */
diff --git a/src/lock.h b/src/lock.h
index 90e9f90602c120ddd10f4cdefb9b08cedbf45e0f..ca7f01ee029cd1c57ed8fd0f3237ea54cb43e9a7 100644
--- a/src/lock.h
+++ b/src/lock.h
@@ -23,11 +23,11 @@
 #include <pthread.h>
 
 /* Includes. */
-#include "inline.h"
+#include "atomic.h"
 
 #ifdef PTHREAD_SPINLOCK
 #include <pthread.h>
-#define lock_type pthread_spinlock_t
+#define swift_lock_type pthread_spinlock_t
 #define lock_init(l) (pthread_spin_init(l, PTHREAD_PROCESS_PRIVATE) != 0)
 #define lock_destroy(l) (pthread_spin_destroy(l) != 0)
 #define lock_lock(l) (pthread_spin_lock(l) != 0)
@@ -36,7 +36,7 @@
 #define lock_unlock_blind(l) pthread_spin_unlock(l)
 #elif defined(PTHREAD_LOCK)
 #include <pthread.h>
-#define lock_type pthread_mutex_t
+#define swift_lock_type pthread_mutex_t
 #define lock_init(l) (pthread_mutex_init(l, NULL) != 0)
 #define lock_destroy(l) (pthread_mutex_destroy(l) != 0)
 #define lock_lock(l) (pthread_mutex_lock(l) != 0)
@@ -44,18 +44,18 @@
 #define lock_unlock(l) (pthread_mutex_unlock(l) != 0)
 #define lock_unlock_blind(l) pthread_mutex_unlock(l)
 #else
-#define lock_type volatile int
+#define swift_lock_type volatile int
 #define lock_init(l) (*(l) = 0)
 #define lock_destroy(l) 0
 INLINE static int lock_lock(volatile int *l) {
-  while (__sync_val_compare_and_swap(l, 0, 1) != 0)
+  while (atomic_cas(l, 0, 1) != 0)
     ;
   // while( *l );
   return 0;
 }
-#define lock_trylock(l) ((*(l)) ? 1 : __sync_val_compare_and_swap(l, 0, 1))
-#define lock_unlock(l) (__sync_val_compare_and_swap(l, 1, 0) != 1)
-#define lock_unlock_blind(l) __sync_val_compare_and_swap(l, 1, 0)
+#define lock_trylock(l) ((*(l)) ? 1 : atomic_cas(l, 0, 1))
+#define lock_unlock(l) (atomic_cas(l, 1, 0) != 1)
+#define lock_unlock_blind(l) atomic_cas(l, 1, 0)
 #endif
 
 #endif /* SWIFT_LOCK_H */
diff --git a/src/map.c b/src/map.c
index da13fbfb4ac00ed58184f7fe818826c82265a1de..f4f9ac7cfa7141606b578739517b66e951e65eab 100644
--- a/src/map.c
+++ b/src/map.c
@@ -18,15 +18,23 @@
  *
  ******************************************************************************/
 
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <stdio.h>
 #include <stdlib.h>
-#include "error.h"
+
+/* This object's header. */
 #include "map.h"
 
+/* Local headers. */
+#include "atomic.h"
+#include "error.h"
+
 /**
  * @brief Mapping function to draw a specific cell (gnuplot).
  */
-
 void map_cells_plot(struct cell *c, void *data) {
 
   int depth = *(int *)data;
@@ -80,24 +88,21 @@ void map_cells_plot(struct cell *c, void *data) {
 /**
  * @brief Mapping function for checking if each part is in its box.
  */
+void map_check(struct part *p, struct cell *c, void *data) {
 
-/* void map_check ( struct part *p , struct cell *c , void *data ) {
-
-    if ( p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] ||
-         p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] ||
-         p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] )
-        printf( "map_check: particle %i is outside of its box.\n" , p->id );
-
-    } */
+  if (p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->h[0] ||
+      p->x[1] < c->loc[1] || p->x[1] > c->loc[1] + c->h[1] ||
+      p->x[2] < c->loc[2] || p->x[2] > c->loc[2] + c->h[2])
+    printf("map_check: particle %lld is outside of its box.\n", p->id);
+}
 
 /**
  * @brief Mapping function for neighbour count.
  */
-
 void map_cellcheck(struct cell *c, void *data) {
 
   int *count = (int *)data;
-  __sync_fetch_and_add(count, c->count);
+  atomic_add(count, c->count);
 
   /* Loop over all parts and check if they are in the cell. */
   for (int k = 0; k < c->count; k++) {
@@ -133,7 +138,6 @@ void map_cellcheck(struct cell *c, void *data) {
 /**
  * @brief Mapping function for maxdepth cell count.
  */
-
 void map_maxdepth(struct cell *c, void *data) {
 
   int maxdepth = ((int *)data)[0];
@@ -147,7 +151,6 @@ void map_maxdepth(struct cell *c, void *data) {
 /**
  * @brief Mapping function for neighbour count.
  */
-
 void map_count(struct part *p, struct cell *c, void *data) {
 
   double *wcount = (double *)data;
@@ -156,7 +159,6 @@ void map_count(struct part *p, struct cell *c, void *data) {
 
   *wcount += p->density.wcount;
 }
-
 void map_wcount_min(struct part *p, struct cell *c, void *data) {
 
   struct part **p2 = (struct part **)data;
@@ -188,7 +190,6 @@ void map_h_max(struct part *p, struct cell *c, void *data) {
 /**
  * @brief Mapping function for neighbour count.
  */
-
 void map_icount(struct part *p, struct cell *c, void *data) {
 
   // int *count = (int *)data;
@@ -201,7 +202,6 @@ void map_icount(struct part *p, struct cell *c, void *data) {
 /**
  * @brief Mapping function to print the particle position.
  */
-
 void map_dump(struct part *p, struct cell *c, void *data) {
 
   double *shift = (double *)data;
diff --git a/src/map.h b/src/map.h
index 0753c2641af6deb050c1dcef6bcd3ae4621ae6aa..950a5fd96ebdc7177b41912b1565163f33de8701 100644
--- a/src/map.h
+++ b/src/map.h
@@ -22,8 +22,8 @@
 #ifndef SWIFT_MAP_H
 #define SWIFT_MAP_H
 
-#include "part.h"
 #include "cell.h"
+#include "part.h"
 
 void map_cells_plot(struct cell *c, void *data);
 void map_check(struct part *p, struct cell *c, void *data);
diff --git a/src/parallel_io.c b/src/parallel_io.c
index d1c739b59021f38b2259f82dd06c547e0e7c147d..c5cac1cb5efc6e533e599867e39cdd7c7b2c87fa 100644
--- a/src/parallel_io.c
+++ b/src/parallel_io.c
@@ -37,7 +37,11 @@
 
 /* Local includes. */
 #include "common_io.h"
+#include "engine.h"
 #include "error.h"
+#include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 
 /**
  * @brief Reads a data array from a given HDF5 group.
@@ -509,8 +513,12 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
  *its XMF descriptor
  *
  * @param e The engine containing all the system.
- * @param us The UnitSystem used for the conversion of units
- *in the output
+ * @param baseName The common part of the snapshot file name.
+ * @param us The UnitSystem used for the conversion of units in the output.
+ * @param mpi_rank The MPI rank of this node.
+ * @param mpi_size The number of MPI ranks.
+ * @param comm The MPI communicator.
+ * @param info The MPI information object
  *
  * Creates an HDF5 output file and writes the particles
  *contained
@@ -522,10 +530,10 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
  * Calls #error() if an error occurs.
  *
  */
-void write_output_parallel(struct engine* e, struct UnitSystem* us,
-                           int mpi_rank, int mpi_size, MPI_Comm comm,
-                           MPI_Info info) {
-  hid_t h_file = 0, h_grp = 0, h_grpsph = 0;
+void write_output_parallel(struct engine* e, const char* baseName,
+                           struct UnitSystem* us, int mpi_rank, int mpi_size,
+                           MPI_Comm comm, MPI_Info info) {
+  hid_t h_file = 0, h_grp = 0;
   const size_t Ngas = e->s->nr_parts;
   const size_t Ntot = e->s->nr_gparts;
   int periodic = e->s->periodic;
@@ -536,22 +544,19 @@ void write_output_parallel(struct engine* e, struct UnitSystem* us,
   static int outputCount = 0;
   FILE* xmfFile = 0;
 
-  /* Number of particles of each type */
-  // const size_t Ndm = Ntot - Ngas;
-
-  /* MATTHIEU: Temporary fix to preserve master */
+  /* Number of unassociated gparts */
   const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
-  /* MATTHIEU: End temporary fix */
 
   /* File name */
   char fileName[FILENAME_BUFFER_SIZE];
-  snprintf(fileName, FILENAME_BUFFER_SIZE, "output_%03i.hdf5", outputCount);
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%03i.hdf5", baseName,
+           outputCount);
 
   /* First time, we need to create the XMF file */
-  if (outputCount == 0 && mpi_rank == 0) createXMFfile();
+  if (outputCount == 0 && mpi_rank == 0) createXMFfile(baseName);
 
   /* Prepare the XMF file for the new entry */
-  if (mpi_rank == 0) xmfFile = prepareXMFfile();
+  if (mpi_rank == 0) xmfFile = prepareXMFfile(baseName);
 
   /* Open HDF5 file */
   hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS);
@@ -632,10 +637,17 @@ void write_output_parallel(struct engine* e, struct UnitSystem* us,
   writeCodeDescription(h_file);
 
   /* Print the SPH parameters */
-  h_grpsph = H5Gcreate(h_file, "/SPH", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
-  if (h_grpsph < 0) error("Error while creating SPH group");
-  writeSPHflavour(h_grpsph);
-  H5Gclose(h_grpsph);
+  h_grp = H5Gcreate(h_file, "/SPH", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+  if (h_grp < 0) error("Error while creating SPH group");
+  writeSPHflavour(h_grp);
+  H5Gclose(h_grp);
+
+  /* Print the runtime parameters */
+  h_grp =
+      H5Gcreate(h_file, "/Parameters", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+  if (h_grp < 0) error("Error while creating parameters group");
+  parser_write_params_to_hdf5(e->parameter_file, h_grp);
+  H5Gclose(h_grp);
 
   /* Print the system of Units */
   writeUnitSystem(h_file, us);
diff --git a/src/parallel_io.h b/src/parallel_io.h
index f3691cb29b8d5e7f17382f1f81ba230c3898a929..26757cb679e475d6acd2ce3c408135dfe5e49081 100644
--- a/src/parallel_io.h
+++ b/src/parallel_io.h
@@ -19,6 +19,9 @@
 #ifndef SWIFT_PARALLEL_IO_H
 #define SWIFT_PARALLEL_IO_H
 
+/* Config parameters. */
+#include "../config.h"
+
 /* MPI headers. */
 #ifdef WITH_MPI
 #include <mpi.h>
@@ -36,9 +39,9 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
                       int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
                       MPI_Info info, int dry_run);
 
-void write_output_parallel(struct engine* e, struct UnitSystem* us,
-                           int mpi_rank, int mpi_size, MPI_Comm comm,
-                           MPI_Info info);
+void write_output_parallel(struct engine* e, const char* baseName,
+                           struct UnitSystem* us, int mpi_rank, int mpi_size,
+                           MPI_Comm comm, MPI_Info info);
 
 #endif
 
diff --git a/src/parser.c b/src/parser.c
index 0f767bc434ef596df403fb12d3ae0f77ea546df3..32377f877dc1796ec8bf38ae4de2e9c71b219509 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -22,14 +22,15 @@
 
 /* Some standard headers. */
 /* Needs to be included so that strtok returns char * instead of a int *. */
-#include <string.h>
-#include <stdlib.h>
 #include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
 
 /* This object's header. */
 #include "parser.h"
 
 /* Local headers. */
+#include "common_io.h"
 #include "error.h"
 
 #define PARSER_COMMENT_STRING "#"
@@ -47,7 +48,10 @@ static void parse_line(char *line, struct swift_params *params);
 static void parse_value(char *line, struct swift_params *params);
 static void parse_section_param(char *line, int *isFirstParam,
                                 char *sectionName, struct swift_params *params);
-
+static void find_duplicate_params(const struct swift_params *params,
+                                  const char *param_name);
+static void find_duplicate_section(const struct swift_params *params,
+                                   const char *section_name);
 static int lineNumber = 0;
 
 /**
@@ -65,7 +69,9 @@ void parser_read_file(const char *file_name, struct swift_params *params) {
   char line[PARSER_MAX_LINE_SIZE];
 
   /* Initialise parameter count. */
-  params->count = 0;
+  params->paramCount = 0;
+  params->sectionCount = 0;
+  strcpy(params->fileName, file_name);
 
   /* Check if parameter file exits. */
   if (file == NULL) {
@@ -142,12 +148,46 @@ static int is_empty(const char *str) {
   return retParam;
 }
 
+/**
+ * @brief Calls error() if a parameter called param_name is already stored.
+ *
+ * @param params Structure that holds the parameters read so far.
+ * @param param_name Full name of the parameter about to be stored.
+ */
+
+static void find_duplicate_params(const struct swift_params *params,
+                                  const char *param_name) {
+  for (int i = 0; i < params->paramCount; i++) {
+    if (!strcmp(param_name, params->data[i].name)) {
+      error("Invalid line:%d '%s', parameter is a duplicate.", lineNumber,
+            param_name);
+    }
+  }
+}
+
+/**
+ * @brief Calls error() if a section called section_name is already stored.
+ *
+ * @param params Structure that holds the sections read so far.
+ * @param section_name Name to look for, compared against the stored names.
+ */
+
+static void find_duplicate_section(const struct swift_params *params,
+                                   const char *section_name) {
+  for (int i = 0; i < params->sectionCount; i++) {
+    if (!strcmp(section_name, params->section[i].name)) {
+      error("Invalid line:%d '%s', section is a duplicate.", lineNumber,
+            section_name);
+    }
+  }
+}
 /**
  * @brief Parses a line from a file and stores any parameters in a structure.
  *
  * @param line Line to be parsed.
  * @param params Structure to be populated from file.
  */
+
 static void parse_line(char *line, struct swift_params *params) {
   /* Parse line if it doesn't begin with a comment. */
   if (*line != PARSER_COMMENT_CHAR) {
@@ -170,7 +210,6 @@ static void parse_line(char *line, struct swift_params *params) {
         parse_value(trim_line, params);
       }
       /* Check for invalid lines,not including the start and end of file. */
-      /* Note: strcmp returns 0 if both strings are the same.*/
       else if (strcmp(trim_line, PARSER_START_OF_FILE) &&
                strcmp(trim_line, PARSER_END_OF_FILE)) {
         error("Invalid line:%d '%s'.", lineNumber, trim_line);
@@ -193,6 +232,7 @@ static void parse_value(char *line, struct swift_params *params) {
                                                 name. */
   static int isFirstParam = 1;
   char tmpStr[PARSER_MAX_LINE_SIZE];
+  char tmpSectionName[PARSER_MAX_LINE_SIZE];
 
   char *token;
 
@@ -213,7 +253,7 @@ static void parse_value(char *line, struct swift_params *params) {
   /* Check that it is a parameter inside a section.*/
   if (*line == ' ' || *line == '\t') {
     parse_section_param(line, &isFirstParam, section, params);
-  } else {/*Else it is the start of a new section or standalone parameter. */
+  } else { /*Else it is the start of a new section or standalone parameter. */
     /* Take first token as the parameter name. */
     token = strtok(line, " :\t");
     strcpy(tmpStr, token);
@@ -223,15 +263,49 @@ static void parse_value(char *line, struct swift_params *params) {
 
     /* If second token is NULL then the line must be a section heading. */
     if (token == NULL) {
-      strcat(tmpStr, PARSER_VALUE_STRING);
-      strcpy(section, tmpStr);
+      strcpy(tmpSectionName, tmpStr);
+      strcat(tmpSectionName, PARSER_VALUE_STRING);
+
+      /* Check for duplicate section name. */
+      find_duplicate_section(params, tmpSectionName);
+
+      /* Check for duplicate standalone parameter name used as a section name.
+       */
+      find_duplicate_params(params, tmpStr);
+
+      strcpy(section, tmpSectionName);
+      strcpy(params->section[params->sectionCount].name, tmpSectionName);
+      if (params->sectionCount == PARSER_MAX_NO_OF_SECTIONS - 1) {
+        error(
+            "Maximal number of sections in parameter file reached. Aborting !");
+      } else {
+        params->sectionCount++;
+      }
       inSection = 1;
       isFirstParam = 1;
     } else {
+      /* Create string with standalone parameter name appended with ":" to aid
+       * duplicate search as section names are stored with ":" at the end.*/
+      strcpy(tmpSectionName, tmpStr);
+      strcat(tmpSectionName, PARSER_VALUE_STRING);
+
+      /* Check for duplicate parameter name. */
+      find_duplicate_params(params, tmpStr);
+
+      /* Check for duplicate section name used as standalone parameter name. */
+      find_duplicate_section(params, tmpSectionName);
+
       /* Must be a standalone parameter so no need to prefix name with a
        * section. */
-      strcpy(params->data[params->count].name, tmpStr);
-      strcpy(params->data[params->count++].value, token);
+      strcpy(params->data[params->paramCount].name, tmpStr);
+      strcpy(params->data[params->paramCount].value, token);
+      if (params->paramCount == PARSER_MAX_NO_OF_PARAMS - 1) {
+        error(
+            "Maximal number of parameters in parameter file reached. Aborting "
+            "!");
+      } else {
+        params->paramCount++;
+      }
       inSection = 0;
       isFirstParam = 1;
     }
@@ -278,8 +352,17 @@ static void parse_section_param(char *line, int *isFirstParam,
    * copy it into the parameter structure. */
   strcpy(paramName, sectionName);
   strcat(paramName, tmpStr);
-  strcpy(params->data[params->count].name, paramName);
-  strcpy(params->data[params->count++].value, token);
+
+  /* Check for duplicate parameter name. */
+  find_duplicate_params(params, paramName);
+
+  strcpy(params->data[params->paramCount].name, paramName);
+  strcpy(params->data[params->paramCount].value, token);
+  if (params->paramCount == PARSER_MAX_NO_OF_PARAMS - 1) {
+    error("Maximal number of parameters in parameter file reached. Aborting !");
+  } else {
+    params->paramCount++;
+  }
 }
 
 /**
@@ -294,8 +377,7 @@ int parser_get_param_int(const struct swift_params *params, const char *name) {
   char str[PARSER_MAX_LINE_SIZE];
   int retParam = 0;
 
-  for (int i = 0; i < params->count; i++) {
-    /*strcmp returns 0 if both strings are the same.*/
+  for (int i = 0; i < params->paramCount; i++) {
     if (!strcmp(name, params->data[i].name)) {
       /* Check that exactly one number is parsed. */
       if (sscanf(params->data[i].value, "%d%s", &retParam, str) != 1) {
@@ -309,7 +391,8 @@ int parser_get_param_int(const struct swift_params *params, const char *name) {
     }
   }
 
-  error("Cannot find '%s' in the structure.", name);
+  error("Cannot find '%s' in the structure, in file '%s'.", name,
+        params->fileName);
   return 0;
 }
 
@@ -326,8 +409,7 @@ char parser_get_param_char(const struct swift_params *params,
   char str[PARSER_MAX_LINE_SIZE];
   char retParam = 0;
 
-  for (int i = 0; i < params->count; i++) {
-    /*strcmp returns 0 if both strings are the same.*/
+  for (int i = 0; i < params->paramCount; i++) {
     if (!strcmp(name, params->data[i].name)) {
       /* Check that exactly one number is parsed. */
       if (sscanf(params->data[i].value, "%c%s", &retParam, str) != 1) {
@@ -341,7 +423,8 @@ char parser_get_param_char(const struct swift_params *params,
     }
   }
 
-  error("Cannot find '%s' in the structure.", name);
+  error("Cannot find '%s' in the structure, in file '%s'.", name,
+        params->fileName);
   return 0;
 }
 
@@ -358,8 +441,7 @@ float parser_get_param_float(const struct swift_params *params,
   char str[PARSER_MAX_LINE_SIZE];
   float retParam = 0.f;
 
-  for (int i = 0; i < params->count; i++) {
-    /*strcmp returns 0 if both strings are the same.*/
+  for (int i = 0; i < params->paramCount; i++) {
     if (!strcmp(name, params->data[i].name)) {
       /* Check that exactly one number is parsed. */
       if (sscanf(params->data[i].value, "%f%s", &retParam, str) != 1) {
@@ -373,7 +455,8 @@ float parser_get_param_float(const struct swift_params *params,
     }
   }
 
-  error("Cannot find '%s' in the structure.", name);
+  error("Cannot find '%s' in the structure, in file '%s'.", name,
+        params->fileName);
   return 0.f;
 }
 
@@ -390,8 +473,7 @@ double parser_get_param_double(const struct swift_params *params,
   char str[PARSER_MAX_LINE_SIZE];
   double retParam = 0.;
 
-  for (int i = 0; i < params->count; i++) {
-    /*strcmp returns 0 if both strings are the same.*/
+  for (int i = 0; i < params->paramCount; i++) {
     if (!strcmp(name, params->data[i].name)) {
       /* Check that exactly one number is parsed. */
       if (sscanf(params->data[i].value, "%lf%s", &retParam, str) != 1) {
@@ -404,7 +486,8 @@ double parser_get_param_double(const struct swift_params *params,
     }
   }
 
-  error("Cannot find '%s' in the structure.", name);
+  error("Cannot find '%s' in the structure, in file '%s'.", name,
+        params->fileName);
   return 0.;
 }
 
@@ -417,8 +500,7 @@ double parser_get_param_double(const struct swift_params *params,
  */
 void parser_get_param_string(const struct swift_params *params,
                              const char *name, char *retParam) {
-  for (int i = 0; i < params->count; i++) {
-    /*strcmp returns 0 if both strings are the same.*/
+  for (int i = 0; i < params->paramCount; i++) {
     if (!strcmp(name, params->data[i].name)) {
       strcpy(retParam, params->data[i].value);
       return;
@@ -428,6 +510,150 @@ void parser_get_param_string(const struct swift_params *params,
   error("Cannot find '%s' in the structure.", name);
 }
 
+/**
+ * @brief Retrieve optional integer parameter from structure.
+ *
+ * @param params Structure that holds the parameters
+ * @param name Name of the parameter to be found
+ * @param def Default value returned if the parameter is not found.
+ * @return Parsed value of the parameter if present, def otherwise
+ */
+int parser_get_opt_param_int(const struct swift_params *params,
+                             const char *name, int def) {
+  /* Start with an empty string: sscanf() may fail before writing to str. */
+  char str[PARSER_MAX_LINE_SIZE] = "";
+  int retParam = 0;
+
+  for (int i = 0; i < params->paramCount; i++) {
+    if (!strcmp(name, params->data[i].name)) {
+      /* Check that exactly one number is parsed. */
+      if (sscanf(params->data[i].value, "%d%s", &retParam, str) != 1) {
+        error(
+            "Tried parsing int '%s' but found '%s' with illegal integer "
+            "characters '%s'.",
+            params->data[i].name, params->data[i].value, str);
+      }
+
+      return retParam;
+    }
+  }
+
+  return def;
+}
+
+/**
+ * @brief Retrieve optional char parameter from structure.
+ *
+ * @param params Structure that holds the parameters
+ * @param name Name of the parameter to be found
+ * @param def Default value returned if the parameter is not found.
+ * @return Parsed value of the parameter if present, def otherwise
+ */
+char parser_get_opt_param_char(const struct swift_params *params,
+                               const char *name, char def) {
+  /* Start with an empty string: sscanf() may fail before writing to str. */
+  char str[PARSER_MAX_LINE_SIZE] = "";
+  char retParam = 0;
+
+  for (int i = 0; i < params->paramCount; i++) {
+    if (!strcmp(name, params->data[i].name)) {
+      /* Check that exactly one character is parsed. */
+      if (sscanf(params->data[i].value, "%c%s", &retParam, str) != 1) {
+        error(
+            "Tried parsing char '%s' but found '%s' with illegal char "
+            "characters '%s'.",
+            params->data[i].name, params->data[i].value, str);
+      }
+
+      return retParam;
+    }
+  }
+
+  return def;
+}
+
+/**
+ * @brief Retrieve optional float parameter from structure.
+ *
+ * @param params Structure that holds the parameters
+ * @param name Name of the parameter to be found
+ * @param def Default value returned if the parameter is not found.
+ * @return Parsed value of the parameter if present, def otherwise
+ */
+float parser_get_opt_param_float(const struct swift_params *params,
+                                 const char *name, float def) {
+  /* Start with an empty string: sscanf() may fail before writing to str. */
+  char str[PARSER_MAX_LINE_SIZE] = "";
+  float retParam = 0.f;
+
+  for (int i = 0; i < params->paramCount; i++) {
+    if (!strcmp(name, params->data[i].name)) {
+      /* Check that exactly one number is parsed. */
+      if (sscanf(params->data[i].value, "%f%s", &retParam, str) != 1) {
+        error(
+            "Tried parsing float '%s' but found '%s' with illegal float "
+            "characters '%s'.",
+            params->data[i].name, params->data[i].value, str);
+      }
+
+      return retParam;
+    }
+  }
+
+  return def;
+}
+
+/**
+ * @brief Retrieve optional double parameter from structure.
+ *
+ * @param params Structure that holds the parameters
+ * @param name Name of the parameter to be found
+ * @param def Default value returned if the parameter is not found.
+ * @return Parsed value of the parameter if present, def otherwise
+ */
+double parser_get_opt_param_double(const struct swift_params *params,
+                                   const char *name, double def) {
+  /* Start with an empty string: sscanf() may fail before writing to str. */
+  char str[PARSER_MAX_LINE_SIZE] = "";
+  double retParam = 0.;
+
+  for (int i = 0; i < params->paramCount; i++) {
+    if (!strcmp(name, params->data[i].name)) {
+      /* Check that exactly one number is parsed. */
+      if (sscanf(params->data[i].value, "%lf%s", &retParam, str) != 1) {
+        error(
+            "Tried parsing double '%s' but found '%s' with illegal double "
+            "characters '%s'.",
+            params->data[i].name, params->data[i].value, str);
+      }
+      return retParam;
+    }
+  }
+
+  return def;
+}
+
+/**
+ * @brief Retrieve optional string parameter from structure.
+ *
+ * @param params Structure that holds the parameters
+ * @param name Name of the parameter to be found
+ * @param def Default value copied to retParam if the parameter is not found.
+ * @param retParam (return) Value found or def; filled by an unbounded strcpy
+ */
+void parser_get_opt_param_string(const struct swift_params *params,
+                                 const char *name, char *retParam,
+                                 const char *def) {
+  for (int i = 0; i < params->paramCount; i++) {
+    if (!strcmp(name, params->data[i].name)) {
+      strcpy(retParam, params->data[i].value);
+      return;
+    }
+  }
+  /* Not found: use the default. The caller must size retParam to hold def. */
+  strcpy(retParam, def);
+}
+
 /**
  * @brief Prints the contents of the parameter structure.
  *
@@ -438,7 +664,7 @@ void parser_print_params(const struct swift_params *params) {
   printf("|  SWIFT Parameter File  |\n");
   printf("--------------------------\n");
 
-  for (int i = 0; i < params->count; i++) {
+  for (int i = 0; i < params->paramCount; i++) {
     printf("Parameter name: %s\n", params->data[i].name);
     printf("Parameter value: %s\n", params->data[i].value);
   }
@@ -461,7 +687,7 @@ void parser_write_params_to_file(const struct swift_params *params,
   /* Start of file identifier in YAML. */
   fprintf(file, "%s\n", PARSER_START_OF_FILE);
 
-  for (int i = 0; i < params->count; i++) {
+  for (int i = 0; i < params->paramCount; i++) {
     /* Check that the parameter name contains a section name. */
     if (strchr(params->data[i].name, PARSER_VALUE_CHAR)) {
       /* Copy the parameter name into a temporary string and find the section
@@ -478,7 +704,7 @@ void parser_write_params_to_file(const struct swift_params *params,
       /* Remove white space from parameter name and write it to the file. */
       token = strtok(NULL, " #\n");
 
-      fprintf(file, "\t%s%c %s\n", token, PARSER_VALUE_CHAR,
+      fprintf(file, "  %s%c %s\n", token, PARSER_VALUE_CHAR,
               params->data[i].value);
     } else {
       fprintf(file, "\n%s%c %s\n", params->data[i].name, PARSER_VALUE_CHAR,
@@ -491,3 +717,11 @@ void parser_write_params_to_file(const struct swift_params *params,
 
   fclose(file);
 }
+
+#if defined(HAVE_HDF5)
+void parser_write_params_to_hdf5(const struct swift_params *params, hid_t grp) {
+  /* Hand every stored (name, value) pair to writeAttribute_s() for grp. */
+  for (int i = 0; i < params->paramCount; i++)
+    writeAttribute_s(grp, params->data[i].name, params->data[i].value);
+}
+#endif
diff --git a/src/parser.h b/src/parser.h
index 7b2088ae12cdd5136a96baeabd01dd80255c8a3b..b78e21194d256ed7b50b8a09718c9725d52a1e0b 100644
--- a/src/parser.h
+++ b/src/parser.h
@@ -22,9 +22,14 @@
 /* Config parameters. */
 #include "../config.h"
 
+#if defined(HAVE_HDF5)
+#include <hdf5.h>
+#endif
+
 /* Some constants. */
 #define PARSER_MAX_LINE_SIZE 256
-#define PARSER_MAX_NO_OF_PARAMS 512
+#define PARSER_MAX_NO_OF_PARAMS 256
+#define PARSER_MAX_NO_OF_SECTIONS 64
 
 /* A parameter in the input file */
 struct parameter {
@@ -32,10 +37,17 @@ struct parameter {
   char value[PARSER_MAX_LINE_SIZE];
 };
 
+struct section {
+  char name[PARSER_MAX_LINE_SIZE]; /* Name of the section */
+};
+
 /* The array of parameters read from a file */
 struct swift_params {
+  struct section section[PARSER_MAX_NO_OF_SECTIONS];
   struct parameter data[PARSER_MAX_NO_OF_PARAMS];
-  int count;
+  int sectionCount; /* presumably the number of section[] entries in use */
+  int paramCount;   /* Number of data[] entries in use */
+  char fileName[PARSER_MAX_LINE_SIZE]; /* File name quoted in error messages */
 };
 
 /* Public API. */
@@ -43,6 +55,7 @@ void parser_read_file(const char *file_name, struct swift_params *params);
 void parser_print_params(const struct swift_params *params);
 void parser_write_params_to_file(const struct swift_params *params,
                                  const char *file_name);
+
 char parser_get_param_char(const struct swift_params *params, const char *name);
 int parser_get_param_int(const struct swift_params *params, const char *name);
 float parser_get_param_float(const struct swift_params *params,
@@ -52,4 +65,20 @@ double parser_get_param_double(const struct swift_params *params,
 void parser_get_param_string(const struct swift_params *params,
                              const char *name, char *retParam);
 
+char parser_get_opt_param_char(const struct swift_params *params,
+                               const char *name, char def);
+int parser_get_opt_param_int(const struct swift_params *params,
+                             const char *name, int def);
+float parser_get_opt_param_float(const struct swift_params *params,
+                                 const char *name, float def);
+double parser_get_opt_param_double(const struct swift_params *params,
+                                   const char *name, double def);
+void parser_get_opt_param_string(const struct swift_params *params,
+                                 const char *name, char *retParam,
+                                 const char *def);
+
+#if defined(HAVE_HDF5)
+void parser_write_params_to_hdf5(const struct swift_params *params, hid_t grp);
+#endif
+
 #endif /* SWIFT_PARSER_H */
diff --git a/src/part.h b/src/part.h
index 1fba171a46cecb7df6ea20ff28ba3bbaefecc7d1..efca7b6b5bef49f20df1e2c45b30f65ecbbf4960 100644
--- a/src/part.h
+++ b/src/part.h
@@ -22,8 +22,7 @@
 /* Config parameters. */
 #include "../config.h"
 
-/* Some standard headers. */
-#include <stdlib.h>
+/* Standard headers. */
 #include <stddef.h>
 
 /* MPI headers. */
@@ -39,7 +38,7 @@
 #define xpart_align 32
 #define gpart_align 32
 
-/* Import the right particle definition */
+/* Import the right hydro particle definition */
 #if defined(MINIMAL_SPH)
 #include "./hydro/Minimal/hydro_part.h"
 #elif defined(GADGET2_SPH)
@@ -50,6 +49,7 @@
 #error "Invalid choice of SPH variant"
 #endif
 
+/* Import the right gravity particle definition */
 #include "./gravity/Default/gravity_part.h"
 
 void part_relink_gparts(struct part *parts, size_t N, ptrdiff_t offset);
diff --git a/src/partition.c b/src/partition.c
index c4145422de72cd03d5e76c3e5b2a9ac217420d47..6df437826de796c05143b2003dbdacb971d9b7be 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -31,11 +31,11 @@
 #include "../config.h"
 
 /* Standard headers. */
+#include <float.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
-#include <float.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
@@ -59,13 +59,12 @@
 
 /* Simple descriptions of initial partition types for reports. */
 const char *initial_partition_name[] = {
-    "gridded cells",                 "vectorized point associated cells",
+    "gridded cells", "vectorized point associated cells",
     "METIS particle weighted cells", "METIS unweighted cells"};
 
 /* Simple descriptions of repartition types for reports. */
 const char *repartition_name[] = {
-    "no",
-    "METIS edge and vertex time weighted cells",
+    "no", "METIS edge and vertex time weighted cells",
     "METIS particle count vertex weighted cells",
     "METIS time edge weighted cells",
     "METIS particle count vertex and time edge cells"};
@@ -454,9 +453,9 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
 
     /* Skip un-interesting tasks. */
     if (t->type != task_type_self && t->type != task_type_pair &&
-        t->type != task_type_sub && t->type != task_type_ghost &&
-        t->type != task_type_drift && t->type != task_type_kick &&
-        t->type != task_type_init)
+        t->type != task_type_sub_self && t->type != task_type_sub_pair &&
+        t->type != task_type_ghost && t->type != task_type_drift &&
+        t->type != task_type_kick && t->type != task_type_init)
       continue;
 
     /* Get the task weight. */
@@ -497,15 +496,15 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
 
     /* Self interaction? */
     else if ((t->type == task_type_self && ci->nodeID == nodeID) ||
-             (t->type == task_type_sub && cj == NULL && ci->nodeID == nodeID)) {
+             (t->type == task_type_sub_self && cj == NULL &&
+              ci->nodeID == nodeID)) {
       /* Self interactions add only to vertex weight. */
       if (taskvweights) weights_v[cid] += w;
 
     }
 
     /* Pair? */
-    else if (t->type == task_type_pair ||
-             (t->type == task_type_sub && cj != NULL)) {
+    else if (t->type == task_type_pair || (t->type == task_type_sub_pair)) {
       /* In-cell pair? */
       if (ci == cj) {
         /* Add weight to vertex for ci. */
@@ -782,8 +781,9 @@ void partition_initial_partition(struct partition *initial_partition,
     struct cell *c;
 
     /* If we've got the wrong number of nodes, fail. */
-    if (nr_nodes != initial_partition->grid[0] * initial_partition->grid[1] *
-                        initial_partition->grid[2])
+    if (nr_nodes !=
+        initial_partition->grid[0] * initial_partition->grid[1] *
+            initial_partition->grid[2])
       error("Grid size does not match number of nodes.");
 
     /* Run through the cells and set their nodeID. */
@@ -792,8 +792,9 @@ void partition_initial_partition(struct partition *initial_partition,
       c = &s->cells[k];
       for (j = 0; j < 3; j++)
         ind[j] = c->loc[j] / s->dim[j] * initial_partition->grid[j];
-      c->nodeID = ind[0] + initial_partition->grid[0] *
-                               (ind[1] + initial_partition->grid[1] * ind[2]);
+      c->nodeID = ind[0] +
+                  initial_partition->grid[0] *
+                      (ind[1] + initial_partition->grid[1] * ind[2]);
       // message("cell at [%e,%e,%e]: ind = [%i,%i,%i], nodeID = %i", c->loc[0],
       // c->loc[1], c->loc[2], ind[0], ind[1], ind[2], c->nodeID);
     }
@@ -921,11 +922,12 @@ void partition_init(struct partition *partition,
 
 /* Defaults make use of METIS if available */
 #ifdef HAVE_METIS
-  *reparttype = REPART_METIS_BOTH;
-  partition->type = INITPART_METIS_NOWEIGHT;
+  char default_repart = 'b';
+  ;
+  char default_part = 'm';
 #else
-  *reparttype = REPART_NONE;
-  partition->type = INITPART_GRID;
+  char default_repart = 'n';
+  char default_part = 'g';
 #endif
 
   /* Set a default grid so that grid[0]*grid[1]*grid[2] == nr_nodes. */
@@ -935,9 +937,9 @@ void partition_init(struct partition *partition,
   factor(partition->grid[0] * partition->grid[1], &partition->grid[1],
          &partition->grid[0]);
 
-  /* Now let's check what the user wants as an initial domain*/
-  const char part_type =
-      parser_get_param_char(params, "DomainDecomposition:initial_type");
+  /* Now let's check what the user wants as an initial domain. */
+  const char part_type = parser_get_opt_param_char(
+      params, "DomainDecomposition:initial_type", default_part);
 
   switch (part_type) {
     case 'g':
@@ -965,17 +967,17 @@ void partition_init(struct partition *partition,
 
   /* In case of grid, read more parameters */
   if (part_type == 'g') {
-    partition->grid[0] =
-        parser_get_param_int(params, "DomainDecomposition:initial_grid_x");
-    partition->grid[1] =
-        parser_get_param_int(params, "DomainDecomposition:initial_grid_y");
-    partition->grid[2] =
-        parser_get_param_int(params, "DomainDecomposition:initial_grid_z");
+    partition->grid[0] = parser_get_opt_param_int(
+        params, "DomainDecomposition:initial_grid_x", partition->grid[0]);
+    partition->grid[1] = parser_get_opt_param_int(
+        params, "DomainDecomposition:initial_grid_y", partition->grid[1]);
+    partition->grid[2] = parser_get_opt_param_int(
+        params, "DomainDecomposition:initial_grid_z", partition->grid[2]);
   }
 
   /* Now let's check what the user wants as a repartition strategy */
-  const char repart_type =
-      parser_get_param_char(params, "DomainDecomposition:repartition_type");
+  const char repart_type = parser_get_opt_param_char(
+      params, "DomainDecomposition:repartition_type", default_repart);
 
   switch (repart_type) {
     case 'n':
@@ -1044,7 +1046,6 @@ static int check_complete(struct space *s, int verbose, int nregions) {
   return (!failed);
 }
 
-
 /**
  * @brief Partition a space of cells based on another space of cells.
  *
@@ -1062,7 +1063,8 @@ static int check_complete(struct space *s, int verbose, int nregions) {
  *
  * @param oldh the cell dimensions of old space.
  * @param oldcdim number of cells per dimension in old space.
- * @param oldnodeIDs the nodeIDs of cells in the old space, indexed by old cellid.
+ * @param oldnodeIDs the nodeIDs of cells in the old space, indexed by old
+ * cellid.
  * @param s the space to be partitioned.
  *
  * @return 1 if the new space contains nodeIDs from all nodes, 0 otherwise.
diff --git a/src/physical_constants.c b/src/physical_constants.c
new file mode 100644
index 0000000000000000000000000000000000000000..d00d63df1c1e6a4821ce4ba50dfef9c4e0def9d2
--- /dev/null
+++ b/src/physical_constants.c
@@ -0,0 +1,119 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Tom Theuns (tom.theuns@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* This object's header. */
+#include "physical_constants.h"
+
+/* Local headers. */
+#include "error.h"
+#include "physical_constants_cgs.h"
+
+/**
+ * @brief Converts physical constants to the internal unit system
+ *
+ * @param us The current internal system of units.
+ * @param internal_const The physical constants to initialize.
+ */
+void phys_const_init(struct UnitSystem* us, struct phys_const* internal_const) {
+  /* Each CGS value is divided by the conversion factor of its dimension. */
+  /* Units are declared as {U_M, U_L, U_t, U_I, U_T} */
+
+  const float dimension_G[5] = {-1, 3, -2, 0, 0}; /* [g^-1 cm^3 s^-2] */
+  internal_const->const_newton_G =
+      const_newton_G_cgs / units_general_conversion_factor(us, dimension_G);
+
+  const float dimension_c[5] = {0, 1, -1, 0, 0}; /* [cm s^-1] */
+  internal_const->const_speed_light_c =
+      const_speed_light_c_cgs /
+      units_general_conversion_factor(us, dimension_c);
+
+  const float dimension_h[5] = {1, 2, -1, 0, 0}; /* [g cm^2 s^-1] */
+  internal_const->const_planck_h =
+      const_planck_h_cgs / units_general_conversion_factor(us, dimension_h);
+  internal_const->const_planck_hbar =
+      const_planck_hbar_cgs / units_general_conversion_factor(us, dimension_h);
+
+  const float dimension_k[5] = {1, 2, -2, 0, -1}; /* [g cm^2 s^-2 K^-1] */
+  internal_const->const_boltzmann_k =
+      const_boltzmann_k_cgs / units_general_conversion_factor(us, dimension_k);
+
+  const float dimension_thomson[5] = {0, 2, 0, 0, 0}; /* [cm^2] */
+  internal_const->const_thomson_cross_section =
+      const_thomson_cross_section_cgs /
+      units_general_conversion_factor(us, dimension_thomson);
+
+  const float dimension_ev[5] = {1, 2, -2, 0, 0}; /* [g cm^2 s^-2] */
+  internal_const->const_electron_volt =
+      const_electron_volt_cgs /
+      units_general_conversion_factor(us, dimension_ev);
+
+  const float dimension_charge[5] = {0, 0, 1, 1, 0}; /* [A s] */
+  internal_const->const_electron_charge =
+      const_electron_charge_cgs /
+      units_general_conversion_factor(us, dimension_charge);
+
+  const float dimension_mass[5] = {1, 0, 0, 0, 0}; /* [g] */
+  internal_const->const_electron_mass =
+      const_electron_mass_cgs /
+      units_general_conversion_factor(us, dimension_mass);
+  internal_const->const_proton_mass =
+      const_proton_mass_cgs /
+      units_general_conversion_factor(us, dimension_mass);
+  internal_const->const_solar_mass =
+      const_solar_mass_cgs /
+      units_general_conversion_factor(us, dimension_mass);
+  internal_const->const_earth_mass =
+      const_earth_mass_cgs /
+      units_general_conversion_factor(us, dimension_mass);
+
+  const float dimension_time[5] = {0, 0, 1, 0, 0}; /* [s] */
+  internal_const->const_year =
+      const_year_cgs / units_general_conversion_factor(us, dimension_time);
+
+  const float dimension_length[5] = {0, 1, 0, 0, 0}; /* [cm] */
+  internal_const->const_astronomical_unit =
+      const_astronomical_unit_cgs /
+      units_general_conversion_factor(us, dimension_length);
+  internal_const->const_parsec =
+      const_parsec_cgs / units_general_conversion_factor(us, dimension_length);
+  internal_const->const_light_year =
+      const_light_year_cgs /
+      units_general_conversion_factor(us, dimension_length);
+}
+
+void phys_const_print(struct phys_const* internal_const) {
+  /* Report a subset of the stored constants, one message() call each. */
+  message("%25s = %e", "Gravitational constant",
+          internal_const->const_newton_G);
+  message("%25s = %e", "Speed of light", internal_const->const_speed_light_c);
+  message("%25s = %e", "Planck constant", internal_const->const_planck_h);
+  message("%25s = %e", "Boltzmann constant", internal_const->const_boltzmann_k);
+  message("%25s = %e", "Thomson cross-section",
+          internal_const->const_thomson_cross_section);
+  message("%25s = %e", "Electron-Volt", internal_const->const_electron_volt);
+  message("%25s = %e", "Year", internal_const->const_year);
+  message("%25s = %e", "Astronomical Unit",
+          internal_const->const_astronomical_unit);
+  message("%25s = %e", "Parsec", internal_const->const_parsec);
+  message("%25s = %e", "Solar mass", internal_const->const_solar_mass);
+}
diff --git a/src/physical_constants.h b/src/physical_constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..aac0fa0840e08a1140bbab67e944f7f134f4222b
--- /dev/null
+++ b/src/physical_constants.h
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Tom Theuns (tom.theuns@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_PHYSICAL_CONSTANTS_H
+#define SWIFT_PHYSICAL_CONSTANTS_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Local includes. */
+#include "units.h"
+
+/* Physical constants expressed in the internal system of units */
+struct phys_const {
+
+  /* Newton's gravitational constant */
+  double const_newton_G;
+
+  /* Speed of light in vacuum */
+  double const_speed_light_c;
+
+  /* Planck's constant */
+  double const_planck_h;
+
+  /* Planck's reduced constant */
+  double const_planck_hbar;
+
+  /* Boltzmann's constant */
+  double const_boltzmann_k;
+
+  /* Thomson cross-section */
+  double const_thomson_cross_section;
+
+  /* Charge of the electron */
+  double const_electron_charge;
+
+  /* Electron-Volt */
+  double const_electron_volt;
+
+  /* Mass of the electron */
+  double const_electron_mass;
+
+  /* Mass of the proton */
+  double const_proton_mass;
+
+  /* (Tropical) Year */
+  double const_year;
+
+  /* Astronomical unit */
+  double const_astronomical_unit;
+
+  /* Parsec */
+  double const_parsec;
+
+  /* Light-year */
+  double const_light_year;
+
+  /* Mass of the Sun */
+  double const_solar_mass;
+
+  /* Mass of the Earth */
+  double const_earth_mass;
+};
+
+void phys_const_init(struct UnitSystem* us, struct phys_const* internal_const);
+
+void phys_const_print(struct phys_const* internal_const);
+
+#endif /* SWIFT_PHYSICAL_CONSTANTS_H */
diff --git a/src/physical_constants_cgs.h b/src/physical_constants_cgs.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bd435bde9f0f80c06f0de5fee980f080c2f57d3
--- /dev/null
+++ b/src/physical_constants_cgs.h
@@ -0,0 +1,81 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Tom Theuns (tom.theuns@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_PHYSICAL_CONSTANTS_CGS_H
+#define SWIFT_PHYSICAL_CONSTANTS_CGS_H
+
+/* The constants declared in this file should _NOT_ be used directly */
+/* Users should use the converted values in the phys_const structure */
+/* where all the constants are defined in the system of units specified */
+/* in the parameter file. */
+
+/* All values are taken from K.A. Olive et al. (Particle Data Group), Chin. */
+/* Phys. C, 38, 090001 (2014) and 2015 update. */
+/* http://pdg.lbl.gov/2015/reviews/rpp2015-rev-phys-constants.pdf */
+/* http://pdg.lbl.gov/2015/reviews/rpp2015-rev-astrophysical-constants.pdf */
+
+/* Newton's gravitational constant */
+const double const_newton_G_cgs = 6.67408e-8; /* g^-1 cm^3 s^-2 */
+
+/* Speed of light in vacuum */
+const double const_speed_light_c_cgs = 2.99792458e10; /* cm s^-1 */
+
+/* Planck's constant */
+const double const_planck_h_cgs = 6.626070040e-27; /* g cm^2 s^-1 */
+
+/* Planck's reduced constant */
+const double const_planck_hbar_cgs = 1.054571800e-27; /* g cm^2 s^-1 */
+
+/* Boltzmann's constant */
+const double const_boltzmann_k_cgs = 1.38064852e-16; /* g cm^2 s^-2 K^-1 */
+
+/* Thomson cross-section */
+const double const_thomson_cross_section_cgs = 6.6524587158e-25; /* cm^2 */
+
+/* Elementary charge */
+const double const_electron_charge_cgs = 1.6021766208e-19; /* A s */
+
+/* Electron-Volt */
+const double const_electron_volt_cgs = 1.6021766208e-12; /* g cm^2 s^-2 */
+
+/* Mass of the electron */
+const double const_electron_mass_cgs = 9.10938356e-28; /* g */
+
+/* Mass of the proton */
+const double const_proton_mass_cgs = 1.672621898e-24; /* g */
+
+/* Tropical year */
+const double const_year_cgs = 3.15569252e7; /* s */
+
+/* Astronomical unit */
+const double const_astronomical_unit_cgs = 1.49597870700e13; /* cm */
+
+/* Parsec */
+const double const_parsec_cgs = 3.08567758149e18; /* cm */
+
+/* Light-year */
+const double const_light_year_cgs = 9.46053e17; /* cm */
+
+/* Mass of the Sun */
+const double const_solar_mass_cgs = 1.9885e33; /* g */
+
+/* Mass of the Earth */
+const double const_earth_mass_cgs = 5.9726e27; /* g */
+
+#endif /* SWIFT_PHYSICAL_CONSTANTS_CGS_H */
diff --git a/src/potentials.c b/src/potentials.c
new file mode 100644
index 0000000000000000000000000000000000000000..98e57d7959963e07cfbbd2fdb60a0df504e6a487
--- /dev/null
+++ b/src/potentials.c
@@ -0,0 +1,65 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Tom Theuns (tom.theuns@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* This object's header. */
+#include "potentials.h"
+
+/**
+ * @brief Initialises the external potential properties from the parameter
+ * file.
+ *
+ * @param parameter_file The parsed parameter file
+ * @param us The current internal system of units (not referenced in the body)
+ * @param potential The external potential properties to initialize
+ */
+void potential_init(const struct swift_params* parameter_file,
+                    struct UnitSystem* us,
+                    struct external_potential* potential) {
+
+#ifdef EXTERNAL_POTENTIAL_POINTMASS
+  /* Values are stored as read; the parser calls error() on a missing name. */
+  potential->point_mass.x =
+      parser_get_param_double(parameter_file, "PointMass:position_x");
+  potential->point_mass.y =
+      parser_get_param_double(parameter_file, "PointMass:position_y");
+  potential->point_mass.z =
+      parser_get_param_double(parameter_file, "PointMass:position_z");
+  potential->point_mass.mass =
+      parser_get_param_double(parameter_file, "PointMass:mass");
+
+#endif /* EXTERNAL_POTENTIAL_POINTMASS */
+}
+
+/**
+ * @brief Reports the properties of the external potential using message().
+ *
+ * @param potential The external potential properties.
+ */
+void potential_print(const struct external_potential* potential) {
+  /* Nothing is reported unless EXTERNAL_POTENTIAL_POINTMASS is defined. */
+#ifdef EXTERNAL_POTENTIAL_POINTMASS
+  message("Point mass properties are (x,y,z) = (%e, %e, %e), M = %e",
+          potential->point_mass.x, potential->point_mass.y,
+          potential->point_mass.z, potential->point_mass.mass);
+#endif /* EXTERNAL_POTENTIAL_POINTMASS */
+}
diff --git a/src/potentials.h b/src/potentials.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3db5a3d2231c93b1d83aa94bca83f7f7d7106bc
--- /dev/null
+++ b/src/potentials.h
@@ -0,0 +1,122 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Tom Theuns (tom.theuns@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+#ifndef SWIFT_POTENTIALS_H
+#define SWIFT_POTENTIALS_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
+#include <math.h>
+
+/* Local includes. */
+#include "const.h"
+#include "error.h"
+#include "parser.h"
+#include "part.h"
+#include "physical_constants.h"
+#include "units.h"
+
+/* External potential properties; only enabled potentials contribute fields */
+struct external_potential {
+
+#ifdef EXTERNAL_POTENTIAL_POINTMASS
+  struct {
+    double x, y, z; /* Position, read from "PointMass:position_{x,y,z}" */
+    double mass;    /* Mass, read from "PointMass:mass" */
+  } point_mass;
+#endif
+};
+
+/* Include external pointmass potential */
+#ifdef EXTERNAL_POTENTIAL_POINTMASS
+
+#define EXTERNAL_GRAVITY_TIMESTEP_PREFACTOR 0.03f
+
+/**
+ * @brief Computes the time-step due to the acceleration from a point mass
+ *
+ * @param potential The properties of the external potential.
+ * @param phys_const The physical constants in internal units.
+ * @param g Pointer to the g-particle data.
+ * @return Prefactor times |a| / |da/dt|, or HUGE_VALF if |da/dt| is not > 0.
+ */
+__attribute__((always_inline)) INLINE static float
+external_gravity_pointmass_timestep(const struct external_potential* potential,
+                                    const struct phys_const* const phys_const,
+                                    const struct gpart* const g) {
+
+  const float G_newton = phys_const->const_newton_G;
+  const float dx = g->x[0] - potential->point_mass.x;
+  const float dy = g->x[1] - potential->point_mass.y;
+  const float dz = g->x[2] - potential->point_mass.z;
+  const float rinv = 1.f / sqrtf(dx * dx + dy * dy + dz * dz);
+  const float drdv = (g->x[0] - potential->point_mass.x) * (g->v_full[0]) +
+                     (g->x[1] - potential->point_mass.y) * (g->v_full[1]) +
+                     (g->x[2] - potential->point_mass.z) * (g->v_full[2]);
+  const double pref =
+      G_newton * potential->point_mass.mass * rinv * rinv * rinv; /* GM/r^3 */
+  const float dota_x = pref * (-g->v_full[0] + 3.f * rinv * rinv * drdv * dx);
+  const float dota_y = pref * (-g->v_full[1] + 3.f * rinv * rinv * drdv * dy);
+  const float dota_z = pref * (-g->v_full[2] + 3.f * rinv * rinv * drdv * dz);
+  const float dota_2 = dota_x * dota_x + dota_y * dota_y + dota_z * dota_z;
+  const float a_2 = g->a_grav[0] * g->a_grav[0] + g->a_grav[1] * g->a_grav[1] +
+                    g->a_grav[2] * g->a_grav[2];
+  if (!(dota_2 > 0.f)) return HUGE_VALF; /* no jerk: no limit, no 0/0 */
+  return EXTERNAL_GRAVITY_TIMESTEP_PREFACTOR * sqrtf(a_2 / dota_2);
+}
+
+/**
+ * @brief Adds the gravitational pull of the external point mass to the
+ * acceleration of a particle.
+ *
+ * @param potential The properties of the external potential.
+ * @param phys_const The physical constants in internal units.
+ * @param g Pointer to the g-particle data; its a_grav is updated in place.
+ */
+__attribute__((always_inline)) INLINE static void external_gravity_pointmass(
+    const struct external_potential* potential,
+    const struct phys_const* const phys_const, struct gpart* g) {
+
+  const float G = phys_const->const_newton_G;
+  const float rx = g->x[0] - potential->point_mass.x;
+  const float ry = g->x[1] - potential->point_mass.y;
+  const float rz = g->x[2] - potential->point_mass.z;
+  const float inv_r = 1.f / sqrtf(rx * rx + ry * ry + rz * rz);
+
+  /* -G M is common to the three components; each one is -G M r_i / r^3. */
+  const double minus_GM = -G * potential->point_mass.mass;
+
+  g->a_grav[0] += minus_GM * rx * inv_r * inv_r * inv_r;
+  g->a_grav[1] += minus_GM * ry * inv_r * inv_r * inv_r;
+  g->a_grav[2] += minus_GM * rz * inv_r * inv_r * inv_r;
+}
+#endif /* EXTERNAL_POTENTIAL_POINTMASS */
+
+/* Now, some generic functions, defined in the source file */
+
+void potential_init(const struct swift_params* parameter_file,
+                    struct UnitSystem* us,
+                    struct external_potential* potential);
+
+void potential_print(const struct external_potential* potential);
+
+#endif /* SWIFT_POTENTIALS_H */
diff --git a/src/proxy.c b/src/proxy.c
index 02263a5653bdcdd2d1bf0a86523ed1a599d4bf21..efe3a3eec108d44d5b9bf8b4718dc025464f8762 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -249,8 +249,8 @@ void proxy_parts_exch2(struct proxy *p) {
     } while (p->nr_parts_in > p->size_parts_in);
     free(p->parts_in);
     free(p->xparts_in);
-    if ((p->parts_in = (struct part *)malloc(
-             sizeof(struct part) *p->size_parts_in)) == NULL ||
+    if ((p->parts_in = (struct part *)malloc(sizeof(struct part) *
+                                             p->size_parts_in)) == NULL ||
         (p->xparts_in = (struct xpart *)malloc(sizeof(struct xpart) *
                                                p->size_parts_in)) == NULL)
       error("Failed to re-allocate parts_in buffers.");
@@ -310,7 +310,7 @@ void proxy_parts_load(struct proxy *p, const struct part *parts,
     } while (p->nr_parts_out + N > p->size_parts_out);
     struct part *tp;
     struct xpart *txp;
-    if ((tp = (struct part *)malloc(sizeof(struct part) *p->size_parts_out)) ==
+    if ((tp = (struct part *)malloc(sizeof(struct part) * p->size_parts_out)) ==
             NULL ||
         (txp = (struct xpart *)malloc(sizeof(struct xpart) *
                                       p->size_parts_out)) == NULL)
@@ -395,8 +395,8 @@ void proxy_init(struct proxy *p, int mynodeID, int nodeID) {
   /* Allocate the part send and receive buffers, if needed. */
   if (p->parts_in == NULL) {
     p->size_parts_in = proxy_buffinit;
-    if ((p->parts_in = (struct part *)malloc(
-             sizeof(struct part) *p->size_parts_in)) == NULL ||
+    if ((p->parts_in = (struct part *)malloc(sizeof(struct part) *
+                                             p->size_parts_in)) == NULL ||
         (p->xparts_in = (struct xpart *)malloc(sizeof(struct xpart) *
                                                p->size_parts_in)) == NULL)
       error("Failed to allocate parts_in buffers.");
@@ -404,8 +404,8 @@ void proxy_init(struct proxy *p, int mynodeID, int nodeID) {
   p->nr_parts_in = 0;
   if (p->parts_out == NULL) {
     p->size_parts_out = proxy_buffinit;
-    if ((p->parts_out = (struct part *)malloc(
-             sizeof(struct part) *p->size_parts_out)) == NULL ||
+    if ((p->parts_out = (struct part *)malloc(sizeof(struct part) *
+                                              p->size_parts_out)) == NULL ||
         (p->xparts_out = (struct xpart *)malloc(sizeof(struct xpart) *
                                                 p->size_parts_out)) == NULL)
       error("Failed to allocate parts_out buffers.");
diff --git a/src/queue.c b/src/queue.c
index 6b788d7376ba4bdc95f1b1d918ab52a9514e7b4a..9883d77e66421eda6093331e0c9a8f6ac0155ded 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -34,18 +34,62 @@
 #include "queue.h"
 
 /* Local headers. */
+#include "atomic.h"
 #include "const.h"
 #include "error.h"
 
-/* Counter macros. */
-#ifdef COUNTER
-#define COUNT(c) (__sync_add_and_fetch(&queue_counter[c], 1))
-#else
-#define COUNT(c)
-#endif
+/**
+ * @brief Moves all entries waiting in the incoming DEQ into the queue's heap.
+ *
+ * @param q The #queue, assumed to be locked.
+ */
+void queue_get_incoming(struct queue *q) {
+
+  int *tid = q->tid;
+  struct task *tasks = q->tasks;
+
+  /* Loop over the incoming DEQ. */
+  while (1) {
+
+    /* Is there a next element? A negative entry means the slot is empty. */
+    const int ind = q->first_incoming % queue_incoming_size;
+    if (q->tid_incoming[ind] < 0) break;
+
+    /* Get the next offset off the DEQ, leaving -1 behind in its slot. */
+    const int offset = atomic_swap(&q->tid_incoming[ind], -1);
+    atomic_inc(&q->first_incoming);
+
+    /* Does the queue need to be grown? */
+    if (q->count == q->size) {
+      int *temp;
+      q->size *= queue_sizegrow;
+      if ((temp = (int *)malloc(sizeof(int) * q->size)) == NULL)
+        error("Failed to allocate new indices.");
+      memcpy(temp, tid, sizeof(int) * q->count);
+      free(tid);
+      q->tid = tid = temp;
+    }
+
+    /* Drop the task at the end of the queue. */
+    tid[q->count] = offset;
+    q->count += 1;
+    atomic_dec(&q->count_incoming);
+
+    /* Shuffle up: swap with the parent while this task is heavier. */
+    for (int k = q->count - 1; k > 0; k = (k - 1) / 2)
+      if (tasks[tid[k]].weight > tasks[tid[(k - 1) / 2]].weight) {
+        int temp = tid[k];
+        tid[k] = tid[(k - 1) / 2];
+        tid[(k - 1) / 2] = temp;
+      } else
+        break;
 
-/* The counters. */
-int queue_counter[queue_counter_count];
+    /* Check the queue's consistency (currently disabled). */
+    /* for (int k = 1; k < q->count; k++)
+        if ( tasks[ tid[(k-1)/2] ].weight < tasks[ tid[k] ].weight )
+            error( "Queue heap is disordered." ); */
+  }
+}
 
 /**
  * @brief Insert a used tasks into the given queue.
@@ -53,49 +97,29 @@ int queue_counter[queue_counter_count];
  * @param q The #queue.
  * @param t The #task.
  */
-
 void queue_insert(struct queue *q, struct task *t) {
+  /* Get an index in the DEQ, wrapping around at queue_incoming_size. */
+  const int ind = atomic_inc(&q->last_incoming) % queue_incoming_size;
 
-  int k, *tid;
-  struct task *tasks;
+  /* Spin until the slot holds -1 and the new offset can be stored. */
+  while (atomic_cas(&q->tid_incoming[ind], -1, t - q->tasks) != -1) {
 
-  /* Lock the queue. */
-  if (lock_lock(&q->lock) != 0) error("Failed to get queue lock.");
+    /* Try to get the queue lock, non-blocking, ensures that at
+       least somebody is working on this queue. */
+    if (lock_trylock(&q->lock) == 0) {
 
-  tid = q->tid;
-  tasks = q->tasks;
+      /* Clean up the incoming DEQ. */
+      queue_get_incoming(q);
 
-  /* Does the queue need to be grown? */
-  if (q->count == q->size) {
-    int *temp;
-    q->size *= queue_sizegrow;
-    if ((temp = (int *)malloc(sizeof(int) * q->size)) == NULL)
-      error("Failed to allocate new indices.");
-    memcpy(temp, tid, sizeof(int) * q->count);
-    free(tid);
-    q->tid = tid = temp;
+      /* Release the queue lock. */
+      if (lock_unlock(&q->lock) != 0) {
+        error("Unlocking the qlock failed.\n");
+      }
+    }
   }
 
-  /* Drop the task at the end of the queue. */
-  tid[q->count] = (t - tasks);
-  q->count += 1;
-
-  /* Shuffle up. */
-  for (k = q->count - 1; k > 0; k = (k - 1) / 2)
-    if (tasks[tid[k]].weight > tasks[tid[(k - 1) / 2]].weight) {
-      int temp = tid[k];
-      tid[k] = tid[(k - 1) / 2];
-      tid[(k - 1) / 2] = temp;
-    } else
-      break;
-
-  /* Check the queue's consistency. */
-  /* for ( k = 1 ; k < q->count ; k++ )
-      if ( tasks[ tid[(k-1)/2] ].weight < tasks[ tid[k] ].weight )
-          error( "Queue heap is disordered." ); */
-
-  /* Unlock the queue. */
-  if (lock_unlock(&q->lock) != 0) error("Failed to unlock queue.");
+  /* Increase the incoming count. */
+  atomic_inc(&q->count_incoming);
 }
 
 /**
@@ -104,7 +128,6 @@ void queue_insert(struct queue *q, struct task *t) {
  * @param q The #queue.
  * @param tasks List of tasks to which the queue indices refer to.
  */
-
 void queue_init(struct queue *q, struct task *tasks) {
 
   /* Allocate the task list if needed. */
@@ -120,6 +143,17 @@ void queue_init(struct queue *q, struct task *tasks) {
 
   /* Init the queue lock. */
   if (lock_init(&q->lock) != 0) error("Failed to init queue lock.");
+
+  /* Init the incoming DEQ. */
+  if ((q->tid_incoming = (int *)malloc(sizeof(int) * queue_incoming_size)) ==
+      NULL)
+    error("Failed to allocate queue incoming buffer.");
+  for (int k = 0; k < queue_incoming_size; k++) {
+    q->tid_incoming[k] = -1;
+  }
+  q->first_incoming = 0;
+  q->last_incoming = 0;
+  q->count_incoming = 0;
 }
 
 /**
@@ -129,11 +163,10 @@ void queue_init(struct queue *q, struct task *tasks) {
  * @param prev The previous #task extracted from this #queue.
  * @param blocking Block until access to the queue is granted.
  */
-
 struct task *queue_gettask(struct queue *q, const struct task *prev,
                            int blocking) {
 
-  lock_type *qlock = &q->lock;
+  swift_lock_type *qlock = &q->lock;
   struct task *res = NULL;
 
   /* Grab the task lock. */
@@ -143,6 +176,9 @@ struct task *queue_gettask(struct queue *q, const struct task *prev,
     if (lock_trylock(qlock) != 0) return NULL;
   }
 
+  /* Fill any tasks from the incoming DEQ. */
+  queue_get_incoming(q);
+
   /* If there are no tasks, leave immediately. */
   if (q->count == 0) {
     lock_unlock_blind(qlock);
diff --git a/src/queue.h b/src/queue.h
index 9ce52ea5404db727f29d0f1cf898f5f5a4f6d935..5878866c890f53f22c3deaac7fe9b6bba75d499e 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -29,6 +29,7 @@
 #define queue_sizeinit 100
 #define queue_sizegrow 2
 #define queue_search_window 8
+#define queue_incoming_size 1024
 
 /* Counters. */
 enum {
@@ -41,7 +42,7 @@ extern int queue_counter[queue_counter_count];
 struct queue {
 
   /* The lock to access this queue. */
-  lock_type lock;
+  swift_lock_type lock;
 
   /* Size, count and next element. */
   int size, count;
@@ -52,6 +53,10 @@ struct queue {
   /* The task indices. */
   int *tid;
 
+  /* DEQ for incoming tasks. */
+  int *tid_incoming;
+  volatile unsigned int first_incoming, last_incoming, count_incoming;
+
 } __attribute__((aligned(64)));
 
 /* Function prototypes. */
diff --git a/src/riemann.h b/src/riemann.h
index ad37490d7249bafe776b58a609064cbd0bc23abe..d647b021167317d14f4cd7316d09c247794f3d23 100644
--- a/src/riemann.h
+++ b/src/riemann.h
@@ -21,11 +21,11 @@
 
 /* gives us const_hydro_gamma and tells us which floating point type to use */
 #include "const.h"
+#include "error.h"
+#include "float.h"
 #include "math.h"
 #include "stdio.h"
-#include "float.h"
 #include "stdlib.h"
-#include "error.h"
 
 #define HLLC_SOLVER
 
diff --git a/src/riemann/riemann_exact.h b/src/riemann/riemann_exact.h
index b768cde5f4f5dfd0463cc8a582a1af0a17607bbe..a2f3c30fb1daf5d53bf35abe4ca7e73eafba6018 100644
--- a/src/riemann/riemann_exact.h
+++ b/src/riemann/riemann_exact.h
@@ -78,9 +78,9 @@ __attribute__((always_inline)) INLINE static GFLOAT riemann_fb(GFLOAT p,
  * @param aL The left sound speed
  * @param aR The right sound speed
  */
-__attribute__((always_inline))
-    INLINE static GFLOAT riemann_f(GFLOAT p, GFLOAT* WL, GFLOAT* WR, GFLOAT vL,
-                                   GFLOAT vR, GFLOAT aL, GFLOAT aR) {
+__attribute__((always_inline)) INLINE static GFLOAT riemann_f(
+    GFLOAT p, GFLOAT* WL, GFLOAT* WR, GFLOAT vL, GFLOAT vR, GFLOAT aL,
+    GFLOAT aR) {
 
   return riemann_fb(p, WL, aL) + riemann_fb(p, WR, aR) + (vR - vL);
 }
diff --git a/src/riemann/riemann_hllc.h b/src/riemann/riemann_hllc.h
index 3fcc8b534ce65d4d364c400dc43e1992f39264b9..6c583f6410f53ed64d630082926d816129768fab 100644
--- a/src/riemann/riemann_hllc.h
+++ b/src/riemann/riemann_hllc.h
@@ -63,13 +63,15 @@ __attribute__((always_inline)) INLINE static void riemann_solve_for_flux(
      all these speeds are along the interface normal, since uL and uR are */
   qL = 1.;
   if (pstar > WL[4]) {
-    qL = sqrtf(1. + 0.5 * (const_hydro_gamma + 1.) / const_hydro_gamma *
-                        (pstar / WL[4] - 1.));
+    qL = sqrtf(1. +
+               0.5 * (const_hydro_gamma + 1.) / const_hydro_gamma *
+                   (pstar / WL[4] - 1.));
   }
   qR = 1.;
   if (pstar > WR[4]) {
-    qR = sqrtf(1. + 0.5 * (const_hydro_gamma + 1.) / const_hydro_gamma *
-                        (pstar / WR[4] - 1.));
+    qR = sqrtf(1. +
+               0.5 * (const_hydro_gamma + 1.) / const_hydro_gamma *
+                   (pstar / WR[4] - 1.));
   }
   SL = uL - aL * qL;
   SR = uR + aR * qR;
diff --git a/src/riemann/riemann_trrs.h b/src/riemann/riemann_trrs.h
index 56f7feadae6ea89cec742e298a269e787d58e7b3..efdbfb59877c09a59d535a4785ad74620c0f3651 100644
--- a/src/riemann/riemann_trrs.h
+++ b/src/riemann/riemann_trrs.h
@@ -40,9 +40,9 @@
  * By assuming 2 rarefaction waves, we can analytically solve for the pressure
  * and velocity in the intermediate region, eliminating the iterative procedure.
  *
- * According to Toro: "The two-rarefaction approximation is generally quite
+ * According to Toro: 'The two-rarefaction approximation is generally quite
  * robust; (...) The TRRS is in fact exact when both non-linear waves are
- * actually rarefaction waves."
+ * actually rarefaction waves.'
  *
  * @param WL The left state vector
  * @param WR The right state vector
diff --git a/src/runner.c b/src/runner.c
index 2e84d8db16098d4979e22644b18d6eeaf2c29091..c28b62edf17f07ac9a68bc3c1485d6fd8b8ba445 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1,6 +1,10 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -36,33 +40,39 @@
 /* Local headers. */
 #include "approx_math.h"
 #include "atomic.h"
+#include "cell.h"
 #include "const.h"
 #include "debug.h"
+#include "drift.h"
 #include "engine.h"
 #include "error.h"
 #include "gravity.h"
 #include "hydro.h"
+#include "hydro_properties.h"
+#include "kick.h"
 #include "minmax.h"
 #include "scheduler.h"
 #include "space.h"
 #include "task.h"
 #include "timers.h"
+#include "timestep.h"
 
 /* Orientation of the cell pairs */
-const float runner_shift[13 * 3] = {
-    5.773502691896258e-01, 5.773502691896258e-01,  5.773502691896258e-01,
-    7.071067811865475e-01, 7.071067811865475e-01,  0.0,
-    5.773502691896258e-01, 5.773502691896258e-01,  -5.773502691896258e-01,
-    7.071067811865475e-01, 0.0,                    7.071067811865475e-01,
-    1.0,                   0.0,                    0.0,
-    7.071067811865475e-01, 0.0,                    -7.071067811865475e-01,
-    5.773502691896258e-01, -5.773502691896258e-01, 5.773502691896258e-01,
-    7.071067811865475e-01, -7.071067811865475e-01, 0.0,
-    5.773502691896258e-01, -5.773502691896258e-01, -5.773502691896258e-01,
-    0.0,                   7.071067811865475e-01,  7.071067811865475e-01,
-    0.0,                   1.0,                    0.0,
-    0.0,                   7.071067811865475e-01,  -7.071067811865475e-01,
-    0.0,                   0.0,                    1.0, };
+const double runner_shift[13][3] = { /* 13 unit vectors, one per row */
+    {5.773502691896258e-01, 5.773502691896258e-01, 5.773502691896258e-01},
+    {7.071067811865475e-01, 7.071067811865475e-01, 0.0},
+    {5.773502691896258e-01, 5.773502691896258e-01, -5.773502691896258e-01},
+    {7.071067811865475e-01, 0.0, 7.071067811865475e-01},
+    {1.0, 0.0, 0.0},
+    {7.071067811865475e-01, 0.0, -7.071067811865475e-01},
+    {5.773502691896258e-01, -5.773502691896258e-01, 5.773502691896258e-01},
+    {7.071067811865475e-01, -7.071067811865475e-01, 0.0},
+    {5.773502691896258e-01, -5.773502691896258e-01, -5.773502691896258e-01},
+    {0.0, 7.071067811865475e-01, 7.071067811865475e-01},
+    {0.0, 1.0, 0.0},
+    {0.0, 7.071067811865475e-01, -7.071067811865475e-01},
+    {0.0, 0.0, 1.0},
+}; /* 0.5773... = 1/sqrt(3), 0.7071... = 1/sqrt(2) */
 
 /* Does the axis need flipping ? */
 const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
@@ -77,8 +87,48 @@ const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
 #define FUNCTION force
 #include "runner_doiact.h"
 
-/* Import the gravity loop functions. */
-#include "runner_doiact_grav.h"
+/**
+ * @brief Applies the external potential to the g-particles of a cell whose
+ * ti_end is not after the current time, recursing into split cells.
+ *
+ * @param r The runner thread.
+ * @param c The cell.
+ * @param timer 1 if the time is to be recorded.
+ */
+void runner_do_grav_external(struct runner *r, struct cell *c, int timer) {
+
+  const struct phys_const *constants = r->e->physical_constants;
+  const struct external_potential *potential = r->e->external_potential;
+  const int ti_current = r->e->ti_current;
+  const int gcount = c->gcount;
+  struct gpart *restrict gparts = c->gparts;
+
+  TIMER_TIC;
+
+  /* Split cells only forward the work to their existing progeny. */
+  if (c->split) {
+    for (int j = 0; j < 8; j++) {
+      struct cell *const child = c->progeny[j];
+      if (child == NULL) continue;
+      runner_do_grav_external(r, child, 0);
+    }
+    return;
+  }
+
+#ifdef TASK_VERBOSE
+  OUT;
+#endif
+
+  /* Visit every g-particle of this cell. */
+  for (int i = 0; i < gcount; i++) {
+    struct gpart *const g = &gparts[i];
+
+    /* Particles whose time step has not ended yet are left untouched. */
+    if (g->ti_end > ti_current) continue;
+    external_gravity(potential, constants, g);
+  }
+  if (timer) TIMER_TOC(timer_dograv_external);
+}
 
 /**
  * @brief Sort the entries in ascending order using QuickSort.
@@ -86,8 +136,7 @@ const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  * @param sort The entries
  * @param N The number of entries.
  */
-
-void runner_dosort_ascending(struct entry *sort, int N) {
+void runner_do_sort_ascending(struct entry *sort, int N) {
 
   struct {
     short int lo, hi;
@@ -168,8 +217,7 @@ void runner_dosort_ascending(struct entry *sort, int N) {
  * @param clock Flag indicating whether to record the timing or not, needed
  *      for recursive calls.
  */
-
-void runner_dosort(struct runner *r, struct cell *c, int flags, int clock) {
+void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
 
   struct entry *finger;
   struct entry *fingers[8];
@@ -177,8 +225,8 @@ void runner_dosort(struct runner *r, struct cell *c, int flags, int clock) {
   struct entry *sort;
   int j, k, count = c->count;
   int i, ind, off[8], inds[8], temp_i, missing;
-  // float shift[3];
-  float buff[8], px[3];
+  float buff[8];
+  double px[3];
 
   TIMER_TIC
 
@@ -203,7 +251,7 @@ void runner_dosort(struct runner *r, struct cell *c, int flags, int clock) {
     for (k = 0; k < 8; k++) {
       if (c->progeny[k] == NULL) continue;
       missing = flags & ~c->progeny[k]->sorted;
-      if (missing) runner_dosort(r, c->progeny[k], missing, 0);
+      if (missing) runner_do_sort(r, c->progeny[k], missing, 0);
     }
 
     /* Loop over the 13 different sort arrays. */
@@ -282,9 +330,9 @@ void runner_dosort(struct runner *r, struct cell *c, int flags, int clock) {
       for (j = 0; j < 13; j++)
         if (flags & (1 << j)) {
           sort[j * (count + 1) + k].i = k;
-          sort[j * (count + 1) + k].d = px[0] * runner_shift[3 * j + 0] +
-                                        px[1] * runner_shift[3 * j + 1] +
-                                        px[2] * runner_shift[3 * j + 2];
+          sort[j * (count + 1) + k].d = px[0] * runner_shift[j][0] +
+                                        px[1] * runner_shift[j][1] +
+                                        px[2] * runner_shift[j][2];
         }
     }
 
@@ -293,168 +341,23 @@ void runner_dosort(struct runner *r, struct cell *c, int flags, int clock) {
       if (flags & (1 << j)) {
         sort[j * (count + 1) + count].d = FLT_MAX;
         sort[j * (count + 1) + count].i = 0;
-        runner_dosort_ascending(&sort[j * (count + 1)], count);
+        runner_do_sort_ascending(&sort[j * (count + 1)], count);
         c->sorted |= (1 << j);
       }
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify the sorting. */
-  /* for ( j = 0 ; j < 13 ; j++ ) {
-      if ( !( flags & (1 << j) ) )
-          continue;
-      finger = &sort[ j*(count + 1) ];
-      for ( k = 1 ; k < count ; k++ ) {
-          if ( finger[k].d < finger[k-1].d )
-              error( "Sorting failed, ascending array." );
-          if ( finger[k].i >= count )
-              error( "Sorting failed, indices borked." );
-          }
-      } */
-
-  if (clock) TIMER_TOC(timer_dosort);
-}
-
-void runner_dogsort(struct runner *r, struct cell *c, int flags, int clock) {
-
-  struct entry *finger;
-  struct entry *fingers[8];
-  struct gpart *gparts = c->gparts;
-  struct entry *gsort;
-  int j, k, count = c->gcount;
-  int i, ind, off[8], inds[8], temp_i, missing;
-  // float shift[3];
-  float buff[8], px[3];
-
-  TIMER_TIC
-
-  /* Clean-up the flags, i.e. filter out what's already been sorted. */
-  flags &= ~c->gsorted;
-  if (flags == 0) return;
-
-  /* start by allocating the entry arrays. */
-  if (c->gsort == NULL || c->gsortsize < count) {
-    if (c->gsort != NULL) free(c->gsort);
-    c->gsortsize = count * 1.1;
-    if ((c->gsort = (struct entry *)malloc(sizeof(struct entry) *
-                                           (c->gsortsize + 1) * 13)) == NULL)
-      error("Failed to allocate sort memory.");
-  }
-  gsort = c->gsort;
-
-  /* Does this cell have any progeny? */
-  if (c->split) {
-
-    /* Fill in the gaps within the progeny. */
-    for (k = 0; k < 8; k++) {
-      if (c->progeny[k] == NULL) continue;
-      missing = flags & ~c->progeny[k]->gsorted;
-      if (missing) runner_dogsort(r, c->progeny[k], missing, 0);
+  for (j = 0; j < 13; j++) {
+    if (!(flags & (1 << j))) continue;
+    finger = &sort[j * (count + 1)];
+    for (k = 1; k < count; k++) {
+      if (finger[k].d < finger[k - 1].d)
+        error("Sorting failed, ascending array.");
+      if (finger[k].i >= count) error("Sorting failed, indices borked.");
     }
-
-    /* Loop over the 13 different sort arrays. */
-    for (j = 0; j < 13; j++) {
-
-      /* Has this sort array been flagged? */
-      if (!(flags & (1 << j))) continue;
-
-      /* Init the particle index offsets. */
-      for (off[0] = 0, k = 1; k < 8; k++)
-        if (c->progeny[k - 1] != NULL)
-          off[k] = off[k - 1] + c->progeny[k - 1]->gcount;
-        else
-          off[k] = off[k - 1];
-
-      /* Init the entries and indices. */
-      for (k = 0; k < 8; k++) {
-        inds[k] = k;
-        if (c->progeny[k] != NULL && c->progeny[k]->gcount > 0) {
-          fingers[k] = &c->progeny[k]->gsort[j * (c->progeny[k]->gcount + 1)];
-          buff[k] = fingers[k]->d;
-          off[k] = off[k];
-        } else
-          buff[k] = FLT_MAX;
-      }
-
-      /* Sort the buffer. */
-      for (i = 0; i < 7; i++)
-        for (k = i + 1; k < 8; k++)
-          if (buff[inds[k]] < buff[inds[i]]) {
-            temp_i = inds[i];
-            inds[i] = inds[k];
-            inds[k] = temp_i;
-          }
-
-      /* For each entry in the new sort list. */
-      finger = &gsort[j * (count + 1)];
-      for (ind = 0; ind < count; ind++) {
-
-        /* Copy the minimum into the new sort array. */
-        finger[ind].d = buff[inds[0]];
-        finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
-
-        /* Update the buffer. */
-        fingers[inds[0]] += 1;
-        buff[inds[0]] = fingers[inds[0]]->d;
-
-        /* Find the smallest entry. */
-        for (k = 1; k < 8 && buff[inds[k]] < buff[inds[k - 1]]; k++) {
-          temp_i = inds[k - 1];
-          inds[k - 1] = inds[k];
-          inds[k] = temp_i;
-        }
-
-      } /* Merge. */
-
-      /* Add a sentinel. */
-      gsort[j * (count + 1) + count].d = FLT_MAX;
-      gsort[j * (count + 1) + count].i = 0;
-
-      /* Mark as sorted. */
-      c->gsorted |= (1 << j);
-
-    } /* loop over sort arrays. */
-
-  } /* progeny? */
-
-  /* Otherwise, just sort. */
-  else {
-
-    /* Fill the sort array. */
-    for (k = 0; k < count; k++) {
-      px[0] = gparts[k].x[0];
-      px[1] = gparts[k].x[1];
-      px[2] = gparts[k].x[2];
-      for (j = 0; j < 13; j++)
-        if (flags & (1 << j)) {
-          gsort[j * (count + 1) + k].i = k;
-          gsort[j * (count + 1) + k].d = px[0] * runner_shift[3 * j + 0] +
-                                         px[1] * runner_shift[3 * j + 1] +
-                                         px[2] * runner_shift[3 * j + 2];
-        }
-    }
-
-    /* Add the sentinel and sort. */
-    for (j = 0; j < 13; j++)
-      if (flags & (1 << j)) {
-        gsort[j * (count + 1) + count].d = FLT_MAX;
-        gsort[j * (count + 1) + count].i = 0;
-        runner_dosort_ascending(&gsort[j * (count + 1)], count);
-        c->gsorted |= (1 << j);
-      }
   }
-
-  /* Verify the sorting. */
-  /* for ( j = 0 ; j < 13 ; j++ ) {
-      if ( !( flags & (1 << j) ) )
-          continue;
-      finger = &c->gsort[ j*(count + 1) ];
-      for ( k = 1 ; k < count ; k++ ) {
-          if ( finger[k].d < finger[k-1].d )
-              error( "Sorting failed, ascending array." );
-          if ( finger[k].i < 0 || finger[k].i >= count )
-              error( "Sorting failed, indices borked." );
-          }
-      } */
+#endif
 
   if (clock) TIMER_TOC(timer_dosort);
 }
@@ -466,8 +369,7 @@ void runner_dogsort(struct runner *r, struct cell *c, int flags, int clock) {
  * @param c The cell.
  * @param timer 1 if the time is to be recorded.
  */
-
-void runner_doinit(struct runner *r, struct cell *c, int timer) {
+void runner_do_init(struct runner *r, struct cell *c, int timer) {
 
   struct part *const parts = c->parts;
   struct gpart *const gparts = c->gparts;
@@ -480,7 +382,7 @@ void runner_doinit(struct runner *r, struct cell *c, int timer) {
   /* Recurse? */
   if (c->split) {
     for (int k = 0; k < 8; k++)
-      if (c->progeny[k] != NULL) runner_doinit(r, c->progeny[k], 0);
+      if (c->progeny[k] != NULL) runner_do_init(r, c->progeny[k], 0);
     return;
   } else {
 
@@ -520,8 +422,7 @@ void runner_doinit(struct runner *r, struct cell *c, int timer) {
  * @param r The runner thread.
  * @param c The cell.
  */
-
-void runner_doghost(struct runner *r, struct cell *c) {
+void runner_do_ghost(struct runner *r, struct cell *c) {
 
   struct part *p, *parts = c->parts;
   struct xpart *xp, *xparts = c->xparts;
@@ -531,13 +432,20 @@ void runner_doghost(struct runner *r, struct cell *c) {
   float h_corr;
   const int ti_current = r->e->ti_current;
   const double timeBase = r->e->timeBase;
+  const float target_wcount = r->e->hydro_properties->target_neighbours;
+  const float max_wcount =
+      target_wcount + r->e->hydro_properties->delta_neighbours;
+  const float min_wcount =
+      target_wcount - r->e->hydro_properties->delta_neighbours;
+  const int max_smoothing_iter =
+      r->e->hydro_properties->max_smoothing_iterations;
 
   TIMER_TIC;
 
   /* Recurse? */
   if (c->split) {
     for (int k = 0; k < 8; k++)
-      if (c->progeny[k] != NULL) runner_doghost(r, c->progeny[k]);
+      if (c->progeny[k] != NULL) runner_do_ghost(r, c->progeny[k]);
     return;
   }
 
@@ -547,7 +455,7 @@ void runner_doghost(struct runner *r, struct cell *c) {
   for (int k = 0; k < count; k++) pid[k] = k;
 
   /* While there are particles that need to be updated... */
-  for (int num_reruns = 0; count > 0 && num_reruns < const_smoothing_max_iter;
+  for (int num_reruns = 0; count > 0 && num_reruns < max_smoothing_iter;
        num_reruns++) {
 
     /* Reset the redo-count. */
@@ -571,7 +479,7 @@ void runner_doghost(struct runner *r, struct cell *c) {
 
         /* Otherwise, compute the smoothing length update (Newton step). */
         else {
-          h_corr = (kernel_nwneigh - p->density.wcount) / p->density.wcount_dh;
+          h_corr = (target_wcount - p->density.wcount) / p->density.wcount_dh;
 
           /* Truncate to the range [ -p->h/2 , p->h ]. */
           h_corr = fminf(h_corr, p->h);
@@ -579,8 +487,7 @@ void runner_doghost(struct runner *r, struct cell *c) {
         }
 
         /* Did we get the right number density? */
-        if (p->density.wcount > kernel_nwneigh + const_delta_nwneigh ||
-            p->density.wcount < kernel_nwneigh - const_delta_nwneigh) {
+        if (p->density.wcount > max_wcount || p->density.wcount < min_wcount) {
 
           /* Ok, correct then */
           p->h += h_corr;
@@ -641,8 +548,13 @@ void runner_doghost(struct runner *r, struct cell *c) {
 
           }
 
-          /* Otherwise, sub interaction? */
-          else if (l->t->type == task_type_sub) {
+          /* Otherwise, sub-self interaction? */
+          else if (l->t->type == task_type_sub_self)
+            runner_dosub_subset_density(r, finger, parts, pid, count, NULL, -1,
+                                        1);
+
+          /* Otherwise, sub-pair interaction? */
+          else if (l->t->type == task_type_sub_pair) {
 
             /* Left or right? */
             if (l->t->ci == finger)
@@ -660,7 +572,7 @@ void runner_doghost(struct runner *r, struct cell *c) {
   if (count)
     message("Smoothing length failed to converge on %i particles.", count);
 
-  TIMER_TOC(timer_doghost);
+  TIMER_TOC(timer_do_ghost);
 }
 
 /**
@@ -670,7 +582,7 @@ void runner_doghost(struct runner *r, struct cell *c) {
  * @param c The cell.
  * @param timer Are we timing this ?
  */
-void runner_dodrift(struct runner *r, struct cell *c, int timer) {
+void runner_do_drift(struct runner *r, struct cell *c, int timer) {
 
   const double timeBase = r->e->timeBase;
   const double dt = (r->e->ti_current - r->e->ti_old) * timeBase;
@@ -681,8 +593,16 @@ void runner_dodrift(struct runner *r, struct cell *c, int timer) {
   struct gpart *const gparts = c->gparts;
   float dx_max = 0.f, dx2_max = 0.f, h_max = 0.f;
 
+  double e_kin = 0.0, e_int = 0.0, e_pot = 0.0, mass = 0.0;
+  double mom[3] = {0.0, 0.0, 0.0};
+  double ang_mom[3] = {0.0, 0.0, 0.0};
+
   TIMER_TIC
 
+#ifdef TASK_VERBOSE
+  OUT;
+#endif
+
   /* No children? */
   if (!c->split) {
 
@@ -694,9 +614,13 @@ void runner_dodrift(struct runner *r, struct cell *c, int timer) {
       struct gpart *const gp = &gparts[k];
 
       /* Drift... */
-      gp->x[0] += gp->v_full[0] * dt;
-      gp->x[1] += gp->v_full[1] * dt;
-      gp->x[2] += gp->v_full[2] * dt;
+      drift_gpart(gp, dt, timeBase, ti_old, ti_current);
+
+      /* Compute (square of) motion since last cell construction */
+      const float dx2 = gp->x_diff[0] * gp->x_diff[0] +
+                        gp->x_diff[1] * gp->x_diff[1] +
+                        gp->x_diff[2] * gp->x_diff[2];
+      dx2_max = fmaxf(dx2_max, dx2);
     }
 
     /* Loop over all the particles in the cell (more work for these !) */
@@ -707,44 +631,45 @@ void runner_dodrift(struct runner *r, struct cell *c, int timer) {
       struct part *const p = &parts[k];
       struct xpart *const xp = &xparts[k];
 
-      /* Useful quantity */
-      const float h_inv = 1.0f / p->h;
-
       /* Drift... */
-      p->x[0] += xp->v_full[0] * dt;
-      p->x[1] += xp->v_full[1] * dt;
-      p->x[2] += xp->v_full[2] * dt;
-
-      /* Predict velocities (for hydro terms) */
-      p->v[0] += p->a_hydro[0] * dt;
-      p->v[1] += p->a_hydro[1] * dt;
-      p->v[2] += p->a_hydro[2] * dt;
-
-      /* Predict smoothing length */
-      const float w1 = p->h_dt * h_inv * dt;
-      if (fabsf(w1) < 0.2f)
-        p->h *= approx_expf(w1); /* 4th order expansion of exp(w) */
-      else
-        p->h *= expf(w1);
-
-      /* Predict density */
-      const float w2 = -3.0f * p->h_dt * h_inv * dt;
-      if (fabsf(w2) < 0.2f)
-        p->rho *= approx_expf(w2); /* 4th order expansion of exp(w) */
-      else
-        p->rho *= expf(w2);
-
-      /* Predict the values of the extra fields */
-      hydro_predict_extra(p, xp, ti_old, ti_current, timeBase);
+      drift_part(p, xp, dt, timeBase, ti_old, ti_current);
 
       /* Compute (square of) motion since last cell construction */
-      const float dx2 = (p->x[0] - xp->x_old[0]) * (p->x[0] - xp->x_old[0]) +
-                        (p->x[1] - xp->x_old[1]) * (p->x[1] - xp->x_old[1]) +
-                        (p->x[2] - xp->x_old[2]) * (p->x[2] - xp->x_old[2]);
+      const float dx2 = xp->x_diff[0] * xp->x_diff[0] +
+                        xp->x_diff[1] * xp->x_diff[1] +
+                        xp->x_diff[2] * xp->x_diff[2];
       dx2_max = fmaxf(dx2_max, dx2);
 
       /* Maximal smoothing length */
       h_max = fmaxf(p->h, h_max);
+
+      /* Now collect quantities for statistics */
+
+      const float half_dt =
+          (ti_current - (p->ti_begin + p->ti_end) / 2) * timeBase;
+      const double x[3] = {p->x[0], p->x[1], p->x[2]};
+      const float v[3] = {xp->v_full[0] + p->a_hydro[0] * half_dt,
+                          xp->v_full[1] + p->a_hydro[1] * half_dt,
+                          xp->v_full[2] + p->a_hydro[2] * half_dt};
+      const float m = p->mass;
+
+      /* Collect mass */
+      mass += m;
+
+      /* Collect momentum */
+      mom[0] += m * v[0];
+      mom[1] += m * v[1];
+      mom[2] += m * v[2];
+
+      /* Collect angular momentum */
+      ang_mom[0] += m * (x[1] * v[2] - x[2] * v[1]);
+      ang_mom[1] += m * (x[2] * v[0] - x[0] * v[2]);
+      ang_mom[2] += m * (x[0] * v[1] - x[1] * v[0]);
+
+      /* Collect energies. */
+      e_kin += 0.5 * m * (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+      e_pot += 0.;
+      e_int += m * hydro_get_internal_energy(p, half_dt);
     }
 
     /* Now, get the maximal particle motion from its square */
@@ -757,266 +682,248 @@ void runner_dodrift(struct runner *r, struct cell *c, int timer) {
     /* Loop over the progeny. */
     for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL) {
+
+        /* Recurse */
         struct cell *cp = c->progeny[k];
-        runner_dodrift(r, cp, 0);
+        runner_do_drift(r, cp, 0);
 
+        /* Collect */
         dx_max = fmaxf(dx_max, cp->dx_max);
         h_max = fmaxf(h_max, cp->h_max);
+        mass += cp->mass;
+        e_kin += cp->e_kin;
+        e_int += cp->e_int;
+        e_pot += cp->e_pot;
+        mom[0] += cp->mom[0];
+        mom[1] += cp->mom[1];
+        mom[2] += cp->mom[2];
+        ang_mom[0] += cp->ang_mom[0];
+        ang_mom[1] += cp->ang_mom[1];
+        ang_mom[2] += cp->ang_mom[2];
       }
   }
 
   /* Store the values */
   c->h_max = h_max;
   c->dx_max = dx_max;
+  c->mass = mass;
+  c->e_kin = e_kin;
+  c->e_int = e_int;
+  c->e_pot = e_pot;
+  c->mom[0] = mom[0];
+  c->mom[1] = mom[1];
+  c->mom[2] = mom[2];
+  c->ang_mom[0] = ang_mom[0];
+  c->ang_mom[1] = ang_mom[1];
+  c->ang_mom[2] = ang_mom[2];
 
   if (timer) TIMER_TOC(timer_drift);
 }
 
 /**
- * @brief Combined second and first kick for fixed dt.
+ * @brief Kick the particles of a cell forward with the global fixed time-step
+ * (dt_max) and collect the update counts and the ti_end bounds of the cell.
  *
  * @param r The runner thread.
  * @param c The cell.
- * @param timer The timer
+ * @param timer If non-zero, TIMER_TOC(timer_kick) is invoked on exit.
  */
+void runner_do_kick_fixdt(struct runner *r, struct cell *c, int timer) {
 
-void runner_dokick(struct runner *r, struct cell *c, int timer) {
-
-  const float global_dt_min = r->e->dt_min;
-  const float global_dt_max = r->e->dt_max;
-  const int ti_current = r->e->ti_current;
+  const double global_dt = r->e->dt_max;
   const double timeBase = r->e->timeBase;
-  const double timeBase_inv = 1.0 / r->e->timeBase;
   const int count = c->count;
   const int gcount = c->gcount;
   struct part *const parts = c->parts;
   struct xpart *const xparts = c->xparts;
   struct gpart *const gparts = c->gparts;
-  const int is_fixdt =
-      (r->e->policy & engine_policy_fixdt) == engine_policy_fixdt;
 
   int updated = 0, g_updated = 0;
   int ti_end_min = max_nr_timesteps, ti_end_max = 0;
-  double e_kin = 0.0, e_int = 0.0, e_pot = 0.0, mass = 0.0;
-  float mom[3] = {0.0f, 0.0f, 0.0f};
-  float ang[3] = {0.0f, 0.0f, 0.0f};
 
   TIMER_TIC
 
+#ifdef TASK_VERBOSE
+  OUT;
+#endif
+
+  /* The new time-step, in units of timeBase (dt_max / timeBase, truncated) */
+  const int new_dti = global_dt / timeBase;
+
   /* No children? */
   if (!c->split) {
 
-    /* Loop over the g-particles and kick the active ones. */
+    /* Loop over the g-particles and kick everyone. */
     for (int k = 0; k < gcount; k++) {
 
       /* Get a handle on the part. */
       struct gpart *const gp = &gparts[k];
 
-      /* If the g-particle has no counterpart and needs to be kicked */
-      if (gp->id_or_neg_offset > 0 && (is_fixdt || gp->ti_end <= ti_current)) {
+      /* If the g-particle has no counterpart */
+      if (gp->id_or_neg_offset > 0) {
 
         /* First, finish the force calculation */
         gravity_end_force(gp);
 
-        /* Now we are ready to compute the next time-step size */
-        int new_dti;
-
-        if (is_fixdt) {
-
-          /* Now we have a time step, proceed with the kick */
-          new_dti = global_dt_max * timeBase_inv;
-
-        } else {
-
-          /* Compute the next timestep (gravity condition) */
-          float new_dt = gravity_compute_timestep(gp);
-
-          /* Limit timestep within the allowed range */
-          new_dt = fminf(new_dt, global_dt_max);
-          new_dt = fmaxf(new_dt, global_dt_min);
-
-          /* Convert to integer time */
-          new_dti = new_dt * timeBase_inv;
-
-          /* Recover the current timestep */
-          const int current_dti = gp->ti_end - gp->ti_begin;
-
-          /* Limit timestep increase */
-          if (current_dti > 0) new_dti = min(new_dti, 2 * current_dti);
-
-          /* Put this timestep on the time line */
-          int dti_timeline = max_nr_timesteps;
-          while (new_dti < dti_timeline) dti_timeline /= 2;
-
-          /* Now we have a time step, proceed with the kick */
-          new_dti = dti_timeline;
-        }
-
-        /* Compute the time step for this kick */
-        const int ti_start = (gp->ti_begin + gp->ti_end) / 2;
-        const int ti_end = gp->ti_end + new_dti / 2;
-        const double dt = (ti_end - ti_start) * timeBase;
-        const double half_dt = (ti_end - gp->ti_end) * timeBase;
-
-        /* Move particle forward in time */
-        gp->ti_begin = gp->ti_end;
-        gp->ti_end = gp->ti_begin + new_dti;
-
-        /* Kick particles in momentum space */
-        gp->v_full[0] += gp->a_grav[0] * dt;
-        gp->v_full[1] += gp->a_grav[1] * dt;
-        gp->v_full[2] += gp->a_grav[2] * dt;
-
-        /* Extra kick work */
-        gravity_kick_extra(gp, dt, half_dt);
+        /* Kick the g-particle forward */
+        kick_gpart(gp, new_dti, timeBase);
 
         /* Number of updated g-particles */
         g_updated++;
-      }
 
-      /* Minimal time for next end of time-step */
-      ti_end_min = min(gp->ti_end, ti_end_min);
-      ti_end_max = max(gp->ti_end, ti_end_max);
+        /* Track the earliest and latest end of time-step seen so far */
+        ti_end_min = min(gp->ti_end, ti_end_min);
+        ti_end_max = max(gp->ti_end, ti_end_max);
+      }
     }
 
     /* Now do the hydro ones... */
 
-    /* Loop over the particles and kick the active ones. */
+    /* Loop over the particles and kick everyone. */
     for (int k = 0; k < count; k++) {
 
       /* Get a handle on the part. */
       struct part *const p = &parts[k];
       struct xpart *const xp = &xparts[k];
 
-      /* If particle needs to be kicked */
-      if (is_fixdt || p->ti_end <= ti_current) {
+      /* First, finish the force loop */
+      p->h_dt *= p->h * 0.333333333f;
 
-        /* First, finish the force loop */
-        p->h_dt *= p->h * 0.333333333f;
+      /* Finish the force calculation for hydro and any linked g-particle */
+      hydro_end_force(p);
+      if (p->gpart != NULL) gravity_end_force(p->gpart);
 
-        /* And do the same of the extra variable */
-        hydro_end_force(p);
-        if (p->gpart != NULL) gravity_end_force(p->gpart);
+      /* Kick the particle forward */
+      kick_part(p, xp, new_dti, timeBase);
 
-        /* Now we are ready to compute the next time-step size */
-        int new_dti;
+      /* Number of updated particles */
+      updated++;
+      if (p->gpart != NULL) g_updated++;
 
-        if (is_fixdt) {
+      /* Track the earliest and latest end of time-step seen so far */
+      ti_end_min = min(p->ti_end, ti_end_min);
+      ti_end_max = max(p->ti_end, ti_end_max);
+    }
+  }
 
-          /* Now we have a time step, proceed with the kick */
-          new_dti = global_dt_max * timeBase_inv;
+  /* Otherwise, aggregate data from children. */
+  else {
 
-        } else {
+    /* Loop over the progeny. */
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k] != NULL) {
+        struct cell *const cp = c->progeny[k];
 
-          /* Compute the next timestep (hydro condition) */
-          const float new_dt_hydro = hydro_compute_timestep(p, xp);
+        /* Recurse */
+        runner_do_kick_fixdt(r, cp, 0);
 
-          /* Compute the next timestep (gravity condition) */
-          float new_dt_grav = FLT_MAX;
-          if (p->gpart != NULL)
-            new_dt_grav = gravity_compute_timestep(p->gpart);
+        /* And aggregate */
+        updated += cp->updated;
+        g_updated += cp->g_updated;
+        ti_end_min = min(cp->ti_end_min, ti_end_min);
+        ti_end_max = max(cp->ti_end_max, ti_end_max);
+      }
+  }
 
-          float new_dt = fminf(new_dt_hydro, new_dt_grav);
+  /* Store the values. */
+  c->updated = updated;
+  c->g_updated = g_updated;
+  c->ti_end_min = ti_end_min;
+  c->ti_end_max = ti_end_max;
 
-          /* Limit change in h */
-          const float dt_h_change =
-              (p->h_dt != 0.0f) ? fabsf(const_ln_max_h_change * p->h / p->h_dt)
-                                : FLT_MAX;
+  if (timer) TIMER_TOC(timer_kick);
+}
 
-          new_dt = fminf(new_dt, dt_h_change);
+/**
+ * @brief Kick particles in momentum space and collect statistics (floating
+ * time-step case)
+ *
+ * @param r The runner thread.
+ * @param c The cell.
+ * @param timer Are we timing this ?
+ */
+void runner_do_kick(struct runner *r, struct cell *c, int timer) {
 
-          /* Limit timestep within the allowed range */
-          new_dt = fminf(new_dt, global_dt_max);
-          new_dt = fmaxf(new_dt, global_dt_min);
+  const struct engine *e = r->e;
+  const double timeBase = e->timeBase;
+  const int ti_current = r->e->ti_current;
+  const int count = c->count;
+  const int gcount = c->gcount;
+  struct part *const parts = c->parts;
+  struct xpart *const xparts = c->xparts;
+  struct gpart *const gparts = c->gparts;
 
-          /* Convert to integer time */
-          new_dti = new_dt * timeBase_inv;
+  int updated = 0, g_updated = 0;
+  int ti_end_min = max_nr_timesteps, ti_end_max = 0;
 
-          /* Recover the current timestep */
-          const int current_dti = p->ti_end - p->ti_begin;
+  TIMER_TIC
 
-          /* Limit timestep increase */
-          if (current_dti > 0) new_dti = min(new_dti, 2 * current_dti);
+#ifdef TASK_VERBOSE
+  OUT;
+#endif
 
-          /* Put this timestep on the time line */
-          int dti_timeline = max_nr_timesteps;
-          while (new_dti < dti_timeline) dti_timeline /= 2;
+  /* No children? */
+  if (!c->split) {
 
-          /* Now we have a time step, proceed with the kick */
-          new_dti = dti_timeline;
-        }
+    /* Loop over the g-particles and kick the active ones. */
+    for (int k = 0; k < gcount; k++) {
 
-        /* Compute the time step for this kick */
-        const int ti_start = (p->ti_begin + p->ti_end) / 2;
-        const int ti_end = p->ti_end + new_dti / 2;
-        const double dt = (ti_end - ti_start) * timeBase;
-        const double half_dt = (ti_end - p->ti_end) * timeBase;
-
-        /* Move particle forward in time */
-        p->ti_begin = p->ti_end;
-        p->ti_end = p->ti_begin + new_dti;
-        if (p->gpart != NULL) {
-          p->gpart->ti_begin = p->ti_begin;
-          p->gpart->ti_end = p->ti_end;
-        }
+      /* Get a handle on the part. */
+      struct gpart *const gp = &gparts[k];
 
-        /* Get the acceleration */
-        float a_tot[3] = {p->a_hydro[0], p->a_hydro[1], p->a_hydro[2]};
-        if (p->gpart != NULL) {
-          a_tot[0] += p->gpart->a_grav[0];
-          a_tot[1] += p->gpart->a_grav[1];
-          a_tot[1] += p->gpart->a_grav[2];
-        }
+      /* If the g-particle has no counterpart and needs to be kicked */
+      if (gp->id_or_neg_offset > 0) {
 
-        /* Kick particles in momentum space */
-        xp->v_full[0] += a_tot[0] * dt;
-        xp->v_full[1] += a_tot[1] * dt;
-        xp->v_full[2] += a_tot[2] * dt;
+        if (gp->ti_end <= ti_current) {
 
-        if (p->gpart != NULL) {
-          p->gpart->v_full[0] = xp->v_full[0];
-          p->gpart->v_full[1] = xp->v_full[1];
-          p->gpart->v_full[2] = xp->v_full[2];
-        }
+          /* First, finish the force calculation */
+          gravity_end_force(gp);
 
-        /* Go back by half-step for the hydro velocity */
-        p->v[0] = xp->v_full[0] - half_dt * a_tot[0];
-        p->v[1] = xp->v_full[1] - half_dt * a_tot[1];
-        p->v[2] = xp->v_full[2] - half_dt * a_tot[2];
+          /* Compute the next timestep */
+          const int new_dti = get_gpart_timestep(gp, e);
 
-        /* Extra kick work */
-        hydro_kick_extra(p, xp, dt, half_dt);
-        if (p->gpart != NULL) gravity_kick_extra(p->gpart, dt, half_dt);
+          /* Now we have a time step, proceed with the kick */
+          kick_gpart(gp, new_dti, timeBase);
 
-        /* Number of updated particles */
-        updated++;
-        if (p->gpart != NULL) g_updated++;
+          /* Number of updated g-particles */
+          g_updated++;
+        }
+
+        /* Minimal time for next end of time-step */
+        ti_end_min = min(gp->ti_end, ti_end_min);
+        ti_end_max = max(gp->ti_end, ti_end_max);
       }
+    }
 
-      /* Now collect quantities for statistics */
+    /* Now do the hydro ones... */
 
-      const double x[3] = {p->x[0], p->x[1], p->x[2]};
-      const float v_full[3] = {xp->v_full[0], xp->v_full[1], xp->v_full[2]};
-      const float m = p->mass;
+    /* Loop over the particles and kick the active ones. */
+    for (int k = 0; k < count; k++) {
 
-      /* Collect mass */
-      mass += m;
+      /* Get a handle on the part. */
+      struct part *const p = &parts[k];
+      struct xpart *const xp = &xparts[k];
 
-      /* Collect momentum */
-      mom[0] += m * v_full[0];
-      mom[1] += m * v_full[1];
-      mom[2] += m * v_full[2];
+      /* If particle needs to be kicked */
+      if (p->ti_end <= ti_current) {
 
-      /* Collect angular momentum */
-      ang[0] += m * (x[1] * v_full[2] - x[2] * v_full[1]);
-      ang[1] += m * (x[2] * v_full[0] - x[0] * v_full[2]);
-      ang[2] += m * (x[0] * v_full[1] - x[1] * v_full[0]);
+        /* First, finish the force loop */
+        p->h_dt *= p->h * 0.333333333f;
 
-      /* Collect total energy. */
-      e_kin += 0.5 * m * (v_full[0] * v_full[0] + v_full[1] * v_full[1] +
-                          v_full[2] * v_full[2]);
-      e_pot += 0.f; /* No gravitational potential thus far */
-      e_int += hydro_get_internal_energy(p);
+        /* And do the same of the extra variable */
+        hydro_end_force(p);
+        if (p->gpart != NULL) gravity_end_force(p->gpart);
+
+        /* Compute the next timestep (hydro condition) */
+        const int new_dti = get_part_timestep(p, xp, e);
+
+        /* Now we have a time step, proceed with the kick */
+        kick_part(p, xp, new_dti, timeBase);
+
+        /* Number of updated particles */
+        updated++;
+        if (p->gpart != NULL) g_updated++;
+      }
 
       /* Minimal time for next end of time-step */
       ti_end_min = min(p->ti_end, ti_end_min);
@@ -1034,21 +941,11 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
         struct cell *const cp = c->progeny[k];
 
         /* Recurse */
-        runner_dokick(r, cp, 0);
+        runner_do_kick(r, cp, 0);
 
         /* And aggregate */
         updated += cp->updated;
         g_updated += cp->g_updated;
-        e_kin += cp->e_kin;
-        e_int += cp->e_int;
-        e_pot += cp->e_pot;
-        mass += cp->mass;
-        mom[0] += cp->mom[0];
-        mom[1] += cp->mom[1];
-        mom[2] += cp->mom[2];
-        ang[0] += cp->ang[0];
-        ang[1] += cp->ang[1];
-        ang[2] += cp->ang[2];
         ti_end_min = min(cp->ti_end_min, ti_end_min);
         ti_end_max = max(cp->ti_end_max, ti_end_max);
       }
@@ -1057,22 +954,56 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
   /* Store the values. */
   c->updated = updated;
   c->g_updated = g_updated;
-  c->e_kin = e_kin;
-  c->e_int = e_int;
-  c->e_pot = e_pot;
-  c->mass = mass;
-  c->mom[0] = mom[0];
-  c->mom[1] = mom[1];
-  c->mom[2] = mom[2];
-  c->ang[0] = ang[0];
-  c->ang[1] = ang[1];
-  c->ang[2] = ang[2];
   c->ti_end_min = ti_end_min;
   c->ti_end_max = ti_end_max;
 
   if (timer) TIMER_TOC(timer_kick);
 }
 
+/**
+ * @brief Set the cell's ti_end_min, ti_end_max and h_max from its particles.
+ * Run by #runner_main for tasks of type task_type_recv.
+ * @param r The runner thread.
+ * @param c The cell whose parts and gparts are scanned and whose bounds are set.
+ * @param timer If non-zero, TIMER_TOC(timer_dorecv_cell) is invoked on exit.
+ */
+void runner_do_recv_cell(struct runner *r, struct cell *c, int timer) {
+
+  const struct part *const parts = c->parts;
+  const struct gpart *const gparts = c->gparts;
+  const size_t nr_parts = c->count;
+  const size_t nr_gparts = c->gcount;
+  // const int ti_current = r->e->ti_current;
+
+  TIMER_TIC;
+  /* Neutral starting values; they are stored unchanged for an empty cell */
+  int ti_end_min = max_nr_timesteps;
+  int ti_end_max = 0;
+  float h_max = 0.f;
+
+  /* Collect ti_end bounds (parts and gparts) and the largest h (parts)... */
+  for (size_t k = 0; k < nr_parts; k++) {
+    const int ti_end = parts[k].ti_end;
+    // if(ti_end < ti_current) error("Received invalid particle !");
+    ti_end_min = min(ti_end_min, ti_end);
+    ti_end_max = max(ti_end_max, ti_end);
+    h_max = fmaxf(h_max, parts[k].h);
+  }
+  for (size_t k = 0; k < nr_gparts; k++) {
+    const int ti_end = gparts[k].ti_end;
+    // if(ti_end < ti_current) error("Received invalid particle !");
+    ti_end_min = min(ti_end_min, ti_end);
+    ti_end_max = max(ti_end_max, ti_end);
+  }
+
+  /* ... and store the results in the cell. */
+  c->ti_end_min = ti_end_min;
+  c->ti_end_max = ti_end_max;
+  c->h_max = h_max;
+
+  if (timer) TIMER_TOC(timer_dorecv_cell);
+}
+
 /**
  * @brief The #runner main thread routine.
  *
@@ -1114,6 +1045,7 @@ void *runner_main(void *data) {
       struct cell *ci = t->ci;
       struct cell *cj = t->cj;
       t->rid = r->cpuid;
+      t->last_rid = r->cpuid;
 
       /* Different types of tasks... */
       switch (t->type) {
@@ -1134,54 +1066,46 @@ void *runner_main(void *data) {
             error("Unknown task subtype.");
           break;
         case task_type_sort:
-          runner_dosort(r, ci, t->flags, 1);
+          runner_do_sort(r, ci, t->flags, 1);
           break;
-        case task_type_sub:
+        case task_type_sub_self:
           if (t->subtype == task_subtype_density)
-            runner_dosub1_density(r, ci, cj, t->flags, 1);
+            runner_dosub_self1_density(r, ci, 1);
           else if (t->subtype == task_subtype_force)
-            runner_dosub2_force(r, ci, cj, t->flags, 1);
-          else if (t->subtype == task_subtype_grav)
-            runner_dosub_grav(r, ci, cj, 1);
+            runner_dosub_self2_force(r, ci, 1);
+          else
+            error("Unknown task subtype.");
+          break;
+        case task_type_sub_pair:
+          if (t->subtype == task_subtype_density)
+            runner_dosub_pair1_density(r, ci, cj, t->flags, 1);
+          else if (t->subtype == task_subtype_force)
+            runner_dosub_pair2_force(r, ci, cj, t->flags, 1);
           else
             error("Unknown task subtype.");
           break;
         case task_type_init:
-          runner_doinit(r, ci, 1);
+          runner_do_init(r, ci, 1);
           break;
         case task_type_ghost:
-          runner_doghost(r, ci);
+          runner_do_ghost(r, ci);
           break;
         case task_type_drift:
-          runner_dodrift(r, ci, 1);
+          runner_do_drift(r, ci, 1);
           break;
         case task_type_kick:
-          runner_dokick(r, ci, 1);
-          break;
-        case task_type_send:
+          runner_do_kick(r, ci, 1);
           break;
-        case task_type_recv: {
-          struct part *parts = ci->parts;
-          size_t nr_parts = ci->count;
-          ci->ti_end_min = ci->ti_end_max = max_nr_timesteps;
-          for (size_t k = 0; k < nr_parts; k++)
-            parts[k].ti_end = max_nr_timesteps;
+        case task_type_kick_fixdt:
+          runner_do_kick_fixdt(r, ci, 1);
           break;
-        }
-        case task_type_grav_pp:
-          if (t->cj == NULL)
-            runner_doself_grav(r, t->ci);
-          else
-            runner_dopair_grav(r, t->ci, t->cj);
-          break;
-        case task_type_grav_mm:
-          runner_dograv_mm(r, t->ci, t->cj);
+        case task_type_send:
           break;
-        case task_type_grav_up:
-          runner_dograv_up(r, t->ci);
+        case task_type_recv:
+          runner_do_recv_cell(r, ci, 1);
           break;
-        case task_type_grav_down:
-          runner_dograv_down(r, t->ci);
+        case task_type_grav_external:
+          runner_do_grav_external(r, t->ci, 1);
           break;
         case task_type_part_sort:
           space_do_parts_sort();
diff --git a/src/runner.h b/src/runner.h
index ff2f93db6eae9d9e85cd9c3de6f398ed0f64c681..6838b959955c4e54e208b8d2d16339e7fdb1740f 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -1,6 +1,10 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -19,13 +23,12 @@
 #ifndef SWIFT_RUNNER_H
 #define SWIFT_RUNNER_H
 
-/* Includes. */
-#include "cell.h"
-#include "inline.h"
-
-extern const float runner_shift[13 * 3];
+extern const double runner_shift[13][3];
 extern const char runner_flip[27];
 
+struct cell;
+struct engine;
+
 /* A struct representing a runner's thread and its data. */
 struct runner {
 
@@ -43,16 +46,13 @@ struct runner {
 };
 
 /* Function prototypes. */
-void runner_doghost(struct runner *r, struct cell *c);
-void runner_dopair_density(struct runner *r, struct cell *ci, struct cell *cj);
-void runner_doself_density(struct runner *r, struct cell *c);
-void runner_dosub_density(struct runner *r, struct cell *ci, struct cell *cj,
-                          int flags);
-void runner_dosort(struct runner *r, struct cell *c, int flag, int clock);
-void runner_dogsort(struct runner *r, struct cell *c, int flag, int clock);
-void runner_dokick(struct runner *r, struct cell *c, int timer);
-void runner_dodrift(struct runner *r, struct cell *c, int timer);
-void runner_doinit(struct runner *r, struct cell *c, int timer);
+void runner_do_ghost(struct runner *r, struct cell *c);
+void runner_do_sort(struct runner *r, struct cell *c, int flag, int clock);
+void runner_do_gsort(struct runner *r, struct cell *c, int flag, int clock);
+void runner_do_kick(struct runner *r, struct cell *c, int timer);
+void runner_do_kick_fixdt(struct runner *r, struct cell *c, int timer);
+void runner_do_drift(struct runner *r, struct cell *c, int timer);
+void runner_do_init(struct runner *r, struct cell *c, int timer);
 void *runner_main(void *data);
 
 #endif /* SWIFT_RUNNER_H */
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index de339db6133fcc829bdc6ee0ce9e537b68982422..4da83b940d55460c4bf8d2f650534aedf94dbd5d 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -1,7 +1,7 @@
-
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *               2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -18,10 +18,6 @@
  *
  ******************************************************************************/
 
-/* Includes. */
-#include "cell.h"
-#include "part.h"
-
 /* Before including this file, define FUNCTION, which is the
    name of the interaction function. This creates the interaction functions
    runner_dopair_FUNCTION, runner_dopair_FUNCTION_naive, runner_doself_FUNCTION,
@@ -57,11 +53,17 @@
 #define _DOSELF_SUBSET(f) PASTE(runner_doself_subset, f)
 #define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION)
 
-#define _DOSUB1(f) PASTE(runner_dosub1, f)
-#define DOSUB1 _DOSUB1(FUNCTION)
+#define _DOSUB_SELF1(f) PASTE(runner_dosub_self1, f)
+#define DOSUB_SELF1 _DOSUB_SELF1(FUNCTION)
+
+#define _DOSUB_PAIR1(f) PASTE(runner_dosub_pair1, f)
+#define DOSUB_PAIR1 _DOSUB_PAIR1(FUNCTION)
+
+#define _DOSUB_SELF2(f) PASTE(runner_dosub_self2, f)
+#define DOSUB_SELF2 _DOSUB_SELF2(FUNCTION)
 
-#define _DOSUB2(f) PASTE(runner_dosub2, f)
-#define DOSUB2 _DOSUB2(FUNCTION)
+#define _DOSUB_PAIR2(f) PASTE(runner_dosub_pair2, f)
+#define DOSUB_PAIR2 _DOSUB_PAIR2(FUNCTION)
 
 #define _DOSUB_SUBSET(f) PASTE(runner_dosub_subset, f)
 #define DOSUB_SUBSET _DOSUB_SUBSET(FUNCTION)
@@ -72,14 +74,23 @@
 #define _IACT(f) PASTE(runner_iact, f)
 #define IACT _IACT(FUNCTION)
 
+#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f)
+#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION)
+
+#define _IACT_VEC(f) PASTE(runner_iact_vec, f)
+#define IACT_VEC _IACT_VEC(FUNCTION)
+
 #define _TIMER_DOSELF(f) PASTE(timer_doself, f)
 #define TIMER_DOSELF _TIMER_DOSELF(FUNCTION)
 
 #define _TIMER_DOPAIR(f) PASTE(timer_dopair, f)
 #define TIMER_DOPAIR _TIMER_DOPAIR(FUNCTION)
 
-#define _TIMER_DOSUB(f) PASTE(timer_dosub, f)
-#define TIMER_DOSUB _TIMER_DOSUB(FUNCTION)
+#define _TIMER_DOSUB_SELF(f) PASTE(timer_dosub_self, f)
+#define TIMER_DOSUB_SELF _TIMER_DOSUB_SELF(FUNCTION)
+
+#define _TIMER_DOSUB_PAIR(f) PASTE(timer_dosub_pair, f)
+#define TIMER_DOSUB_PAIR _TIMER_DOSUB_PAIR(FUNCTION)
 
 #define _TIMER_DOSELF_SUBSET(f) PASTE(timer_doself_subset, f)
 #define TIMER_DOSELF_SUBSET _TIMER_DOSELF_SUBSET(FUNCTION)
@@ -87,12 +98,6 @@
 #define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset, f)
 #define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION)
 
-#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f)
-#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION)
-
-#define _IACT_VEC(f) PASTE(runner_iact_vec, f)
-#define IACT_VEC _IACT_VEC(FUNCTION)
-
 /**
  * @brief Compute the interactions between a cell pair.
  *
@@ -100,18 +105,14 @@
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
-
 void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
                   struct cell *restrict cj) {
 
-  struct engine *e = r->e;
-  int pid, pjd, k, count_i = ci->count, count_j = cj->count;
-  double shift[3] = {0.0, 0.0, 0.0};
-  struct part *restrict parts_i = ci->parts, *restrict parts_j = cj->parts;
-  struct part *restrict pi, *restrict pj;
-  double pix[3];
-  float dx[3], hi, hig2, r2;
+  const struct engine *e = r->e;
   const int ti_current = e->ti_current;
+
+  error("Don't use in actual runs ! Slow code !");
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -120,44 +121,47 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+
   /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
     if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
       shift[k] = e->s->dim[k];
     else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
       shift[k] = -e->s->dim[k];
   }
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
-
   /* Loop over the parts in ci. */
-  for (pid = 0; pid < count_i; pid++) {
+  for (int pid = 0; pid < count_i; pid++) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts_i[pid];
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    struct part *restrict pi = &parts_i[pid];
+    const float hi = pi->h;
+
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_j; pjd++) {
+    for (int pjd = 0; pjd < count_j; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts_j[pjd];
+      struct part *restrict pj = &parts_j[pjd];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -198,7 +202,7 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -207,12 +211,10 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
 
 void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
 
-  int pid, pjd, k, count = c->count;
-  struct part *restrict parts = c->parts;
-  struct part *restrict pi, *restrict pj;
-  double pix[3] = {0.0, 0.0, 0.0};
-  float dx[3], hi, hig2, r2;
   const int ti_current = r->e->ti_current;
+
+  error("Don't use in actual runs ! Slow code !");
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -221,38 +223,34 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (c->ti_end_min > ti_current) return;
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
+  const int count = c->count;
+  struct part *restrict parts = c->parts;
 
   /* Loop over the parts in ci. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts[pid];
-    pix[0] = pi->x[0];
-    pix[1] = pi->x[1];
-    pix[2] = pi->x[2];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    struct part *restrict pi = &parts[pid];
+    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = pid + 1; pjd < count; pjd++) {
+    for (int pjd = pid + 1; pjd < count; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts[pjd];
+      struct part *restrict pj = &parts[pjd];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -293,7 +291,7 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -311,18 +309,121 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
  * @param count The number of particles in @c ind.
  * @param cj The second #cell.
  */
+void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
+                         struct part *restrict parts_i, int *restrict ind,
+                         int count, struct cell *restrict cj) {
+
+  struct engine *e = r->e;
+
+  error("Don't use in actual runs ! Slow code !");
+
+#ifdef VECTORIZE
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+
+  TIMER_TIC
+
+  const int count_j = cj->count;
+  struct part *restrict parts_j = cj->parts;
+
+  /* Get the relative distance between the pairs, wrapping. */
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
+    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
+      shift[k] = e->s->dim[k];
+    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
+      shift[k] = -e->s->dim[k];
+  }
+
+  /* Loop over the count particles of parts_i selected through ind. */
+  for (int pid = 0; pid < count; pid++) {
+
+    /* Get a hold of the ith part in ci. */
+    struct part *restrict pi = &parts_i[ind[pid]];
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
+
+    /* Loop over the parts in cj. */
+    for (int pjd = 0; pjd < count_j; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      struct part *restrict pj = &parts_j[pjd];
+
+      /* Compute the pairwise distance. */
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 < hig2) {
+
+#ifndef VECTORIZE
+
+        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = dx[0];
+        dxq[3 * icount + 1] = dx[1];
+        dxq[3 * icount + 2] = dx[2];
+        hiq[icount] = hi;
+        hjq[icount] = pj->h;
+        piq[icount] = pi;
+        pjq[icount] = pj;
+        icount += 1;
+
+        /* Flush? */
+        if (icount == VEC_SIZE) {
+          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
+
+#endif
+      }
+
+    } /* loop over the parts in cj. */
+
+  } /* loop over the parts in ci. */
+
+#ifdef VECTORIZE
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (int k = 0; k < icount; k++)
+      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+#endif
+
+  TIMER_TOC(timer_dopair_subset);
+}
 
+/**
+ * @brief Compute the interactions between a cell pair, but only for the
+ *      given indices in ci.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param parts_i The #part to interact with @c cj.
+ * @param ind The list of indices of particles in @c ci to interact with.
+ * @param count The number of particles in @c ind.
+ * @param cj The second #cell.
+ */
 void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
                    struct part *restrict parts_i, int *restrict ind, int count,
                    struct cell *restrict cj) {
 
   struct engine *e = r->e;
-  int pid, pjd, sid, k, count_j = cj->count, flipped;
-  double shift[3] = {0.0, 0.0, 0.0};
-  struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
-  double pix[3];
-  float dx[3], hi, hig2, r2, di, dxj;
-  struct entry *sort_j;
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -331,10 +432,15 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
+  const int count_j = cj->count;
+  struct part *restrict parts_j = cj->parts;
+
   /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
     if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
       shift[k] = e->s->dim[k];
     else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
@@ -342,53 +448,50 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   }
 
   /* Get the sorting index. */
-  for (sid = 0, k = 0; k < 3; k++)
+  int sid = 0;
+  for (int k = 0; k < 3; k++)
     sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
                          ? 0
                          : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);
 
   /* Switch the cells around? */
-  flipped = runner_flip[sid];
+  const int flipped = runner_flip[sid];
   sid = sortlistID[sid];
 
   /* Have the cells been sorted? */
   if (!(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells.");
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
-
   /* Pick-out the sorted lists. */
-  sort_j = &cj->sort[sid * (cj->count + 1)];
-  dxj = cj->dx_max;
+  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
+  const float dxj = cj->dx_max;
 
   /* Parts are on the left? */
   if (!flipped) {
 
     /* Loop over the parts_i. */
-    for (pid = 0; pid < count; pid++) {
+    for (int pid = 0; pid < count; pid++) {
 
       /* Get a hold of the ith part in ci. */
-      pi = &parts_i[ind[pid]];
-      for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-      hi = pi->h;
-      hig2 = hi * hi * kernel_gamma2;
-      di = hi * kernel_gamma + dxj + pix[0] * runner_shift[3 * sid + 0] +
-           pix[1] * runner_shift[3 * sid + 1] +
-           pix[2] * runner_shift[3 * sid + 2];
+      struct part *restrict pi = &parts_i[ind[pid]];
+      double pix[3];
+      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+
+      const float hi = pi->h;
+      const float hig2 = hi * hi * kernel_gamma2;
+      const float di = hi * kernel_gamma + dxj + pix[0] * runner_shift[sid][0] +
+                       pix[1] * runner_shift[sid][1] +
+                       pix[2] * runner_shift[sid][2];
 
       /* Loop over the parts in cj. */
-      for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sort_j[pjd].i];
+        struct part *restrict pj = &parts_j[sort_j[pjd].i];
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -432,26 +535,28 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   else {
 
     /* Loop over the parts_i. */
-    for (pid = 0; pid < count; pid++) {
+    for (int pid = 0; pid < count; pid++) {
 
       /* Get a hold of the ith part in ci. */
-      pi = &parts_i[ind[pid]];
-      for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-      hi = pi->h;
-      hig2 = hi * hi * kernel_gamma2;
-      di = -hi * kernel_gamma - dxj + pix[0] * runner_shift[3 * sid + 0] +
-           pix[1] * runner_shift[3 * sid + 1] +
-           pix[2] * runner_shift[3 * sid + 2];
+      struct part *restrict pi = &parts_i[ind[pid]];
+      double pix[3];
+      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+      const float hi = pi->h;
+      const float hig2 = hi * hi * kernel_gamma2;
+      const float di =
+          -hi * kernel_gamma - dxj + pix[0] * runner_shift[sid][0] +
+          pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2];
 
       /* Loop over the parts in cj. */
-      for (pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) {
+      for (int pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sort_j[pjd].i];
+        struct part *restrict pj = &parts_j[sort_j[pjd].i];
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -493,119 +598,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
-      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
-#endif
-
-  TIMER_TOC(timer_dopair_subset);
-}
-
-/**
- * @brief Compute the interactions between a cell pair, but only for the
- *      given indices in ci.
- *
- * @param r The #runner.
- * @param ci The first #cell.
- * @param parts_i The #part to interact with @c cj.
- * @param ind The list of indices of particles in @c ci to interact with.
- * @param count The number of particles in @c ind.
- * @param cj The second #cell.
- */
-
-void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
-                         struct part *restrict parts_i, int *restrict ind,
-                         int count, struct cell *restrict cj) {
-
-  struct engine *e = r->e;
-  int pid, pjd, k, count_j = cj->count;
-  double shift[3] = {0.0, 0.0, 0.0};
-  struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
-  double pix[3];
-  float dx[3], hi, hig2, r2;
-#ifdef VECTORIZE
-  int icount = 0;
-  float r2q[VEC_SIZE] __attribute__((aligned(16)));
-  float hiq[VEC_SIZE] __attribute__((aligned(16)));
-  float hjq[VEC_SIZE] __attribute__((aligned(16)));
-  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
-  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
-#endif
-  TIMER_TIC
-
-  /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
-    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
-      shift[k] = e->s->dim[k];
-    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
-      shift[k] = -e->s->dim[k];
-  }
-
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
-
-  /* Loop over the parts_i. */
-  for (pid = 0; pid < count; pid++) {
-
-    /* Get a hold of the ith part in ci. */
-    pi = &parts_i[ind[pid]];
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
-
-    /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_j; pjd++) {
-
-      /* Get a pointer to the jth particle. */
-      pj = &parts_j[pjd];
-
-      /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
-        dx[k] = pix[k] - pj->x[k];
-        r2 += dx[k] * dx[k];
-      }
-
-      /* Hit or miss? */
-      if (r2 < hig2) {
-
-#ifndef VECTORIZE
-
-        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
-
-#else
-
-        /* Add this interaction to the queue. */
-        r2q[icount] = r2;
-        dxq[3 * icount + 0] = dx[0];
-        dxq[3 * icount + 1] = dx[1];
-        dxq[3 * icount + 2] = dx[2];
-        hiq[icount] = hi;
-        hjq[icount] = pj->h;
-        piq[icount] = pi;
-        pjq[icount] = pj;
-        icount += 1;
-
-        /* Flush? */
-        if (icount == VEC_SIZE) {
-          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
-          icount = 0;
-        }
-
-#endif
-      }
-
-    } /* loop over the parts in cj. */
-
-  } /* loop over the parts in ci. */
-
-#ifdef VECTORIZE
-  /* Pick up any leftovers. */
-  if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -622,15 +615,9 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
  * @param ind The list of indices of particles in @c ci to interact with.
  * @param count The number of particles in @c ind.
  */
-
 void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
                    struct part *restrict parts, int *restrict ind, int count) {
 
-  int pid, pjd, k, count_i = ci->count;
-  struct part *restrict parts_j = ci->parts;
-  struct part *restrict pi, *restrict pj;
-  double pix[3] = {0.0, 0.0, 0.0};
-  float dx[3], hi, hig2, r2;
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -639,35 +626,31 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
-  /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with
-  %i/%i parts and shift = [ %g %g %g ].\n" ,
-      ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] ,
-  cj->loc[2] ,
-      ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
-  tic = getticks(); */
+  const int count_i = ci->count;
+  struct part *restrict parts_j = ci->parts;
 
   /* Loop over the parts in ci. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts[ind[pid]];
-    pix[0] = pi->x[0];
-    pix[1] = pi->x[1];
-    pix[2] = pi->x[2];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    struct part *restrict pi = &parts[ind[pid]];
+    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_i; pjd++) {
+    for (int pjd = 0; pjd < count_i; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts_j[pjd];
+      struct part *restrict pj = &parts_j[pjd];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -708,7 +691,7 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
@@ -716,26 +699,17 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
 }
 
 /**
- * @brief Compute the interactions between a cell pair.
+ * @brief Compute the interactions between a cell pair (non-symmetric).
  *
  * @param r The #runner.
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
-
 void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
 
   struct engine *restrict e = r->e;
-  int pid, pjd, k, sid;
-  double rshift, shift[3] = {0.0, 0.0, 0.0};
-  struct entry *restrict sort_i, *restrict sort_j;
-  struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
-  double pix[3], pjx[3], di, dj;
-  float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
-  double hi_max, hj_max;
-  double di_max, dj_min;
-  int count_i, count_j;
   const int ti_current = e->ti_current;
+
 #ifdef VECTORIZE
   int icount = 0;
   float r2q[VEC_SIZE] __attribute__((aligned(16)));
@@ -744,60 +718,64 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
   float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
   /* Get the sort ID. */
-  sid = space_getsid(e->s, &ci, &cj, shift);
+  double shift[3] = {0.0, 0.0, 0.0};
+  const int sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Have the cells been sorted? */
   if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
     error("Trying to interact unsorted cells.");
 
   /* Get the cutoff shift. */
-  for (rshift = 0.0, k = 0; k < 3; k++)
-    rshift += shift[k] * runner_shift[3 * sid + k];
+  double rshift = 0.0;
+  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
 
   /* Pick-out the sorted lists. */
-  sort_i = &ci->sort[sid * (ci->count + 1)];
-  sort_j = &cj->sort[sid * (cj->count + 1)];
+  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
+  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
 
   /* Get some other useful values. */
-  hi_max = ci->h_max * kernel_gamma - rshift;
-  hj_max = cj->h_max * kernel_gamma;
-  count_i = ci->count;
-  count_j = cj->count;
-  parts_i = ci->parts;
-  parts_j = cj->parts;
-  di_max = sort_i[count_i - 1].d - rshift;
-  dj_min = sort_j[0].d;
-  dx_max = (ci->dx_max + cj->dx_max);
+  const double hi_max = ci->h_max * kernel_gamma - rshift;
+  const double hj_max = cj->h_max * kernel_gamma;
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+  const double di_max = sort_i[count_i - 1].d - rshift;
+  const double dj_min = sort_j[0].d;
+  const float dx_max = (ci->dx_max + cj->dx_max);
 
   /* Loop over the parts in ci. */
-  for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min;
-       pid--) {
+  for (int pid = count_i - 1;
+       pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts_i[sort_i[pid].i];
+    struct part *restrict pi = &parts_i[sort_i[pid].i];
     if (pi->ti_end > ti_current) continue;
-    hi = pi->h;
-    di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
+    const float hi = pi->h;
+    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
     if (di < dj_min) continue;
 
-    hig2 = hi * hi * kernel_gamma2;
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Loop over the parts in cj. */
-    for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+    for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
 
       /* Get a pointer to the jth particle. */
-      pj = &parts_j[sort_j[pjd].i];
+      struct part *restrict pj = &parts_j[sort_j[pjd].i];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pix[k] - pj->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -835,33 +813,31 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
 
   } /* loop over the parts in ci. */
 
-  /* printf( "runner_dopair: first half took %.3f %s...\n" ,
-  clocks_from_ticks(getticks() - tic), clocks_getunit());
-  tic = getticks(); */
-
   /* Loop over the parts in cj. */
-  for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
+  for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
        pjd++) {
 
     /* Get a hold of the jth part in cj. */
-    pj = &parts_j[sort_j[pjd].i];
+    struct part *restrict pj = &parts_j[sort_j[pjd].i];
     if (pj->ti_end > ti_current) continue;
-    hj = pj->h;
-    dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
+    const float hj = pj->h;
+    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
     if (dj > di_max) continue;
 
-    for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
-    hjg2 = hj * hj * kernel_gamma2;
+    double pjx[3];
+    for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
+    const float hjg2 = hj * hj * kernel_gamma2;
 
     /* Loop over the parts in ci. */
-    for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
+    for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
 
       /* Get a pointer to the jth particle. */
-      pi = &parts_i[sort_i[pid].i];
+      struct part *restrict pi = &parts_i[sort_i[pid].i];
 
       /* Compute the pairwise distance. */
-      r2 = 0.0f;
-      for (k = 0; k < 3; k++) {
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
         dx[k] = pjx[k] - pi->x[k];
         r2 += dx[k] * dx[k];
       }
@@ -902,28 +878,25 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount > 0)
-    for (k = 0; k < icount; k++)
+    for (int k = 0; k < icount; k++)
       IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
 #endif
 
   TIMER_TOC(TIMER_DOPAIR);
 }
 
+/**
+ * @brief Compute the interactions between a cell pair (symmetric).
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
 void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 
   struct engine *restrict e = r->e;
-  int pid, pjd, k, sid;
-  double rshift, shift[3] = {0.0, 0.0, 0.0};
-  struct entry *sort_i, *sort_j;
-  struct entry *sortdt_i = NULL, *sortdt_j = NULL;
-  int countdt_i = 0, countdt_j = 0;
-  struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
-  double pix[3], pjx[3], di, dj;
-  float dx[3], hi, hig2, hj, hjg2, r2, dx_max;
-  double hi_max, hj_max;
-  double di_max, dj_min;
-  int count_i, count_j;
   const int ti_current = e->ti_current;
+
 #ifdef VECTORIZE
   int icount1 = 0;
   float r2q1[VEC_SIZE] __attribute__((aligned(16)));
@@ -938,38 +911,42 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   float dxq2[3 * VEC_SIZE] __attribute__((aligned(16)));
   struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
 #endif
+
   TIMER_TIC
 
   /* Anything to do here? */
   if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
   /* Get the shift ID. */
-  sid = space_getsid(e->s, &ci, &cj, shift);
+  double shift[3] = {0.0, 0.0, 0.0};
+  const int sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Have the cells been sorted? */
   if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
     error("Trying to interact unsorted cells.");
 
   /* Get the cutoff shift. */
-  for (rshift = 0.0, k = 0; k < 3; k++)
-    rshift += shift[k] * runner_shift[3 * sid + k];
+  double rshift = 0.0;
+  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
 
   /* Pick-out the sorted lists. */
-  sort_i = &ci->sort[sid * (ci->count + 1)];
-  sort_j = &cj->sort[sid * (cj->count + 1)];
+  struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
+  struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
 
   /* Get some other useful values. */
-  hi_max = ci->h_max * kernel_gamma - rshift;
-  hj_max = cj->h_max * kernel_gamma;
-  count_i = ci->count;
-  count_j = cj->count;
-  parts_i = ci->parts;
-  parts_j = cj->parts;
-  di_max = sort_i[count_i - 1].d - rshift;
-  dj_min = sort_j[0].d;
-  dx_max = (ci->dx_max + cj->dx_max);
+  const double hi_max = ci->h_max * kernel_gamma - rshift;
+  const double hj_max = cj->h_max * kernel_gamma;
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+  const double di_max = sort_i[count_i - 1].d - rshift;
+  const double dj_min = sort_j[0].d;
+  const double dx_max = (ci->dx_max + cj->dx_max);
 
   /* Collect the number of parts left and right below dt. */
+  int countdt_i = 0, countdt_j = 0;
+  struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL;
   if (ci->ti_end_max <= ti_current) {
     sortdt_i = sort_i;
     countdt_i = count_i;
@@ -977,7 +954,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     if ((sortdt_i = (struct entry *)alloca(sizeof(struct entry) * count_i)) ==
         NULL)
       error("Failed to allocate dt sortlists.");
-    for (k = 0; k < count_i; k++)
+    for (int k = 0; k < count_i; k++)
       if (parts_i[sort_i[k].i].ti_end <= ti_current) {
         sortdt_i[countdt_i] = sort_i[k];
         countdt_i += 1;
@@ -990,7 +967,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     if ((sortdt_j = (struct entry *)alloca(sizeof(struct entry) * count_j)) ==
         NULL)
       error("Failed to allocate dt sortlists.");
-    for (k = 0; k < count_j; k++)
+    for (int k = 0; k < count_j; k++)
       if (parts_j[sort_j[k].i].ti_end <= ti_current) {
         sortdt_j[countdt_j] = sort_j[k];
         countdt_j += 1;
@@ -998,31 +975,33 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   }
 
   /* Loop over the parts in ci. */
-  for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min;
-       pid--) {
+  for (int pid = count_i - 1;
+       pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
 
     /* Get a hold of the ith part in ci. */
-    pi = &parts_i[sort_i[pid].i];
-    hi = pi->h;
-    di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
+    struct part *restrict pi = &parts_i[sort_i[pid].i];
+    const float hi = pi->h;
+    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
     if (di < dj_min) continue;
 
-    hig2 = hi * hi * kernel_gamma2;
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Look at valid dt parts only? */
     if (pi->ti_end > ti_current) {
 
       /* Loop over the parts in cj within dt. */
-      for (pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) {
+      for (int pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sortdt_j[pjd].i];
-        hj = pj->h;
+        struct part *restrict pj = &parts_j[sortdt_j[pjd].i];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pj->x[k] - pix[k];
           r2 += dx[k] * dx[k];
         }
@@ -1064,15 +1043,16 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     else {
 
       /* Loop over the parts in cj. */
-      for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
+      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts_j[sort_j[pjd].i];
-        hj = pj->h;
+        struct part *restrict pj = &parts_j[sort_j[pjd].i];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -1138,36 +1118,34 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 
   } /* loop over the parts in ci. */
 
-  /* printf( "runner_dopair: first half took %.3f %s...\n" ,
-  clocks_from_ticks(getticks() - tic), clocks_getunit());
-  tic = getticks(); */
-
   /* Loop over the parts in cj. */
-  for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
+  for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
        pjd++) {
 
     /* Get a hold of the jth part in cj. */
-    pj = &parts_j[sort_j[pjd].i];
-    hj = pj->h;
-    dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
+    struct part *restrict pj = &parts_j[sort_j[pjd].i];
+    const float hj = pj->h;
+    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
     if (dj > di_max) continue;
 
-    for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
-    hjg2 = hj * hj * kernel_gamma2;
+    double pjx[3];
+    for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
+    const float hjg2 = hj * hj * kernel_gamma2;
 
     /* Is this particle outside the dt? */
     if (pj->ti_end > ti_current) {
 
       /* Loop over the parts in ci. */
-      for (pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) {
+      for (int pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) {
 
         /* Get a pointer to the jth particle. */
-        pi = &parts_i[sortdt_i[pid].i];
-        hi = pi->h;
+        struct part *restrict pi = &parts_i[sortdt_i[pid].i];
+        const float hi = pi->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pi->x[k] - pjx[k];
           r2 += dx[k] * dx[k];
         }
@@ -1208,15 +1186,16 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
     else {
 
       /* Loop over the parts in ci. */
-      for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
+      for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
 
         /* Get a pointer to the jth particle. */
-        pi = &parts_i[sort_i[pid].i];
-        hi = pi->h;
+        struct part *restrict pi = &parts_i[sort_i[pid].i];
+        const float hi = pi->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pjx[k] - pi->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -1285,10 +1264,10 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount1 > 0)
-    for (k = 0; k < icount1; k++)
+    for (int k = 0; k < icount1; k++)
       IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
   if (icount2 > 0)
-    for (k = 0; k < icount2; k++)
+    for (int k = 0; k < icount2; k++)
       IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
 #endif
 
@@ -1296,20 +1275,15 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 }
 
 /**
- * @brief Compute the cell self-interaction.
+ * @brief Compute the cell self-interaction (non-symmetric).
  *
  * @param r The #runner.
  * @param c The #cell.
  */
-
 void DOSELF1(struct runner *r, struct cell *restrict c) {
 
-  int k, pid, pjd, count = c->count;
-  double pix[3];
-  float dx[3], hi, hj, hig2, r2;
-  struct part *restrict parts = c->parts, *restrict pi, *restrict pj;
   const int ti_current = r->e->ti_current;
-  int firstdt = 0, countdt = 0, *indt = NULL, doj;
+
 #ifdef VECTORIZE
   int icount1 = 0;
   float r2q1[VEC_SIZE] __attribute__((aligned(16)));
@@ -1326,43 +1300,49 @@ void DOSELF1(struct runner *r, struct cell *restrict c) {
 #endif
   TIMER_TIC
 
-  /* Set up indt if needed. */
-  if (c->ti_end_min > ti_current)
-    return;
-  else if (c->ti_end_max > ti_current) {
-    if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
-      error("Failed to allocate indt.");
-    for (k = 0; k < count; k++)
-      if (parts[k].ti_end <= ti_current) {
-        indt[countdt] = k;
-        countdt += 1;
-      }
-  }
+  if (c->ti_end_min > ti_current) return;
+  if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone");
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  /* Set up indt: the indices of the parts with ti_end <= ti_current. */
+  int *indt = NULL;
+  int countdt = 0, firstdt = 0;
+  if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
+    error("Failed to allocate indt.");
+  for (int k = 0; k < count; k++)
+    if (parts[k].ti_end <= ti_current) {
+      indt[countdt] = k;
+      countdt += 1;
+    }
 
   /* Loop over the particles in the cell. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a pointer to the ith particle. */
-    pi = &parts[pid];
+    struct part *restrict pi = &parts[pid];
 
     /* Get the particle position and radius. */
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k];
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Is the ith particle inactive? */
     if (pi->ti_end > ti_current) {
 
       /* Loop over the other particles .*/
-      for (pjd = firstdt; pjd < countdt; pjd++) {
+      for (int pjd = firstdt; pjd < countdt; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[indt[pjd]];
-        hj = pj->h;
+        struct part *restrict pj = &parts[indt[pjd]];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pj->x[k] - pix[k];
           r2 += dx[k] * dx[k];
         }
@@ -1407,19 +1387,21 @@ void DOSELF1(struct runner *r, struct cell *restrict c) {
       firstdt += 1;
 
       /* Loop over the other particles .*/
-      for (pjd = pid + 1; pjd < count; pjd++) {
+      for (int pjd = pid + 1; pjd < count; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[pjd];
-        hj = pj->h;
+        struct part *restrict pj = &parts[pjd];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
-        doj = (pj->ti_end <= ti_current) && (r2 < hj * hj * kernel_gamma2);
+        const int doj =
+            (pj->ti_end <= ti_current) && (r2 < hj * hj * kernel_gamma2);
 
         /* Hit or miss? */
         if (r2 < hig2 || doj) {
@@ -1510,24 +1492,26 @@ void DOSELF1(struct runner *r, struct cell *restrict c) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount1 > 0)
-    for (k = 0; k < icount1; k++)
+    for (int k = 0; k < icount1; k++)
       IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
   if (icount2 > 0)
-    for (k = 0; k < icount2; k++)
+    for (int k = 0; k < icount2; k++)
       IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
 #endif
 
   TIMER_TOC(TIMER_DOSELF);
 }
 
+/**
+ * @brief Compute the cell self-interaction (symmetric).
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
 void DOSELF2(struct runner *r, struct cell *restrict c) {
 
-  int k, pid, pjd, count = c->count;
-  double pix[3];
-  float dx[3], hi, hj, hig2, r2;
-  struct part *restrict parts = c->parts, *restrict pi, *restrict pj;
   const int ti_current = r->e->ti_current;
-  int firstdt = 0, countdt = 0, *indt = NULL;
+
 #ifdef VECTORIZE
   int icount1 = 0;
   float r2q1[VEC_SIZE] __attribute__((aligned(16)));
@@ -1544,43 +1528,49 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
 #endif
   TIMER_TIC
 
-  /* Set up indt if needed. */
-  if (c->ti_end_min > ti_current)
-    return;
-  else if (c->ti_end_max > ti_current) {
-    if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
-      error("Failed to allocate indt.");
-    for (k = 0; k < count; k++)
-      if (parts[k].ti_end <= ti_current) {
-        indt[countdt] = k;
-        countdt += 1;
-      }
-  }
+  if (c->ti_end_min > ti_current) return;
+  if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone");
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  /* Set up indt: the indices of the parts with ti_end <= ti_current. */
+  int *indt = NULL;
+  int countdt = 0, firstdt = 0;
+  if ((indt = (int *)alloca(sizeof(int) * count)) == NULL)
+    error("Failed to allocate indt.");
+  for (int k = 0; k < count; k++)
+    if (parts[k].ti_end <= ti_current) {
+      indt[countdt] = k;
+      countdt += 1;
+    }
 
   /* Loop over the particles in the cell. */
-  for (pid = 0; pid < count; pid++) {
+  for (int pid = 0; pid < count; pid++) {
 
     /* Get a pointer to the ith particle. */
-    pi = &parts[pid];
+    struct part *restrict pi = &parts[pid];
 
     /* Get the particle position and radius. */
-    for (k = 0; k < 3; k++) pix[k] = pi->x[k];
-    hi = pi->h;
-    hig2 = hi * hi * kernel_gamma2;
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k];
+    const float hi = pi->h;
+    const float hig2 = hi * hi * kernel_gamma2;
 
     /* Is the ith particle not active? */
     if (pi->ti_end > ti_current) {
 
       /* Loop over the other particles .*/
-      for (pjd = firstdt; pjd < countdt; pjd++) {
+      for (int pjd = firstdt; pjd < countdt; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[indt[pjd]];
-        hj = pj->h;
+        struct part *restrict pj = &parts[indt[pjd]];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pj->x[k] - pix[k];
           r2 += dx[k] * dx[k];
         }
@@ -1625,15 +1615,16 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
       firstdt += 1;
 
       /* Loop over the other particles .*/
-      for (pjd = pid + 1; pjd < count; pjd++) {
+      for (int pjd = pid + 1; pjd < count; pjd++) {
 
         /* Get a pointer to the jth particle. */
-        pj = &parts[pjd];
-        hj = pj->h;
+        struct part *restrict pj = &parts[pjd];
+        const float hj = pj->h;
 
         /* Compute the pairwise distance. */
-        r2 = 0.0f;
-        for (k = 0; k < 3; k++) {
+        float r2 = 0.0f;
+        float dx[3];
+        for (int k = 0; k < 3; k++) {
           dx[k] = pix[k] - pj->x[k];
           r2 += dx[k] * dx[k];
         }
@@ -1702,10 +1693,10 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
 #ifdef VECTORIZE
   /* Pick up any leftovers. */
   if (icount1 > 0)
-    for (k = 0; k < icount1; k++)
+    for (int k = 0; k < icount1; k++)
       IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]);
   if (icount2 > 0)
-    for (k = 0; k < icount2; k++)
+    for (int k = 0; k < icount2; k++)
       IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]);
 #endif
 
@@ -1713,7 +1704,7 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
 }
 
 /**
- * @brief Compute grouped sub-cell interactions
+ * @brief Compute grouped sub-cell interactions for pairs
  *
  * @param r The #runner.
  * @param ci The first #cell.
@@ -1724,559 +1715,574 @@ void DOSELF2(struct runner *r, struct cell *restrict c) {
  * @todo Hard-code the sid on the recursive calls to avoid the
  * redundant computations to find the sid on-the-fly.
  */
+void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
+                 int gettimer) {
 
-void DOSUB1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
-            int gettimer) {
-
-  int j = 0, k;
-  double shift[3];
-  float h;
   struct space *s = r->e->s;
   const int ti_current = r->e->ti_current;
 
   TIMER_TIC
 
-  /* Is this a single cell? */
-  if (cj == NULL) {
-
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current) return;
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
-    /* Recurse? */
-    if (ci->split) {
+  /* Get the cell dimensions. */
+  const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
 
-      /* Loop over all progeny. */
-      for (k = 0; k < 8; k++)
-        if (ci->progeny[k] != NULL) {
-          DOSUB1(r, ci->progeny[k], NULL, -1, 0);
-          for (j = k + 1; j < 8; j++)
-            if (ci->progeny[j] != NULL)
-              DOSUB1(r, ci->progeny[k], ci->progeny[j], -1, 0);
-        }
+  /* Get the type of pair (always recomputed; the passed sid is ignored). */
+  // if ( sid < 0 )
+  double shift[3];
+  sid = space_getsid(s, &ci, &cj, shift);
 
-    }
+  /* Recurse? */
+  if (ci->split && cj->split &&
+      fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+          h / 2) {
 
-    /* Otherwise, compute self-interaction. */
-    else
-      DOSELF1(r, ci);
+    /* Different types of flags. */
+    switch (sid) {
 
-  } /* self-interaction. */
+      /* Regular sub-cell interactions of a pair of cells. */
+      case 0: /* (  1 ,  1 ,  1 ) */
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        break;
 
-  /* Otherwise, it's a pair interaction. */
-  else {
+      case 1: /* (  1 ,  1 ,  0 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        break;
 
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
+      case 2: /* (  1 ,  1 , -1 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        break;
 
-    /* Get the cell dimensions. */
-    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+      case 3: /* (  1 ,  0 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        break;
 
-    /* Get the type of pair if not specified explicitly. */
-    // if ( sid < 0 )
-    sid = space_getsid(s, &ci, &cj, shift);
+      case 4: /* (  1 ,  0 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[0], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[1], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[2], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[3], -1, 0);
+        break;
 
-    /* Recurse? */
-    if (ci->split && cj->split &&
-        fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
-            h / 2) {
+      case 5: /* (  1 ,  0 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        break;
 
-      /* Different types of flags. */
-      switch (sid) {
+      case 6: /* (  1 , -1 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        break;
 
-        /* Regular sub-cell interactions of a single cell. */
-        case 0: /* (  1 ,  1 ,  1 ) */
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          break;
+      case 7: /* (  1 , -1 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        break;
 
-        case 1: /* (  1 ,  1 ,  0 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          break;
+      case 8: /* (  1 , -1 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        break;
 
-        case 2: /* (  1 ,  1 , -1 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          break;
+      case 9: /* (  0 ,  1 ,  1 ) */
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        break;
 
-        case 3: /* (  1 ,  0 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          break;
+      case 10: /* (  0 ,  1 ,  0 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[0], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[4], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[1], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[4], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[5], -1, 0);
+        break;
 
-        case 4: /* (  1 ,  0 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[0], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[1], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[2], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[3], -1, 0);
-          break;
+      case 11: /* (  0 ,  1 , -1 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        break;
 
-        case 5: /* (  1 ,  0 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          break;
+      case 12: /* (  0 ,  0 ,  1 ) */
+        if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[0], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[2], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[4], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[6], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[2], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[6], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[4], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[6], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[6], -1, 0);
+        break;
+    }
 
-        case 6: /* (  1 , -1 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          break;
+  }
 
-        case 7: /* (  1 , -1 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          break;
+  /* Otherwise, compute the pair directly. */
+  else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
 
-        case 8: /* (  1 , -1 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          break;
+    /* Do any of the cells need to be sorted first? */
+    if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
+    if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
 
-        case 9: /* (  0 ,  1 ,  1 ) */
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          break;
+    /* Compute the interactions. */
+    DOPAIR1(r, ci, cj);
+  }
 
-        case 10: /* (  0 ,  1 ,  0 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[0], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[4], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[1], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[4], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[5], -1, 0);
-          break;
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
+}
 
-        case 11: /* (  0 ,  1 , -1 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          break;
+/**
+ * @brief Compute grouped sub-cell interactions for self tasks
+ *
+ * @param r The #runner.
+ * @param ci The #cell.
+ * @param gettimer If non-zero, call TIMER_TOC(TIMER_DOSUB_SELF) at the end.
+ */
+void DOSUB_SELF1(struct runner *r, struct cell *ci, int gettimer) {
 
-        case 12: /* (  0 ,  0 ,  1 ) */
-          if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[0], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[2], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[4], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[1], cj->progeny[6], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[2], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[3], cj->progeny[6], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[4], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[5], cj->progeny[6], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
-            DOSUB1(r, ci->progeny[7], cj->progeny[6], -1, 0);
-          break;
-      }
+  const int ti_current = r->e->ti_current;
 
-    }
+  TIMER_TIC
 
-    /* Otherwise, compute the pair directly. */
-    else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current) return;
 
-      /* Do any of the cells need to be sorted first? */
-      if (!(ci->sorted & (1 << sid))) runner_dosort(r, ci, (1 << sid), 1);
-      if (!(cj->sorted & (1 << sid))) runner_dosort(r, cj, (1 << sid), 1);
+  /* Recurse? */
+  if (ci->split) {
 
-      /* Compute the interactions. */
-      DOPAIR1(r, ci, cj);
-    }
+    /* Loop over all progeny. */
+    for (int k = 0; k < 8; k++)
+      if (ci->progeny[k] != NULL) {
+        DOSUB_SELF1(r, ci->progeny[k], 0);
+        for (int j = k + 1; j < 8; j++)
+          if (ci->progeny[j] != NULL)
+            DOSUB_PAIR1(r, ci->progeny[k], ci->progeny[j], -1, 0);
+      }
+  }
 
-  } /* otherwise, pair interaction. */
+  /* Otherwise, compute self-interaction. */
+  else
+    DOSELF1(r, ci);
 
-  if (gettimer) TIMER_TOC(TIMER_DOSUB);
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_SELF);
 }
 
-void DOSUB2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
-            int gettimer) {
+/**
+ * @brief Compute grouped sub-cell interactions for pairs (symmetric case)
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ * @param sid The direction linking the cells (currently always recomputed).
+ * @param gettimer If non-zero, call TIMER_TOC(TIMER_DOSUB_PAIR) at the end.
+ *
+ * @todo Hard-code the sid on the recursive calls to avoid the
+ * redundant computations to find the sid on-the-fly.
+ */
+void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
+                 int gettimer) {
 
-  int j, k;
-  double shift[3];
-  float h;
   struct space *s = r->e->s;
   const int ti_current = r->e->ti_current;
 
   TIMER_TIC
 
-  /* Is this a single cell? */
-  if (cj == NULL) {
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
 
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current) return;
+  /* Get the cell dimensions. */
+  const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
 
-    /* Recurse? */
-    if (ci->split) {
+  /* Get the type of pair (always recomputed; the check below is disabled). */
+  // if ( sid < 0 )
+  double shift[3];
+  sid = space_getsid(s, &ci, &cj, shift);
 
-      /* Loop over all progeny. */
-      for (k = 0; k < 8; k++)
-        if (ci->progeny[k] != NULL) {
-          DOSUB2(r, ci->progeny[k], NULL, -1, 0);
-          for (j = k + 1; j < 8; j++)
-            if (ci->progeny[j] != NULL)
-              DOSUB2(r, ci->progeny[k], ci->progeny[j], -1, 0);
-        }
+  /* Recurse? */
+  if (ci->split && cj->split &&
+      fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
+          h / 2) {
 
-    }
+    /* Different types of flags. */
+    switch (sid) {
 
-    /* Otherwise, compute self-interaction. */
-    else
-      DOSELF2(r, ci);
+      /* Regular sub-cell interactions of a single cell. */
+      case 0: /* (  1 ,  1 ,  1 ) */
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        break;
 
-  } /* self-interaction. */
+      case 1: /* (  1 ,  1 ,  0 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        break;
 
-  /* Otherwise, it's a pair interaction. */
-  else {
+      case 2: /* (  1 ,  1 , -1 ) */
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        break;
 
-    /* Should we even bother? */
-    if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return;
+      case 3: /* (  1 ,  0 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        break;
 
-    /* Get the cell dimensions. */
-    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+      case 4: /* (  1 ,  0 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[0], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[1], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[2], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[3], -1, 0);
+        break;
 
-    /* Get the type of pair if not specified explicitly. */
-    // if ( sid < 0 )
-    sid = space_getsid(s, &ci, &cj, shift);
+      case 5: /* (  1 ,  0 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[1], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[3], -1, 0);
+        break;
 
-    /* Recurse? */
-    if (ci->split && cj->split &&
-        fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max <
-            h / 2) {
+      case 6: /* (  1 , -1 ,  1 ) */
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        break;
 
-      /* Different types of flags. */
-      switch (sid) {
+      case 7: /* (  1 , -1 ,  0 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[2], -1, 0);
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[3], -1, 0);
+        break;
 
-        /* Regular sub-cell interactions of a single cell. */
-        case 0: /* (  1 ,  1 ,  1 ) */
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          break;
+      case 8: /* (  1 , -1 , -1 ) */
+        if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0);
+        break;
 
-        case 1: /* (  1 ,  1 ,  0 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          break;
+      case 9: /* (  0 ,  1 ,  1 ) */
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        break;
 
-        case 2: /* (  1 ,  1 , -1 ) */
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          break;
+      case 10: /* (  0 ,  1 ,  0 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[0], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[4], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[1], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[4], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[5], -1, 0);
+        break;
 
-        case 3: /* (  1 ,  0 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          break;
+      case 11: /* (  0 ,  1 , -1 ) */
+        if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[1], -1, 0);
+        if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[5], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0);
+        if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[5], -1, 0);
+        break;
 
-        case 4: /* (  1 ,  0 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[0], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[1], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[2], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[3], -1, 0);
-          break;
+      case 12: /* (  0 ,  0 ,  1 ) */
+        if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[0], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[2], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[4], -1, 0);
+        if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[6], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[2], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0);
+        if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[6], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[4], -1, 0);
+        if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[6], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0);
+        if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
+          DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[6], -1, 0);
+        break;
+    }
 
-        case 5: /* (  1 ,  0 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0);
-          break;
+  }
 
-        case 6: /* (  1 , -1 ,  1 ) */
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          break;
+  /* Otherwise, compute the pair directly. */
+  else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
 
-        case 7: /* (  1 , -1 ,  0 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0);
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0);
-          break;
+    /* Do any of the cells need to be sorted first? */
+    if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
+    if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
 
-        case 8: /* (  1 , -1 , -1 ) */
-          if (ci->progeny[4] != NULL && cj->progeny[3] != NULL)
-            DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0);
-          break;
+    /* Compute the interactions. */
+    DOPAIR2(r, ci, cj);
+  }
 
-        case 9: /* (  0 ,  1 ,  1 ) */
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          break;
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
+}
 
-        case 10: /* (  0 ,  1 ,  0 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[0], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[4], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[1], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[4], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[5], -1, 0);
-          break;
+/**
+ * @brief Compute grouped sub-cell interactions for self tasks (symmetric case)
+ *
+ * @param r The #runner.
+ * @param ci The #cell.
+ * @param gettimer If non-zero, call TIMER_TOC(TIMER_DOSUB_SELF) at the end.
+ */
+void DOSUB_SELF2(struct runner *r, struct cell *ci, int gettimer) {
 
-        case 11: /* (  0 ,  1 , -1 ) */
-          if (ci->progeny[2] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0);
-          if (ci->progeny[2] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[1] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0);
-          if (ci->progeny[6] != NULL && cj->progeny[5] != NULL)
-            DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0);
-          break;
+  const int ti_current = r->e->ti_current;
 
-        case 12: /* (  0 ,  0 ,  1 ) */
-          if (ci->progeny[1] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[0], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[2], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[4], -1, 0);
-          if (ci->progeny[1] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[1], cj->progeny[6], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[2], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0);
-          if (ci->progeny[3] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[3], cj->progeny[6], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[4], -1, 0);
-          if (ci->progeny[5] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[5], cj->progeny[6], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[0] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[2] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[4] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0);
-          if (ci->progeny[7] != NULL && cj->progeny[6] != NULL)
-            DOSUB2(r, ci->progeny[7], cj->progeny[6], -1, 0);
-          break;
-      }
+  TIMER_TIC
 
-    }
+  /* Should we even bother? */
+  if (ci->ti_end_min > ti_current) return;
 
-    /* Otherwise, compute the pair directly. */
-    else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
+  /* Recurse? */
+  if (ci->split) {
 
-      /* Do any of the cells need to be sorted first? */
-      if (!(ci->sorted & (1 << sid))) runner_dosort(r, ci, (1 << sid), 1);
-      if (!(cj->sorted & (1 << sid))) runner_dosort(r, cj, (1 << sid), 1);
+    /* Loop over all progeny. */
+    for (int k = 0; k < 8; k++)
+      if (ci->progeny[k] != NULL) {
+        DOSUB_SELF2(r, ci->progeny[k], 0);
+        for (int j = k + 1; j < 8; j++)
+          if (ci->progeny[j] != NULL)
+            DOSUB_PAIR2(r, ci->progeny[k], ci->progeny[j], -1, 0);
+      }
 
-      /* Compute the interactions. */
-      DOPAIR2(r, ci, cj);
-    }
+  }
 
-  } /* otherwise, pair interaction. */
+  /* Otherwise, compute self-interaction. */
+  else
+    DOSELF2(r, ci);
 
-  if (gettimer) TIMER_TOC(TIMER_DOSUB);
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_SELF);
 }
 
 void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
                   int *ind, int count, struct cell *cj, int sid, int gettimer) {
 
-  int j, k;
-  double shift[3];
-  float h;
   struct space *s = r->e->s;
-  struct cell *sub = NULL;
   const int ti_current = r->e->ti_current;
 
   TIMER_TIC
 
   /* Find out in which sub-cell of ci the parts are. */
-  for (k = 0; k < 8; k++)
+  struct cell *sub = NULL;
+  for (int k = 0; k < 8; k++)
     if (ci->progeny[k] != NULL) {
       // if ( parts[ ind[ 0 ] ].x[0] >= ci->progeny[k]->loc[0] &&
       //      parts[ ind[ 0 ] ].x[0] <= ci->progeny[k]->loc[0] +
@@ -2302,7 +2308,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
 
       /* Loop over all progeny. */
       DOSUB_SUBSET(r, sub, parts, ind, count, NULL, -1, 0);
-      for (j = 0; j < 8; j++)
+      for (int j = 0; j < 8; j++)
         if (ci->progeny[j] != sub && ci->progeny[j] != NULL)
           DOSUB_SUBSET(r, sub, parts, ind, count, ci->progeny[j], -1, 0);
 
@@ -2318,7 +2324,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
   else {
 
     /* Get the cell dimensions. */
-    h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
+    const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2]));
 
     /* Recurse? */
     if (ci->split && cj->split &&
@@ -2326,6 +2332,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
             h / 2) {
 
       /* Get the type of pair if not specified explicitly. */
+      double shift[3] = {0.0, 0.0, 0.0};
       sid = space_getsid(s, &ci, &cj, shift);
 
       /* Different types of flags. */
@@ -2337,7 +2344,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
             DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0],
                          -1, 0);
           if (ci->progeny[7] != NULL && cj->progeny[0] == sub)
-            DOSUB_SUBSET(r, ci->progeny[0], parts, ind, count, cj->progeny[7],
+            DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7],
                          -1, 0);
           break;
 
@@ -2834,7 +2841,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
     else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) {
 
       /* Get the relative distance between the pairs, wrapping. */
-      for (k = 0; k < 3; k++) {
+      double shift[3] = {0.0, 0.0, 0.0};
+      for (int k = 0; k < 3; k++) {
         if (cj->loc[k] - ci->loc[k] < -s->dim[k] / 2)
           shift[k] = s->dim[k];
         else if (cj->loc[k] - ci->loc[k] > s->dim[k] / 2)
@@ -2842,7 +2850,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
       }
 
       /* Get the sorting index. */
-      for (sid = 0, k = 0; k < 3; k++)
+      int sid = 0;
+      for (int k = 0; k < 3; k++)
         sid =
             3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
                            ? 0
@@ -2850,7 +2859,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
       sid = sortlistID[sid];
 
       /* Do any of the cells need to be sorted first? */
-      if (!(cj->sorted & (1 << sid))) runner_dosort(r, cj, (1 << sid), 1);
+      if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
 
       /* Compute the interactions. */
       DOPAIR_SUBSET(r, ci, parts, ind, count, cj);
@@ -2858,5 +2867,5 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
 
   } /* otherwise, pair interaction. */
 
-  if (gettimer) TIMER_TOC(TIMER_DOSUB);
+  if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
 }
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index 02626295a49f314fef840bc044a476f5c9cf332d..e3788dfa1123584c913bca6baa6fc2db6698e6d0 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -19,11 +19,6 @@
 #ifndef SWIFT_RUNNER_DOIACT_GRAV_H
 #define SWIFT_RUNNER_DOIACT_GRAV_H
 
-/* Includes. */
-#include "cell.h"
-#include "clocks.h"
-#include "part.h"
-
 /**
  * @brief Compute the sorted gravity interactions between a cell pair.
  *
@@ -60,8 +55,8 @@ void runner_dopair_grav_new(struct runner *r, struct cell *ci,
   sid = space_getsid(e->s, &ci, &cj, shift);
 
   /* Make sure the cells are sorted. */
-  runner_dogsort(r, ci, (1 << sid), 0);
-  runner_dogsort(r, cj, (1 << sid), 0);
+  // runner_do_gsort(r, ci, (1 << sid), 0);
+  // runner_do_gsort(r, cj, (1 << sid), 0);
 
   /* Have the cells been sorted? */
   if (!(ci->gsorted & (1 << sid)) || !(cj->gsorted & (1 << sid)))
@@ -69,7 +64,7 @@ void runner_dopair_grav_new(struct runner *r, struct cell *ci,
 
   /* Get the cutoff shift. */
   for (rshift = 0.0, k = 0; k < 3; k++)
-    rshift += shift[k] * runner_shift[3 * sid + k];
+    rshift += shift[k] * runner_shift[sid][k];
 
   /* Pick-out the sorted lists. */
   sort_i = &ci->gsort[sid * (ci->count + 1)];
diff --git a/src/scheduler.c b/src/scheduler.c
index d1d343240b37f5afd5f41fecacf106b0e85f726f..39dab82a41197faa2f1d203b730774d7a2b9ea02 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -44,6 +44,9 @@
 #include "error.h"
 #include "intrinsics.h"
 #include "kernel_hydro.h"
+#include "queue.h"
+#include "space.h"
+#include "task.h"
 #include "timers.h"
 
 /**
@@ -66,8 +69,8 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta,
     struct task **unlocks_new;
     int *unlock_ind_new;
     s->size_unlocks *= 2;
-    if ((unlocks_new = (struct task **)malloc(
-             sizeof(struct task *) *s->size_unlocks)) == NULL ||
+    if ((unlocks_new = (struct task **)malloc(sizeof(struct task *) *
+                                              s->size_unlocks)) == NULL ||
         (unlock_ind_new = (int *)malloc(sizeof(int) * s->size_unlocks)) == NULL)
       error("Failed to re-allocate unlocks.");
     memcpy(unlocks_new, s->unlocks, sizeof(struct task *) * s->nr_unlocks);
@@ -95,13 +98,11 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta,
 
 void scheduler_splittasks(struct scheduler *s) {
 
-  const int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0},
-                         {-1, -1, 11, 10, 5, 4, 2, 1},
-                         {-1, -1, -1, 12, 7, 6, 4, 3},
-                         {-1, -1, -1, -1, 8, 7, 5, 4},
-                         {-1, -1, -1, -1, -1, 12, 10, 9},
-                         {-1, -1, -1, -1, -1, -1, 11, 10},
-                         {-1, -1, -1, -1, -1, -1, -1, 12}};
+  const int pts[7][8] = {
+      {-1, 12, 10, 9, 4, 3, 1, 0},     {-1, -1, 11, 10, 5, 4, 2, 1},
+      {-1, -1, -1, 12, 7, 6, 4, 3},    {-1, -1, -1, -1, 8, 7, 5, 4},
+      {-1, -1, -1, -1, -1, 12, 10, 9}, {-1, -1, -1, -1, -1, -1, 11, 10},
+      {-1, -1, -1, -1, -1, -1, -1, 12}};
   const float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788,
                                0.4025, 0.1897, 0.4025, 0.1897, 0.4025,
                                0.5788, 0.4025, 0.5788};
@@ -175,7 +176,7 @@ void scheduler_splittasks(struct scheduler *s) {
         if (scheduler_dosub && ci->count < space_subsize / ci->count) {
 
           /* convert to a self-subtask. */
-          t->type = task_type_sub;
+          t->type = task_type_sub_self;
 
         }
 
@@ -237,7 +238,7 @@ void scheduler_splittasks(struct scheduler *s) {
             sid != 0 && sid != 2 && sid != 6 && sid != 8) {
 
           /* Make this task a sub task. */
-          t->type = task_type_sub;
+          t->type = task_type_sub_pair;
 
         }
 
@@ -517,132 +518,6 @@ void scheduler_splittasks(struct scheduler *s) {
 
     } /* pair interaction? */
 
-    /* Gravity interaction? */
-    else if (t->type == task_type_grav_mm) {
-
-      /* Get a handle on the cells involved. */
-      struct cell *ci = t->ci;
-      struct cell *cj = t->cj;
-
-      /* Self-interaction? */
-      if (cj == NULL) {
-
-        /* Ignore this task if the cell has no gparts. */
-        if (ci->gcount == 0) t->type = task_type_none;
-
-        /* If the cell is split, recurse. */
-        else if (ci->split) {
-
-          /* Make a single sub-task? */
-          if (scheduler_dosub && ci->gcount < space_subsize / ci->gcount) {
-
-            t->type = task_type_sub;
-            t->subtype = task_subtype_grav;
-
-          }
-
-          /* Otherwise, just split the task. */
-          else {
-
-            /* Split this task into tasks on its progeny. */
-            t->type = task_type_none;
-            for (int j = 0; j < 8; j++)
-              if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) {
-                if (t->type == task_type_none) {
-                  t->type = task_type_grav_mm;
-                  t->ci = ci->progeny[j];
-                  t->cj = NULL;
-                } else
-                  t = scheduler_addtask(s, task_type_grav_mm, task_subtype_none,
-                                        0, 0, ci->progeny[j], NULL, 0);
-                for (int k = j + 1; k < 8; k++)
-                  if (ci->progeny[k] != NULL && ci->progeny[k]->gcount > 0) {
-                    if (t->type == task_type_none) {
-                      t->type = task_type_grav_mm;
-                      t->ci = ci->progeny[j];
-                      t->cj = ci->progeny[k];
-                    } else
-                      t = scheduler_addtask(s, task_type_grav_mm,
-                                            task_subtype_none, 0, 0,
-                                            ci->progeny[j], ci->progeny[k], 0);
-                  }
-              }
-            redo = (t->type != task_type_none);
-          }
-
-        }
-
-        /* Otherwise, just make a pp task out of it. */
-        else
-          t->type = task_type_grav_pp;
-
-      }
-
-      /* Nope, pair. */
-      else {
-
-        /* Make a sub-task? */
-        if (scheduler_dosub && ci->gcount < space_subsize / cj->gcount) {
-
-          t->type = task_type_sub;
-          t->subtype = task_subtype_grav;
-
-        }
-
-        /* Otherwise, split the task. */
-        else {
-
-          /* Get the opening angle theta. */
-          float dx[3], theta;
-          for (int k = 0; k < 3; k++) {
-            dx[k] = fabs(ci->loc[k] - cj->loc[k]);
-            if (s->space->periodic && dx[k] > 0.5 * s->space->dim[k])
-              dx[k] = -dx[k] + s->space->dim[k];
-            if (dx[k] > 0.0f) dx[k] -= ci->h[k];
-          }
-          theta =
-              (dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]) /
-              (ci->h[0] * ci->h[0] + ci->h[1] * ci->h[1] + ci->h[2] * ci->h[2]);
-
-          /* Ignore this task if the cell has no gparts. */
-          if (ci->gcount == 0 || cj->gcount == 0) t->type = task_type_none;
-
-          /* Split the interaction? */
-          else if (theta < const_theta_max * const_theta_max) {
-
-            /* Are both ci and cj split? */
-            if (ci->split && cj->split) {
-
-              /* Split this task into tasks on its progeny. */
-              t->type = task_type_none;
-              for (int j = 0; j < 8; j++)
-                if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) {
-                  for (int k = 0; k < 8; k++)
-                    if (cj->progeny[k] != NULL && cj->progeny[k]->gcount > 0) {
-                      if (t->type == task_type_none) {
-                        t->type = task_type_grav_mm;
-                        t->ci = ci->progeny[j];
-                        t->cj = cj->progeny[k];
-                      } else
-                        t = scheduler_addtask(
-                            s, task_type_grav_mm, task_subtype_none, 0, 0,
-                            ci->progeny[j], cj->progeny[k], 0);
-                    }
-                }
-              redo = (t->type != task_type_none);
-
-            }
-
-            /* Otherwise, make a pp task out of it. */
-            else
-              t->type = task_type_grav_pp;
-          }
-        }
-
-      } /* gravity pair interaction? */
-
-    } /* gravity interaction? */
-
   } /* loop over all tasks. */
 }
 
@@ -687,6 +562,8 @@ struct task *scheduler_addtask(struct scheduler *s, int type, int subtype,
   t->tic = 0;
   t->toc = 0;
   t->nr_unlock_tasks = 0;
+  t->rid = -1;
+  t->last_rid = -1;
 
   /* Init the lock. */
   lock_init(&t->lock);
@@ -808,10 +685,12 @@ void scheduler_ranktasks(struct scheduler *s) {
     left = j;
   }
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the tasks were ranked correctly. */
-  /* for ( k = 1 ; k < s->nr_tasks ; k++ )
-      if ( tasks[ tid[k-1] ].rank > tasks[ tid[k-1] ].rank )
-          error( "Task ranking failed." ); */
+  for (int k = 1; k < s->nr_tasks; k++) /* compare each adjacent pair */
+    if (tasks[tid[k - 1]].rank > tasks[tid[k]].rank)
+      error("Task ranking failed.");
+#endif
 }
 
 /**
@@ -831,7 +710,8 @@ void scheduler_reset(struct scheduler *s, int size) {
     if (s->tasks_ind != NULL) free(s->tasks_ind);
 
     /* Allocate the new lists. */
-    if ((s->tasks = (struct task *)malloc(sizeof(struct task) *size)) == NULL ||
+    if ((s->tasks = (struct task *)malloc(sizeof(struct task) * size)) ==
+            NULL ||
         (s->tasks_ind = (int *)malloc(sizeof(int) * size)) == NULL)
       error("Failed to allocate task lists.");
   }
@@ -898,23 +778,23 @@ void scheduler_reweight(struct scheduler *s) {
             t->weight +=
                 2 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags];
           break;
-        case task_type_sub:
-          if (t->cj != NULL) {
-            if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) {
-              if (t->flags < 0)
-                t->weight += 3 * wscale * t->ci->count * t->cj->count;
-              else
-                t->weight += 3 * wscale * t->ci->count * t->cj->count *
-                             sid_scale[t->flags];
-            } else {
-              if (t->flags < 0)
-                t->weight += 2 * wscale * t->ci->count * t->cj->count;
-              else
-                t->weight += 2 * wscale * t->ci->count * t->cj->count *
-                             sid_scale[t->flags];
-            }
-          } else
-            t->weight += 1 * wscale * t->ci->count * t->ci->count;
+        case task_type_sub_pair:
+          if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) {
+            if (t->flags < 0)
+              t->weight += 3 * wscale * t->ci->count * t->cj->count;
+            else
+              t->weight += 3 * wscale * t->ci->count * t->cj->count *
+                           sid_scale[t->flags];
+          } else {
+            if (t->flags < 0)
+              t->weight += 2 * wscale * t->ci->count * t->cj->count;
+            else
+              t->weight += 2 * wscale * t->ci->count * t->cj->count *
+                           sid_scale[t->flags];
+          }
+          break;
+        case task_type_sub_self:
+          t->weight += 1 * wscale * t->ci->count * t->ci->count;
           break;
         case task_type_ghost:
           if (t->ci == t->ci->super) t->weight += wscale * t->ci->count;
@@ -1081,6 +961,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
        any pre-processing needed. */
     switch (t->type) {
       case task_type_self:
+      case task_type_sub_self:
       case task_type_sort:
       case task_type_ghost:
       case task_type_kick:
@@ -1089,11 +970,10 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         qid = t->ci->super->owner;
         break;
       case task_type_pair:
-      case task_type_sub:
+      case task_type_sub_pair:
         qid = t->ci->super->owner;
-        if (t->cj != NULL &&
-            (qid < 0 ||
-             s->queues[qid].count > s->queues[t->cj->super->owner].count))
+        if (qid < 0 ||
+            s->queues[qid].count > s->queues[t->cj->super->owner].count)
           qid = t->cj->super->owner;
         break;
       case task_type_recv:
@@ -1253,7 +1133,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
          tries++) {
 
       /* Try to get a task from the suggested queue. */
-      if (s->queues[qid].count > 0) {
+      if (s->queues[qid].count > 0 || s->queues[qid].count_incoming > 0) {
         TIMER_TIC
         res = queue_gettask(&s->queues[qid], prev, 0);
         TIMER_TOC(timer_qget);
@@ -1264,7 +1144,9 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
       if (s->flags & scheduler_flag_steal) {
         int count = 0, qids[nr_queues];
         for (int k = 0; k < nr_queues; k++)
-          if (s->queues[k].count > 0) qids[count++] = k;
+          if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) {
+            qids[count++] = k;
+          }
         for (int k = 0; k < scheduler_maxsteal && count > 0; k++) {
           const int ind = rand_r(&seed) % count;
           TIMER_TIC
@@ -1336,7 +1218,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
 
   /* Init the unlocks. */
   if ((s->unlocks = (struct task **)malloc(
-           sizeof(struct task *) *scheduler_init_nr_unlocks)) == NULL ||
+           sizeof(struct task *) * scheduler_init_nr_unlocks)) == NULL ||
       (s->unlock_ind =
            (int *)malloc(sizeof(int) * scheduler_init_nr_unlocks)) == NULL)
     error("Failed to allocate unlocks.");
diff --git a/src/scheduler.h b/src/scheduler.h
index 64c694aea295c13810a20b626055fc6c15eb0af8..62af23152e7e2d2bc68e0cc4d7122f4dd0483aa1 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -35,7 +35,6 @@
 #include "cell.h"
 #include "lock.h"
 #include "queue.h"
-#include "space.h"
 #include "task.h"
 
 /* Some constants. */
@@ -89,7 +88,7 @@ struct scheduler {
   int nr_unlocks, size_unlocks;
 
   /* Lock for this scheduler. */
-  lock_type lock;
+  swift_lock_type lock;
 
   /* Waiting queue. */
   pthread_mutex_t sleep_mutex;
diff --git a/src/serial_io.c b/src/serial_io.c
index 10eab97f1bf118a842e274b521056d0d81b32db1..7e78276dc83430655b4ea4de2fb7425e71e07966 100644
--- a/src/serial_io.c
+++ b/src/serial_io.c
@@ -26,22 +26,22 @@
 /* Some standard headers. */
 #include <hdf5.h>
 #include <math.h>
+#include <mpi.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-/* MPI headers. */
-#ifdef WITH_MPI
-#include <mpi.h>
-#endif
-
 /* This object's header. */
 #include "serial_io.h"
 
 /* Local includes. */
 #include "common_io.h"
+#include "engine.h"
 #include "error.h"
+#include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 
 /*-----------------------------------------------------------------------------
  * Routines reading an IC file
@@ -590,6 +590,7 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
  * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor
  *
  * @param e The engine containing all the system.
+ * @param baseName The common part of the snapshot file name.
  * @param us The UnitSystem used for the conversion of units in the output.
  * @param mpi_rank The MPI rank of this node.
  * @param mpi_size The number of MPI ranks.
@@ -604,9 +605,10 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
  * Calls #error() if an error occurs.
  *
  */
-void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
-                         int mpi_size, MPI_Comm comm, MPI_Info info) {
-  hid_t h_file = 0, h_grp = 0, h_grpsph = 0;
+void write_output_serial(struct engine* e, const char* baseName,
+                         struct UnitSystem* us, int mpi_rank, int mpi_size,
+                         MPI_Comm comm, MPI_Info info) {
+  hid_t h_file = 0, h_grp = 0;
   const size_t Ngas = e->s->nr_parts;
   const size_t Ntot = e->s->nr_gparts;
   int periodic = e->s->periodic;
@@ -617,16 +619,13 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
   static int outputCount = 0;
   FILE* xmfFile = 0;
 
-  /* Number of particles of each type */
-  // const size_t Ndm = Ntot - Ngas;
-
-  /* MATTHIEU: Temporary fix to preserve master */
+  /* Number of unassociated gparts */
   const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
-  /* MATTHIEU: End temporary fix */
 
   /* File name */
   char fileName[FILENAME_BUFFER_SIZE];
-  snprintf(fileName, FILENAME_BUFFER_SIZE, "output_%03i.hdf5", outputCount);
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%03i.hdf5", baseName,
+           outputCount);
 
   /* Compute offset in the file and total number of particles */
   size_t N[NUM_PARTICLE_TYPES] = {Ngas, Ndm, 0};
@@ -646,10 +645,10 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
   if (mpi_rank == 0) {
 
     /* First time, we need to create the XMF file */
-    if (outputCount == 0) createXMFfile();
+    if (outputCount == 0) createXMFfile(baseName);
 
     /* Prepare the XMF file for the new entry */
-    xmfFile = prepareXMFfile();
+    xmfFile = prepareXMFfile(baseName);
 
     /* Write the part corresponding to this specific output */
     writeXMFoutputheader(xmfFile, fileName, e->time);
@@ -711,10 +710,17 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
     writeCodeDescription(h_file);
 
     /* Print the SPH parameters */
-    h_grpsph = H5Gcreate(h_file, "/SPH", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
-    if (h_grpsph < 0) error("Error while creating SPH group");
-    writeSPHflavour(h_grpsph);
-    H5Gclose(h_grpsph);
+    h_grp = H5Gcreate(h_file, "/SPH", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+    if (h_grp < 0) error("Error while creating SPH group");
+    writeSPHflavour(h_grp);
+    H5Gclose(h_grp);
+
+    /* Print the runtime parameters */
+    h_grp =
+        H5Gcreate(h_file, "/Parameters", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+    if (h_grp < 0) error("Error while creating parameters group");
+    parser_write_params_to_hdf5(e->parameter_file, h_grp);
+    H5Gclose(h_grp);
 
     /* Print the system of Units */
     writeUnitSystem(h_file, us);
diff --git a/src/serial_io.h b/src/serial_io.h
index 74ab8326dbeeb955e354687059cdd595657285f0..6b64624772214494a43316d3a8aa3910c7238dc8 100644
--- a/src/serial_io.h
+++ b/src/serial_io.h
@@ -19,6 +19,9 @@
 #ifndef SWIFT_SERIAL_IO_H
 #define SWIFT_SERIAL_IO_H
 
+/* Config parameters. */
+#include "../config.h"
+
 /* MPI headers. */
 #ifdef WITH_MPI
 #include <mpi.h>
@@ -36,8 +39,9 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
                     int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
                     MPI_Info info, int dry_run);
 
-void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
-                         int mpi_size, MPI_Comm comm, MPI_Info info);
+void write_output_serial(struct engine* e, const char* baseName,
+                         struct UnitSystem* us, int mpi_rank, int mpi_size,
+                         MPI_Comm comm, MPI_Info info);
 
 #endif
 
diff --git a/src/single_io.c b/src/single_io.c
index 1dc71087e102ff884dba7b7d4b6dcd6339335cac..3f65aae0b5d495670f2b4862e466ec849f997d63 100644
--- a/src/single_io.c
+++ b/src/single_io.c
@@ -35,9 +35,12 @@
 #include "single_io.h"
 
 /* Local includes. */
-#include "const.h"
 #include "common_io.h"
+#include "engine.h"
 #include "error.h"
+#include "kernel_hydro.h"
+#include "part.h"
+#include "units.h"
 
 /*-----------------------------------------------------------------------------
  * Routines reading an IC file
@@ -456,7 +459,8 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
  * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor
  *
  * @param e The engine containing all the system.
- * @param us The UnitSystem used for the conversion of units in the output
+ * @param baseName The common part of the snapshot file name.
+ * @param us The UnitSystem used for the conversion of units in the output.
  *
  * Creates an HDF5 output file and writes the particles contained
  * in the engine. If such a file already exists, it is erased and replaced
@@ -466,7 +470,8 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
  * Calls #error() if an error occurs.
  *
  */
-void write_output_single(struct engine* e, struct UnitSystem* us) {
+void write_output_single(struct engine* e, const char* baseName,
+                         struct UnitSystem* us) {
 
   hid_t h_file = 0, h_grp = 0;
   const size_t Ngas = e->s->nr_parts;
@@ -478,25 +483,22 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
   struct gpart* dmparts = NULL;
   static int outputCount = 0;
 
-  /* Number of particles of each type */
-  // const size_t Ndm = Ntot - Ngas;
-
-  /* MATTHIEU: Temporary fix to preserve master */
+  /* Number of unassociated gparts */
   const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
-  /* MATTHIEU: End temporary fix */
 
   long long N_total[NUM_PARTICLE_TYPES] = {Ngas, Ndm, 0};
 
   /* File name */
   char fileName[FILENAME_BUFFER_SIZE];
-  snprintf(fileName, FILENAME_BUFFER_SIZE, "output_%03i.hdf5", outputCount);
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%03i.hdf5", baseName,
+           outputCount);
 
   /* First time, we need to create the XMF file */
-  if (outputCount == 0) createXMFfile();
+  if (outputCount == 0) createXMFfile(baseName);
 
   /* Prepare the XMF file for the new entry */
   FILE* xmfFile = 0;
-  xmfFile = prepareXMFfile();
+  xmfFile = prepareXMFfile(baseName);
 
   /* Write the part corresponding to this specific output */
   writeXMFoutputheader(xmfFile, fileName, e->time);
@@ -563,6 +565,13 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
   writeSPHflavour(h_grp);
   H5Gclose(h_grp);
 
+  /* Print the runtime parameters */
+  h_grp =
+      H5Gcreate(h_file, "/Parameters", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+  if (h_grp < 0) error("Error while creating parameters group");
+  parser_write_params_to_hdf5(e->parameter_file, h_grp);
+  H5Gclose(h_grp);
+
   /* Print the system of Units */
   writeUnitSystem(h_file, us);
 
diff --git a/src/single_io.h b/src/single_io.h
index 587ebe07b6fa2b984b964baf282e7ceb1003ad29..d2c87655e1c91b92e8ccf2aa50d2e0bf98f13482 100644
--- a/src/single_io.h
+++ b/src/single_io.h
@@ -19,6 +19,9 @@
 #ifndef SWIFT_SINGLE_IO_H
 #define SWIFT_SINGLE_IO_H
 
+/* Config parameters. */
+#include "../config.h"
+
 /* Includes. */
 #include "engine.h"
 #include "part.h"
@@ -30,7 +33,8 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
                     struct gpart** gparts, size_t* Ngas, size_t* Ndm,
                     int* periodic, int dry_run);
 
-void write_output_single(struct engine* e, struct UnitSystem* us);
+void write_output_single(struct engine* e, const char* baseName,
+                         struct UnitSystem* us);
 
 #endif
 
diff --git a/src/space.c b/src/space.c
index 7609ddd426b3deac354998d071c802367aaa5fe5..0b42a3643f5740881da85e0533c02f7d8a0ee958 100644
--- a/src/space.c
+++ b/src/space.c
@@ -1,22 +1,25 @@
 /*******************************************************************************
-* This file is part of SWIFT.
-* Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
-*               2016 Peter W. Draper (p.w.draper@durham.ac.uk)
-*
-* This program is free software: you can redistribute it and/or modify
-* it under the terms of the GNU Lesser General Public License as published
-* by the Free Software Foundation, either version 3 of the License, or
-* (at your option) any later version.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-* GNU General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public License
-* along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*
-******************************************************************************/
+ * This file is part of SWIFT.
+ * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
 
 /* Config parameters. */
 #include "../config.h"
@@ -25,8 +28,8 @@
 #include <float.h>
 #include <limits.h>
 #include <math.h>
-#include <string.h>
 #include <stdlib.h>
+#include <string.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
@@ -164,15 +167,17 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
   /* Run through the parts and get the current h_max. */
   // tic = getticks();
   float h_max = s->cell_min / kernel_gamma / space_stretch;
-  if (s->cells != NULL) {
-    for (int k = 0; k < s->nr_cells; k++) {
-      if (s->cells[k].h_max > h_max) h_max = s->cells[k].h_max;
-    }
-  } else {
-    for (size_t k = 0; k < nr_parts; k++) {
-      if (s->parts[k].h > h_max) h_max = s->parts[k].h;
+  if (nr_parts > 0) {
+    if (s->cells != NULL) {
+      for (int k = 0; k < s->nr_cells; k++) {
+        if (s->cells[k].h_max > h_max) h_max = s->cells[k].h_max;
+      }
+    } else {
+      for (size_t k = 0; k < nr_parts; k++) {
+        if (s->parts[k].h > h_max) h_max = s->parts[k].h;
+      }
+      s->h_max = h_max;
     }
-    s->h_max = h_max;
   }
 
 /* If we are running in parallel, make sure everybody agrees on
@@ -200,9 +205,9 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
         "Must have at least 3 cells in each spatial dimension when periodicity "
         "is switched on.");
 
-  /* In MPI-Land, changing the top-level cell size requires that the
-   * global partition is recomputed and the particles redistributed.
-   * Be prepared to do that. */
+/* In MPI-Land, changing the top-level cell size requires that the
+ * global partition is recomputed and the particles redistributed.
+ * Be prepared to do that. */
 #ifdef WITH_MPI
   double oldh[3];
   double oldcdim[3];
@@ -296,10 +301,11 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
        * cells around the nodes. We repartition using the old space node
        * positions as a grid to resample. */
       if (s->e->nodeID == 0)
-        message("basic cell dimensions have increased - recalculating the "
-                "global partition.");
+        message(
+            "basic cell dimensions have increased - recalculating the "
+            "global partition.");
 
-      if (!partition_space_to_space(oldh, oldcdim, oldnodeIDs, s) ) {
+      if (!partition_space_to_space(oldh, oldcdim, oldnodeIDs, s)) {
 
         /* Failed, try another technique that requires no settings. */
         message("Failed to get a new partition, trying less optimal method");
@@ -371,7 +377,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   const ticks tic = getticks();
 
   /* Be verbose about this. */
-  // message( "re)building space..." ); fflush(stdout);
+  // message("re)building space..."); fflush(stdout);
 
   /* Re-grid if necessary, or just re-set the cell data. */
   space_regrid(s, cell_max, verbose);
@@ -425,9 +431,10 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
 // clocks_from_ticks(getticks() - tic), clocks_getunit());
 
 #ifdef WITH_MPI
+
   /* Move non-local parts to the end of the list. */
   const int local_nodeID = s->e->nodeID;
-  for (size_t k = 0; k < nr_parts; k++)
+  for (size_t k = 0; k < nr_parts;) {
     if (cells[ind[k]].nodeID != local_nodeID) {
       cells[ind[k]].count -= 1;
       nr_parts -= 1;
@@ -446,10 +453,28 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
       const int t = ind[k];
       ind[k] = ind[nr_parts];
       ind[nr_parts] = t;
+    } else {
+      /* Increment when not exchanging otherwise we need to retest "k".*/
+      k++;
+    }
+  }
+
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that all parts are in the correct places. */
+  for (size_t k = 0; k < nr_parts; k++) {
+    if (cells[ind[k]].nodeID != local_nodeID) {
+      error("Failed to move all non-local parts to send list");
+    }
+  }
+  for (size_t k = nr_parts; k < s->nr_parts; k++) {
+    if (cells[ind[k]].nodeID == local_nodeID) {
+      error("Failed to remove local parts from send list");
     }
+  }
+#endif
 
   /* Move non-local gparts to the end of the list. */
-  for (int k = 0; k < nr_gparts; k++)
+  for (int k = 0; k < nr_gparts;) {
     if (cells[gind[k]].nodeID != local_nodeID) {
       cells[gind[k]].gcount -= 1;
       nr_gparts -= 1;
@@ -466,19 +491,33 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
       const int t = gind[k];
       gind[k] = gind[nr_gparts];
       gind[nr_gparts] = t;
+    } else {
+      /* Increment when not exchanging otherwise we need to retest "k".*/
+      k++;
+    }
+  }
+
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that all gparts are in the correct place (untested). */
+  for (size_t k = 0; k < nr_gparts; k++) {
+    if (cells[gind[k]].nodeID != local_nodeID) {
+      error("Failed to move all non-local gparts to send list");
+    }
+  }
+  for (size_t k = nr_gparts; k < s->nr_gparts; k++) {
+    if (cells[gind[k]].nodeID == local_nodeID) {
+      error("Failed to remove local gparts from send list");
     }
+  }
+#endif
 
   /* Exchange the strays, note that this potentially re-allocates
      the parts arrays. */
-  /* TODO: This function also exchanges gparts, but this is shorted-out
-     until they are fully implemented. */
   size_t nr_parts_exchanged = s->nr_parts - nr_parts;
   size_t nr_gparts_exchanged = s->nr_gparts - nr_gparts;
   engine_exchange_strays(s->e, nr_parts, &ind[nr_parts], &nr_parts_exchanged,
                          nr_gparts, &gind[nr_gparts], &nr_gparts_exchanged);
 
-  /* Add post-processing, i.e. re-linking/creating of gparts here. */
-
   /* Set the new particle counts. */
   s->nr_parts = nr_parts + nr_parts_exchanged;
   s->nr_gparts = nr_gparts + nr_gparts_exchanged;
@@ -488,7 +527,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     int *ind_new;
     if ((ind_new = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
       error("Failed to allocate temporary particle indices.");
-    memcpy(ind_new, ind, sizeof(size_t) * nr_parts);
+    memcpy(ind_new, ind, sizeof(int) * nr_parts);
     free(ind);
     ind = ind_new;
   }
@@ -499,12 +538,15 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
     ind[k] =
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
     cells[ind[k]].count += 1;
-    /* if ( cells[ ind[k] ].nodeID != nodeID )
-        error( "Received part that does not belong to me (nodeID=%i)." , cells[
-       ind[k] ].nodeID ); */
+#ifdef SWIFT_DEBUG_CHECKS
+    if (cells[ind[k]].nodeID != local_nodeID)
+      error("Received part that does not belong to me (nodeID=%i).",
+            cells[ind[k]].nodeID);
+#endif
   }
   nr_parts = s->nr_parts;
-#endif
+
+#endif /* WITH_MPI */
 
   /* Sort the parts according to their cells. */
   space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose);
@@ -512,15 +554,18 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   /* Re-link the gparts. */
   part_relink_gparts(s->parts, nr_parts, 0);
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify space_sort_struct. */
-  /* for ( k = 1 ; k < nr_parts ; k++ ) {
-      if ( ind[k-1] > ind[k] ) {
-          error( "Sort failed!" );
-          }
-      else if ( ind[k] != cell_getid( cdim , parts[k].x[0]*ih[0] ,
-     parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) )
-          error( "Incorrect indices!" );
-      } */
+  for (size_t k = 1; k < nr_parts; k++) {
+    if (ind[k - 1] > ind[k]) {
+      error("Sort failed!");
+    } else if (ind[k] != cell_getid(cdim, s->parts[k].x[0] * ih[0],
+                                    s->parts[k].x[1] * ih[1],
+                                    s->parts[k].x[2] * ih[2])) {
+      error("Incorrect indices!");
+    }
+  }
+#endif
 
   /* We no longer need the indices as of here. */
   free(ind);
@@ -560,8 +605,8 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   /* We no longer need the indices as of here. */
   free(gind);
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the links are correct */
-  /* MATTHIEU: To be commented out once we are happy */
   for (size_t k = 0; k < nr_gparts; ++k) {
 
     if (s->gparts[k].id_or_neg_offset < 0) {
@@ -582,6 +627,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
       error("Linking problem !");
     }
   }
+#endif
 
   /* Hook the cells up to the parts. */
   // tic = getticks();
@@ -671,13 +717,14 @@ void space_parts_sort(struct space *s, int *ind, size_t N, int min, int max,
   /* Launch the sorting tasks. */
   engine_launch(s->e, s->e->nr_threads, (1 << task_type_part_sort), 0);
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify space_sort_struct. */
-  /* for (int i = 1; i < N; i++)
+  for (int i = 1; i < N; i++)
     if (ind[i - 1] > ind[i])
       error("Sorting failed (ind[%i]=%i,ind[%i]=%i), min=%i, max=%i.", i - 1,
-  ind[i - 1], i,
-            ind[i], min, max);
-  message("Sorting succeeded."); */
+            ind[i - 1], i, ind[i], min, max);
+  message("Sorting succeeded.");
+#endif
 
   /* Clean up. */
   free(space_sort_struct.stack);
@@ -739,19 +786,21 @@ void space_do_parts_sort() {
         }
       }
 
+#ifdef SWIFT_DEBUG_CHECKS
       /* Verify space_sort_struct. */
-      /* for (int k = i; k <= jj; k++)
+      for (int k = i; k <= jj; k++)
         if (ind[k] > pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (<=pivot).");
         }
       for (int k = jj + 1; k <= j; k++)
         if (ind[k] <= pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (>pivot).");
-        } */
+        }
+#endif
 
       /* Split-off largest interval. */
       if (jj - i > j - jj + 1) {
@@ -851,13 +900,14 @@ void space_gparts_sort(struct space *s, int *ind, size_t N, int min, int max,
   /* Launch the sorting tasks. */
   engine_launch(s->e, s->e->nr_threads, (1 << task_type_gpart_sort), 0);
 
+#ifdef SWIFT_DEBUG_CHECKS
   /* Verify space_sort_struct. */
-  /* for (int i = 1; i < N; i++)
+  for (int i = 1; i < N; i++)
     if (ind[i - 1] > ind[i])
       error("Sorting failed (ind[%i]=%i,ind[%i]=%i), min=%i, max=%i.", i - 1,
-  ind[i - 1], i,
-            ind[i], min, max);
-  message("Sorting succeeded."); */
+            ind[i - 1], i, ind[i], min, max);
+  message("Sorting succeeded.");
+#endif
 
   /* Clean up. */
   free(space_sort_struct.stack);
@@ -915,19 +965,21 @@ void space_do_gparts_sort() {
         }
       }
 
+#ifdef SWIFT_DEBUG_CHECKS
       /* Verify space_sort_struct. */
-      /* for (int k = i; k <= jj; k++)
+      for (int k = i; k <= jj; k++)
         if (ind[k] > pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (<=pivot).");
         }
       for (int k = jj + 1; k <= j; k++)
         if (ind[k] <= pivot) {
-          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k,
-                  ind[k], pivot, i, j);
+          message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.",
+                  k, ind[k], pivot, i, j);
           error("Partition failed (>pivot).");
-        } */
+        }
+#endif
 
       /* Split-off largest interval. */
       if (jj - i > j - jj + 1) {
@@ -1209,7 +1261,7 @@ void space_do_split(struct space *s, struct cell *c) {
       temp->depth = c->depth + 1;
       temp->split = 0;
       temp->h_max = 0.0;
-      temp->dx_max = 0.0;
+      temp->dx_max = 0.f;
       temp->nodeID = c->nodeID;
       temp->parent = c;
       c->progeny[k] = temp;
@@ -1254,16 +1306,19 @@ void space_do_split(struct space *s, struct cell *c) {
       struct xpart *xp = &xparts[k];
       const float h = p->h;
       const int ti_end = p->ti_end;
-      xp->x_old[0] = p->x[0];
-      xp->x_old[1] = p->x[1];
-      xp->x_old[2] = p->x[2];
+      xp->x_diff[0] = 0.f;
+      xp->x_diff[1] = 0.f;
+      xp->x_diff[2] = 0.f;
       if (h > h_max) h_max = h;
       if (ti_end < ti_end_min) ti_end_min = ti_end;
       if (ti_end > ti_end_max) ti_end_max = ti_end;
     }
     for (int k = 0; k < gcount; k++) {
-      struct gpart *p = &gparts[k];
-      const int ti_end = p->ti_end;
+      struct gpart *gp = &gparts[k];
+      const int ti_end = gp->ti_end;
+      gp->x_diff[0] = 0.f;
+      gp->x_diff[1] = 0.f;
+      gp->x_diff[2] = 0.f;
       if (ti_end < ti_end_min) ti_end_min = ti_end;
       if (ti_end > ti_end_max) ti_end_max = ti_end;
     }
@@ -1276,9 +1331,11 @@ void space_do_split(struct space *s, struct cell *c) {
   if (s->nr_parts > 0)
     c->owner =
         ((c->parts - s->parts) % s->nr_parts) * s->nr_queues / s->nr_parts;
-  else
+  else if (s->nr_gparts > 0)
     c->owner =
         ((c->gparts - s->gparts) % s->nr_gparts) * s->nr_queues / s->nr_gparts;
+  else
+    c->owner = 0; /* Ok, there is really nothing on this rank... */
 }
 
 /**
@@ -1397,12 +1454,15 @@ void space_init(struct space *s, const struct swift_params *params,
   s->size_parts_foreign = 0;
 
   /* Get the constants for the scheduler */
-  space_maxsize = parser_get_param_int(params, "Scheduler:cell_max_size");
-  space_subsize = parser_get_param_int(params, "Scheduler:cell_sub_size");
-  space_splitsize = parser_get_param_int(params, "Scheduler:cell_split_size");
-  if(verbose)
+  space_maxsize = parser_get_opt_param_int(params, "Scheduler:cell_max_size",
+                                           space_maxsize_default);
+  space_subsize = parser_get_opt_param_int(params, "Scheduler:cell_sub_size",
+                                           space_subsize_default);
+  space_splitsize = parser_get_opt_param_int(
+      params, "Scheduler:cell_split_size", space_splitsize_default);
+  if (verbose)
     message("max_size set to %d, sub_size set to %d, split_size set to %d",
-	    space_maxsize, space_subsize, space_splitsize);
+            space_maxsize, space_subsize, space_splitsize);
 
   /* Check that we have enough cells */
   if (s->cell_min * 3 > dim[0] || s->cell_min * 3 > dim[1] ||
@@ -1414,7 +1474,7 @@ void space_init(struct space *s, const struct swift_params *params,
 
   /* Apply h scaling */
   const double scaling =
-      parser_get_param_double(params, "InitialConditions:h_scaling");
+      parser_get_opt_param_double(params, "InitialConditions:h_scaling", 1.0);
   if (scaling != 1.0 && !dry_run) {
     message("Re-scaling smoothing lengths by a factor %e", scaling);
     for (size_t k = 0; k < Npart; k++) parts[k].h *= scaling;
@@ -1422,10 +1482,13 @@ void space_init(struct space *s, const struct swift_params *params,
 
   /* Apply shift */
   double shift[3] = {0.0, 0.0, 0.0};
-  shift[0] = parser_get_param_double(params, "InitialConditions:shift_x");
-  shift[1] = parser_get_param_double(params, "InitialConditions:shift_y");
-  shift[2] = parser_get_param_double(params, "InitialConditions:shift_z");
-  if ((shift[0] != 0 || shift[1] != 0 || shift[2] != 0) && !dry_run) {
+  shift[0] =
+      parser_get_opt_param_double(params, "InitialConditions:shift_x", 0.0);
+  shift[1] =
+      parser_get_opt_param_double(params, "InitialConditions:shift_y", 0.0);
+  shift[2] =
+      parser_get_opt_param_double(params, "InitialConditions:shift_z", 0.0);
+  if ((shift[0] != 0. || shift[1] != 0. || shift[2] != 0.) && !dry_run) {
     message("Shifting particles by [%e %e %e]", shift[0], shift[1], shift[2]);
     for (size_t k = 0; k < Npart; k++) {
       parts[k].x[0] += shift[0];
@@ -1471,10 +1534,12 @@ void space_init(struct space *s, const struct swift_params *params,
   }
 
   /* Allocate the extra parts array. */
-  if (posix_memalign((void *)&s->xparts, xpart_align,
-                     Npart * sizeof(struct xpart)) != 0)
-    error("Failed to allocate xparts.");
-  bzero(s->xparts, Npart * sizeof(struct xpart));
+  if (Npart > 0) {
+    if (posix_memalign((void *)&s->xparts, xpart_align,
+                       Npart * sizeof(struct xpart)) != 0)
+      error("Failed to allocate xparts.");
+    bzero(s->xparts, Npart * sizeof(struct xpart));
+  }
 
   /* Init the space lock. */
   if (lock_init(&s->lock) != 0) error("Failed to create space spin-lock.");
diff --git a/src/space.h b/src/space.h
index 88e2f6f52774651217c4ff24e25f549d8ae1e347..5cfb2cb8368ed60586aad62e0f753aab17bf55d1 100644
--- a/src/space.h
+++ b/src/space.h
@@ -1,6 +1,10 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -19,16 +23,18 @@
 #ifndef SWIFT_SPACE_H
 #define SWIFT_SPACE_H
 
-/* Includes. */
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <stddef.h>
 
-/* Local includes. */
+/* Includes. */
 #include "cell.h"
+#include "lock.h"
 #include "parser.h"
 #include "part.h"
-
-/* Forward-declare the engine to avoid cyclic includes. */
-struct engine;
+#include "space.h"
 
 /* Some constants. */
 #define space_maxdepth 10
@@ -93,7 +99,7 @@ struct space {
   int periodic;
 
   /* General-purpose lock for this space. */
-  lock_type lock;
+  swift_lock_type lock;
 
   /* Number of queues in the system. */
   int nr_queues;
@@ -154,4 +160,5 @@ void space_do_split(struct space *s, struct cell *c);
 void space_do_parts_sort();
 void space_do_gparts_sort();
 void space_link_cleanup(struct space *s);
+
 #endif /* SWIFT_SPACE_H */
diff --git a/src/swift.h b/src/swift.h
index e568a28c888295affc9ec45b6d059d34f5b4bf04..7e3116c1de8bc8e6cc2f89d0d5cbe9771ffbf33a 100644
--- a/src/swift.h
+++ b/src/swift.h
@@ -33,6 +33,7 @@
 #include "error.h"
 #include "gravity.h"
 #include "hydro.h"
+#include "hydro_properties.h"
 #include "lock.h"
 #include "map.h"
 #include "multipole.h"
@@ -40,6 +41,8 @@
 #include "parser.h"
 #include "part.h"
 #include "partition.h"
+#include "physical_constants.h"
+#include "potentials.h"
 #include "queue.h"
 #include "runner.h"
 #include "scheduler.h"
diff --git a/src/task.c b/src/task.c
index 5f1475a46e4626e1f51db673d73fd84f86e6edb6..77c5a944c8ca23e870624770c1e0cde4bf6195be 100644
--- a/src/task.c
+++ b/src/task.c
@@ -1,6 +1,10 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -43,13 +47,14 @@
 
 /* Task type names. */
 const char *taskID_names[task_type_count] = {
-    "none",      "sort",       "self",       "pair",    "sub",
-    "init",      "ghost",      "drift",      "kick",    "send",
-    "recv",      "grav_pp",    "grav_mm",    "grav_up", "grav_down",
-    "part_sort", "gpart_sort", "split_cell", "rewait"};
+    "none",       "sort",      "self",          "pair",      "sub_self",
+    "sub_pair",   "init",      "ghost",         "drift",     "kick",
+    "kick_fixdt", "send",      "recv",          "grav_pp",   "grav_mm",
+    "grav_up",    "grav_down", "grav_external", "part_sort", "gpart_sort",
+    "split_cell", "rewait"};
 
-const char *subtaskID_names[task_type_count] = {"none",  "density",
-                                                "force", "grav"};
+const char *subtaskID_names[task_type_count] = {"none", "density", "force",
+                                                "grav"};
 
 /**
  * @brief Computes the overlap between the parts array of two given cells.
@@ -112,13 +117,14 @@ void task_unlock(struct task *t) {
   /* Act based on task type. */
   switch (t->type) {
     case task_type_self:
+    case task_type_sub_self:
     case task_type_sort:
       cell_unlocktree(t->ci);
       break;
     case task_type_pair:
-    case task_type_sub:
+    case task_type_sub_pair:
       cell_unlocktree(t->ci);
-      if (t->cj != NULL) cell_unlocktree(t->cj);
+      cell_unlocktree(t->cj);
       break;
     case task_type_grav_pp:
     case task_type_grav_mm:
@@ -139,55 +145,68 @@ void task_unlock(struct task *t) {
 
 int task_lock(struct task *t) {
 
-  int type = t->type;
+  const int type = t->type;
+  const int subtype = t->subtype;
   struct cell *ci = t->ci, *cj = t->cj;
+#ifdef WITH_MPI
+  int res = 0, err = 0;
+  MPI_Status stat;
+#endif
 
-  /* Communication task? */
-  if (type == task_type_recv || type == task_type_send) {
+  switch (type) {
 
+    /* Communication task? */
+    case task_type_recv:
+    case task_type_send:
 #ifdef WITH_MPI
-    /* Check the status of the MPI request. */
-    int res = 0, err = 0;
-    MPI_Status stat;
-    if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) {
-      char buff[MPI_MAX_ERROR_STRING];
-      int len;
-      MPI_Error_string(err, buff, &len);
-      error("Failed to test request on send/recv task (tag=%i, %s).", t->flags,
-            buff);
-    }
-    return res;
+      /* Check the status of the MPI request. */
+      if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) {
+        char buff[MPI_MAX_ERROR_STRING];
+        int len;
+        MPI_Error_string(err, buff, &len);
+        error("Failed to test request on send/recv task (tag=%i, %s).",
+              t->flags, buff);
+      }
+      return res;
 #else
-    error("SWIFT was not compiled with MPI support.");
+      error("SWIFT was not compiled with MPI support.");
 #endif
+      break;
 
-  }
+    case task_type_sort:
+      if (cell_locktree(ci) != 0) return 0;
+      break;
 
-  /* Unary lock? */
-  else if (type == task_type_self || type == task_type_sort ||
-           (type == task_type_sub && cj == NULL)) {
-    if (cell_locktree(ci) != 0) return 0;
-  }
+    case task_type_self:
+    case task_type_sub_self:
+      if (subtype == task_subtype_grav) {
+        if (cell_glocktree(ci) != 0) return 0;
+      } else {
+        if (cell_locktree(ci) != 0) return 0;
+      }
+      break;
 
-  /* Otherwise, binary lock. */
-  else if (type == task_type_pair || (type == task_type_sub && cj != NULL)) {
-    if (ci->hold || cj->hold) return 0;
-    if (cell_locktree(ci) != 0) return 0;
-    if (cell_locktree(cj) != 0) {
-      cell_unlocktree(ci);
-      return 0;
-    }
-  }
+    case task_type_pair:
+    case task_type_sub_pair:
+      if (subtype == task_subtype_grav) {
+        if (ci->ghold || cj->ghold) return 0;
+        if (cell_glocktree(ci) != 0) return 0;
+        if (cell_glocktree(cj) != 0) {
+          cell_gunlocktree(ci);
+          return 0;
+        }
+      } else {
+        if (ci->hold || cj->hold) return 0;
+        if (cell_locktree(ci) != 0) return 0;
+        if (cell_locktree(cj) != 0) {
+          cell_unlocktree(ci);
+          return 0;
+        }
+      }
+      break;
 
-  /* Gravity tasks? */
-  else if (type == task_type_grav_mm || type == task_type_grav_pp ||
-           type == task_type_grav_down) {
-    if (ci->ghold || (cj != NULL && cj->ghold)) return 0;
-    if (cell_glocktree(ci) != 0) return 0;
-    if (cj != NULL && cell_glocktree(cj) != 0) {
-      cell_gunlocktree(ci);
-      return 0;
-    }
+    default:
+      break;
   }
 
   /* If we made it this far, we've got a lock. */
@@ -265,48 +284,6 @@ void task_rmunlock_blind(struct task *ta, struct task *tb) {
   lock_unlock_blind(&ta->lock);
 }
 
-/**
- * @brief Add an unlock_task to the given task.
- *
- * @param ta The unlocking #task.
- * @param tb The #task that will be unlocked.
- */
-
-void task_addunlock(struct task *ta, struct task *tb) {
-
-  error("Use sched_addunlock instead.");
-
-  /* Add the lock atomically. */
-  ta->unlock_tasks[atomic_inc(&ta->nr_unlock_tasks)] = tb;
-
-  /* Check a posteriori if we did not overshoot. */
-  if (ta->nr_unlock_tasks > task_maxunlock)
-    error("Too many unlock_tasks in task.");
-}
-
-void task_addunlock_old(struct task *ta, struct task *tb) {
-
-  int k;
-
-  lock_lock(&ta->lock);
-
-  /* Check if ta already unlocks tb. */
-  for (k = 0; k < ta->nr_unlock_tasks; k++)
-    if (ta->unlock_tasks[k] == tb) {
-      error("Duplicate unlock.");
-      lock_unlock_blind(&ta->lock);
-      return;
-    }
-
-  if (ta->nr_unlock_tasks == task_maxunlock)
-    error("Too many unlock_tasks in task.");
-
-  ta->unlock_tasks[ta->nr_unlock_tasks] = tb;
-  ta->nr_unlock_tasks += 1;
-
-  lock_unlock_blind(&ta->lock);
-}
-
 /**
  * @brief Prints the list of tasks contained in a given mask
  *
diff --git a/src/task.h b/src/task.h
index 9c0ba6087d772d7362a98bc40a838c6fa3713166..51a44b3127694a60da772ca4e728073b15ac1147 100644
--- a/src/task.h
+++ b/src/task.h
@@ -2,6 +2,9 @@
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
  *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -34,17 +37,20 @@ enum task_types {
   task_type_sort,
   task_type_self,
   task_type_pair,
-  task_type_sub,
+  task_type_sub_self,
+  task_type_sub_pair,
   task_type_init,
   task_type_ghost,
   task_type_drift,
   task_type_kick,
+  task_type_kick_fixdt,
   task_type_send,
   task_type_recv,
   task_type_grav_pp,
   task_type_grav_mm,
   task_type_grav_up,
   task_type_grav_down,
+  task_type_grav_external,
   task_type_part_sort,
   task_type_gpart_sort,
   task_type_split_cell,
@@ -73,7 +79,7 @@ struct task {
   char skip, tight, implicit;
   int flags, wait, rank, weight;
 
-  lock_type lock;
+  swift_lock_type lock;
 
   struct cell *ci, *cj;
 
@@ -81,7 +87,7 @@ struct task {
   MPI_Request req;
 #endif
 
-  int rid;
+  int rid, last_rid;
   ticks tic, toc;
 
   int nr_unlock_tasks;
@@ -92,7 +98,6 @@ struct task {
 void task_rmunlock(struct task *ta, struct task *tb);
 void task_rmunlock_blind(struct task *ta, struct task *tb);
 void task_cleanunlock(struct task *t, int type);
-void task_addunlock(struct task *ta, struct task *tb);
 void task_unlock(struct task *t);
 float task_overlap(const struct task *ta, const struct task *tb);
 int task_lock(struct task *t);
diff --git a/src/timers.c b/src/timers.c
index 2501d347c8cea608650ece4c2883dab85ceee058..b621d27c90902f06c3760cbef6a88237a2b3b95b 100644
--- a/src/timers.c
+++ b/src/timers.c
@@ -1,6 +1,9 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
diff --git a/src/timers.h b/src/timers.h
index de2d9edb9ed54717472e5ae1222dfb33235c3e95..c48961b39737f23eb936d7283f76651d33892991 100644
--- a/src/timers.h
+++ b/src/timers.h
@@ -1,6 +1,9 @@
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *               2016 John A. Regan (john.a.regan@durham.ac.uk)
+ *                    Tom Theuns (tom.theuns@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -20,6 +23,7 @@
 #define SWIFT_TIMERS_H
 
 /* Includes. */
+#include "atomic.h"
 #include "cycle.h"
 #include "inline.h"
 
@@ -37,11 +41,16 @@ enum {
   timer_dopair_density,
   timer_dopair_force,
   timer_dopair_grav,
-  timer_dosub_density,
-  timer_dosub_force,
-  timer_dosub_grav,
+  timer_dograv_external,
+  timer_dosub_self_density,
+  timer_dosub_self_force,
+  timer_dosub_self_grav,
+  timer_dosub_pair_density,
+  timer_dosub_pair_force,
+  timer_dosub_pair_grav,
   timer_dopair_subset,
-  timer_doghost,
+  timer_do_ghost,
+  timer_dorecv_cell,
   timer_gettask,
   timer_qget,
   timer_qsteal,
@@ -66,7 +75,7 @@ extern ticks timers[timer_count];
 #define TIMER_TOC2(t) timers_toc(t, tic2)
 INLINE static ticks timers_toc(int t, ticks tic) {
   ticks d = (getticks() - tic);
-  __sync_add_and_fetch(&timers[t], d);
+  atomic_add(&timers[t], d);
   return d;
 }
 #else
diff --git a/src/timestep.h b/src/timestep.h
new file mode 100644
index 0000000000000000000000000000000000000000..1674958812d1f9e72eb618a9cb026befd610d06d
--- /dev/null
+++ b/src/timestep.h
@@ -0,0 +1,136 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_TIMESTEP_H
+#define SWIFT_TIMESTEP_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Local headers. */
+#include "const.h"
+#include "debug.h"
+
+/**
+ * @brief Compute a valid integer time-step from a given time-step
+ *
+ * @param new_dt The time-step to convert.
+ * @param ti_begin The (integer) start of the previous time-step.
+ * @param ti_end The (integer) end of the previous time-step.
+ * @param timeBase_inv The inverse of the system's minimal time-step.
+ */
+__attribute__((always_inline)) INLINE static int get_integer_timestep(
+    float new_dt, int ti_begin, int ti_end, double timeBase_inv) {
+
+  /* Convert to integer time (the cast truncates towards zero) */
+  int new_dti = (int)(new_dt * timeBase_inv);
+
+  /* Recover the current (previous) integer timestep */
+  const int current_dti = ti_end - ti_begin;
+
+  /* Limit timestep increase to at most twice the current timestep */
+  if (current_dti > 0) new_dti = min(new_dti, 2 * current_dti);
+
+  /* Put on the time line: halve max_nr_timesteps until it is <= new_dti */
+  int dti_timeline = max_nr_timesteps;
+  while (new_dti < dti_timeline) dti_timeline /= 2;
+  new_dti = dti_timeline;
+
+  /* Only increase the timestep if it divides max_nr_timesteps - ti_end */
+  if (new_dti > current_dti) {
+    if ((max_nr_timesteps - ti_end) % new_dti > 0) new_dti = current_dti;
+  }
+
+  return new_dti;
+}
+
+/**
+ * @brief Compute the new (integer) time-step of a given #gpart
+ *
+ * @param gp The #gpart.
+ * @param e The #engine (used to get some constants).
+ */
+__attribute__((always_inline)) INLINE static int get_gpart_timestep(
+    const struct gpart *gp, const struct engine *e) {
+  /* Compute the next timestep (external potential and self-gravity) */
+  const float new_dt_external = gravity_compute_timestep_external(
+      e->external_potential, e->physical_constants, gp);
+  const float new_dt_self =
+      gravity_compute_timestep_self(e->physical_constants, gp);
+
+  float new_dt = fminf(new_dt_external, new_dt_self);
+
+  /* Limit timestep within the allowed range [e->dt_min, e->dt_max] */
+  new_dt = fminf(new_dt, e->dt_max);
+  new_dt = fmaxf(new_dt, e->dt_min);
+
+  /* Convert to integer time, given the #gpart's previous time-step */
+  const int new_dti =
+      get_integer_timestep(new_dt, gp->ti_begin, gp->ti_end, e->timeBase_inv);
+
+  return new_dti;
+}
+
+/**
+ * @brief Compute the new (integer) time-step of a given #part
+ *
+ * @param p The #part.
+ * @param xp The #xpart partner of p.
+ * @param e The #engine (used to get some constants).
+ */
+__attribute__((always_inline)) INLINE static int get_part_timestep(
+    const struct part *p, const struct xpart *xp, const struct engine *e) {
+
+  /* Compute the next timestep (hydro condition) */
+  const float new_dt_hydro = hydro_compute_timestep(p, xp, e->hydro_properties);
+
+  /* Compute the next timestep (gravity condition), if p has a #gpart */
+  float new_dt_grav = FLT_MAX;
+  if (p->gpart != NULL) {
+
+    const float new_dt_external = gravity_compute_timestep_external(
+        e->external_potential, e->physical_constants, p->gpart);
+    const float new_dt_self =
+        gravity_compute_timestep_self(e->physical_constants, p->gpart);
+
+    new_dt_grav = fminf(new_dt_external, new_dt_self);
+  }
+
+  /* Final time-step is minimum of hydro and gravity */
+  float new_dt = fminf(new_dt_hydro, new_dt_grav);
+
+  /* Limit change in h; no limit (FLT_MAX) when h_dt is exactly zero */
+  const float dt_h_change =
+      (p->h_dt != 0.0f)
+          ? fabsf(e->hydro_properties->log_max_h_change * p->h / p->h_dt)
+          : FLT_MAX;
+
+  new_dt = fminf(new_dt, dt_h_change);
+
+  /* Limit timestep within the allowed range [e->dt_min, e->dt_max] */
+  new_dt = fminf(new_dt, e->dt_max);
+  new_dt = fmaxf(new_dt, e->dt_min);
+
+  /* Convert to integer time, given the #part's previous time-step */
+  const int new_dti =
+      get_integer_timestep(new_dt, p->ti_begin, p->ti_end, e->timeBase_inv);
+
+  return new_dti;
+}
+
+#endif /* SWIFT_TIMESTEP_H */
diff --git a/src/tools.c b/src/tools.c
index 1efdc027d3da50733372e73e1cfd6a9c7206784f..169aac82f98c2dada035325eacaf2f568fd7e56c 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -19,16 +19,25 @@
  *
  ******************************************************************************/
 
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
 #include <math.h>
-#include <stdlib.h>
 #include <stddef.h>
 #include <stdio.h>
+#include <stdlib.h>
 
+/* This object's header. */
+#include "tools.h"
+
+/* Local includes. */
+#include "cell.h"
 #include "error.h"
+#include "gravity.h"
+#include "hydro.h"
 #include "part.h"
-#include "cell.h"
-#include "tools.h"
-#include "swift.h"
+#include "runner.h"
 
 /**
  *  Factorize a given integer, attempts to keep larger pair of factors.
@@ -56,10 +65,7 @@ void factor(int value, int *f1, int *f2) {
  * @param N The number of parts.
  * @param periodic Periodic boundary conditions flag.
  */
-
-void pairs_n2(double *dim, struct part *__restrict__ parts, int N,
-              int periodic) {
-
+void pairs_n2(double *dim, struct part *restrict parts, int N, int periodic) {
   int i, j, k, count = 0;
   // int mj, mk;
   // double maxratio = 1.0;
@@ -122,9 +128,7 @@ void pairs_n2(double *dim, struct part *__restrict__ parts, int N,
 }
 
 void pairs_single_density(double *dim, long long int pid,
-                          struct part *__restrict__ parts, int N,
-                          int periodic) {
-
+                          struct part *restrict parts, int N, int periodic) {
   int i, k;
   // int mj, mk;
   // double maxratio = 1.0;
@@ -384,9 +388,7 @@ void density_dump(int N) {
  */
 
 void engine_single_density(double *dim, long long int pid,
-                           struct part *__restrict__ parts, int N,
-                           int periodic) {
-
+                           struct part *restrict parts, int N, int periodic) {
   int i, k;
   double r2, dx[3];
   float fdx[3], ih;
@@ -432,8 +434,7 @@ void engine_single_density(double *dim, long long int pid,
 }
 
 void engine_single_force(double *dim, long long int pid,
-                         struct part *__restrict__ parts, int N, int periodic) {
-
+                         struct part *restrict parts, int N, int periodic) {
   int i, k;
   double r2, dx[3];
   float fdx[3];
@@ -475,3 +476,29 @@ void engine_single_force(double *dim, long long int pid,
           p.a_hydro[1], p.a_hydro[2]);
   fflush(stdout);
 }
+
+/**
+ * @brief Returns a rand()-based uniform deviate in [a,b[ (scale factor < 1).
+ */
+double random_uniform(double a, double b) {
+  return (rand() / ((double)RAND_MAX + 1.0)) * (b - a) + a;
+}
+
+/**
+ * @brief Randomly shuffle an array of particles (Fisher-Yates).
+ *
+ * @param parts The array of #part to shuffle in place.
+ * @param count The number of particles; error() is raised if it is < 2.
+ */
+void shuffle_particles(struct part *parts, const int count) {
+  if (count <= 1) error("Array not big enough to shuffle!");
+  for (int i = 0; i < count - 1; i++) {
+    /* Draw j from [i, count-1]: count - i slots, each of equal width. The
+     * clamp covers random_uniform() returning its upper bound exactly. */
+    int j = i + (int)random_uniform(0., (double)(count - i));
+    if (j > count - 1) j = count - 1;
+    struct part particle = parts[j];
+    parts[j] = parts[i];
+    parts[i] = particle;
+  }
+}
diff --git a/src/tools.h b/src/tools.h
index 01226ee7cdbfe42aa44affadc4a9cbe02bad2428..97f036994bc0d1e2b4a95d806cfdbd253664a260 100644
--- a/src/tools.h
+++ b/src/tools.h
@@ -22,21 +22,28 @@
 #ifndef SWIFT_TOOL_H
 #define SWIFT_TOOL_H
 
-#include "runner.h"
+/* Config parameters. */
+#include "../config.h"
+
+/* Includes. */
 #include "cell.h"
+#include "part.h"
+#include "runner.h"
 
 void factor(int value, int *f1, int *f2);
 void density_dump(int N);
 void pairs_single_grav(double *dim, long long int pid,
-                       struct gpart *__restrict__ gparts,
-                       const struct part *parts, int N, int periodic);
+                       struct gpart *restrict gparts, const struct part *parts,
+                       int N, int periodic);
 void pairs_single_density(double *dim, long long int pid,
-                          struct part *__restrict__ parts, int N, int periodic);
+                          struct part *restrict parts, int N, int periodic);
 
 void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj);
 void self_all_density(struct runner *r, struct cell *ci);
 
-void pairs_n2(double *dim, struct part *__restrict__ parts, int N,
-              int periodic);
+void pairs_n2(double *dim, struct part *restrict parts, int N, int periodic);
+
+double random_uniform(double a, double b);
+void shuffle_particles(struct part *parts, const int count);
 
 #endif /* SWIFT_TOOL_H */
diff --git a/src/units.c b/src/units.c
index 184dbe8a0df000008dba1d7003558d83b1f08cad..8e0ff08f1d92f47afbf44366acef7038fc8675c5 100644
--- a/src/units.c
+++ b/src/units.c
@@ -39,7 +39,6 @@
 /* Includes. */
 #include "const.h"
 #include "error.h"
-#include "units.h"
 
 /**
  * @brief Initialises the UnitSystem structure with the constants given in
@@ -47,20 +46,23 @@
  *
  * @param us The UnitSystem to initialize.
  * @param params The parsed parameter file.
+ * @param category The section of the parameter file to read from.
  */
-void units_init(struct UnitSystem* us, const struct swift_params* params) {
-
-  us->UnitMass_in_cgs =
-      parser_get_param_double(params, "UnitSystem:UnitMass_in_cgs");
-  us->UnitLength_in_cgs =
-      parser_get_param_double(params, "UnitSystem:UnitLength_in_cgs");
-  const double unitVelocity =
-      parser_get_param_double(params, "UnitSystem:UnitVelocity_in_cgs");
+void units_init(struct UnitSystem* us, const struct swift_params* params,
+                const char* category) {
+  /* Build each "<category>:<key>" name; snprintf() bounds the write. */
+  char buffer[200];
+  snprintf(buffer, sizeof(buffer), "%s:UnitMass_in_cgs", category);
+  us->UnitMass_in_cgs = parser_get_param_double(params, buffer);
+  snprintf(buffer, sizeof(buffer), "%s:UnitLength_in_cgs", category);
+  us->UnitLength_in_cgs = parser_get_param_double(params, buffer);
+  snprintf(buffer, sizeof(buffer), "%s:UnitVelocity_in_cgs", category);
+  const double unitVelocity = parser_get_param_double(params, buffer);
   us->UnitTime_in_cgs = us->UnitLength_in_cgs / unitVelocity;
-  us->UnitCurrent_in_cgs =
-      parser_get_param_double(params, "UnitSystem:UnitCurrent_in_cgs");
-  us->UnitTemperature_in_cgs =
-      parser_get_param_double(params, "UnitSystem:UnitTemp_in_cgs");
+  snprintf(buffer, sizeof(buffer), "%s:UnitCurrent_in_cgs", category);
+  us->UnitCurrent_in_cgs = parser_get_param_double(params, buffer);
+  snprintf(buffer, sizeof(buffer), "%s:UnitTemp_in_cgs", category);
+  us->UnitTemperature_in_cgs = parser_get_param_double(params, buffer);
 }
 
 /**
@@ -331,7 +333,7 @@ void units_conversion_string(char* buffer, const struct UnitSystem* us,
  * the desired quantity. See conversionFactor() for a working example
  */
 double units_general_conversion_factor(const struct UnitSystem* us,
-                                       float baseUnitsExponants[5]) {
+                                       const float baseUnitsExponants[5]) {
   double factor = 1.;
   int i;
 
@@ -349,7 +351,7 @@ double units_general_conversion_factor(const struct UnitSystem* us,
  * the desired quantity. See conversionFactor() for a working example
  */
 float units_general_h_factor(const struct UnitSystem* us,
-                             float baseUnitsExponants[5]) {
+                             const float baseUnitsExponants[5]) {
   float factor_exp = 0.f;
 
   factor_exp += -baseUnitsExponants[UNIT_MASS];
@@ -367,7 +369,7 @@ float units_general_h_factor(const struct UnitSystem* us,
  * the desired quantity. See conversionFactor() for a working example
  */
 float units_general_a_factor(const struct UnitSystem* us,
-                             float baseUnitsExponants[5]) {
+                             const float baseUnitsExponants[5]) {
   float factor_exp = 0.f;
 
   factor_exp += baseUnitsExponants[UNIT_LENGTH];
@@ -385,7 +387,7 @@ float units_general_a_factor(const struct UnitSystem* us,
  * the desired quantity. See conversionFactor() for a working example
  */
 void units_general_conversion_string(char* buffer, const struct UnitSystem* us,
-                                     float baseUnitsExponants[5]) {
+                                     const float baseUnitsExponants[5]) {
   char temp[14];
   double a_exp = units_general_a_factor(us, baseUnitsExponants);
   double h_exp = units_general_h_factor(us, baseUnitsExponants);
diff --git a/src/units.h b/src/units.h
index 3e349dc16787cd4052a3e9205b21dce3c3732448..24e37e177480d7f84e41df1b73e2036aa00b7220 100644
--- a/src/units.h
+++ b/src/units.h
@@ -92,24 +92,25 @@ enum UnitConversionFactor {
   UNIT_CONV_TEMPERATURE
 };
 
-void units_init(struct UnitSystem*, const struct swift_params*);
+void units_init(struct UnitSystem*, const struct swift_params*,
+                const char* category);
 double units_get_base_unit(const struct UnitSystem*, enum BaseUnits);
 const char* units_get_base_unit_symbol(enum BaseUnits);
 const char* units_get_base_unit_CGS_symbol(enum BaseUnits);
 double units_general_conversion_factor(const struct UnitSystem* us,
-                                       float baseUnitsExponants[5]);
+                                       const float baseUnitsExponants[5]);
 double units_conversion_factor(const struct UnitSystem* us,
                                enum UnitConversionFactor unit);
 float units_general_h_factor(const struct UnitSystem* us,
-                             float baseUnitsExponants[5]);
+                             const float baseUnitsExponants[5]);
 float units_h_factor(const struct UnitSystem* us,
                      enum UnitConversionFactor unit);
 float units_general_a_factor(const struct UnitSystem* us,
-                             float baseUnitsExponants[5]);
+                             const float baseUnitsExponants[5]);
 float units_a_factor(const struct UnitSystem* us,
                      enum UnitConversionFactor unit);
 void units_general_conversion_string(char* buffer, const struct UnitSystem* us,
-                                     float baseUnitsExponants[5]);
+                                     const float baseUnitsExponants[5]);
 void units_conversion_string(char* buffer, const struct UnitSystem* us,
                              enum UnitConversionFactor unit);
 
diff --git a/src/vector.h b/src/vector.h
index ef2b7c4b9e42ceb61dc38c3196c1819be652926f..fa311f121f7b702f2288be0d561e520b52330457 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -79,6 +79,12 @@
                             _mm512_set1_epi64(ptrs[0])),                    \
       1)
 #define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
+#define FILL_VEC(a)                                                     \
+  {                                                                     \
+    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a,   \
+    .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
+    .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a                      \
+  }
 #elif defined(NO__AVX__)
 #define VECTORIZE
 #define VEC_SIZE 8
@@ -107,6 +113,11 @@
 #define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
 #define vec_dbl_fmin(a, b) _mm256_min_pd(a, b)
 #define vec_dbl_fmax(a, b) _mm256_max_pd(a, b)
+#define FILL_VEC(a)                                                   \
+  {                                                                   \
+    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
+    .f[6] = a, .f[7] = a                                              \
+  }
 #ifdef __AVX2__
 #define VEC_HAVE_GATHER
 #define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
@@ -139,6 +150,8 @@
 #define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
 #define vec_dbl_fmin(a, b) _mm_min_pd(a, b)
 #define vec_dbl_fmax(a, b) _mm_max_pd(a, b)
+#define FILL_VEC(a) \
+  { .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a }
 #else
 #define VEC_SIZE 4
 #endif
diff --git a/src/version.c b/src/version.c
index 27841a16019a69442e66b21c327f4241e440fb12..ab22c0d9ab8841dedd09aa83aa8803e468e69ce9 100644
--- a/src/version.c
+++ b/src/version.c
@@ -40,6 +40,9 @@
 /* This object's header. */
 #include "version.h"
 
+/* Local headers. */
+#include "version_string.h"
+
 /**
  * @brief Return the source code git revision
  *
diff --git a/src/version.h.in b/src/version.h
similarity index 82%
rename from src/version.h.in
rename to src/version.h
index 7824e06b996dfbb4178b57f1b72365a1e2d24484..4b6b38d220b36e5cb340571cd89e6f52a7b21a8e 100644
--- a/src/version.h.in
+++ b/src/version.h
@@ -2,33 +2,24 @@
  * This file is part of SWIFT.
  * Copyright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
  * Copyright (c) 2015 Peter W. Draper (p.w.draper@durham.ac.uk).
- * 
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
+ *
  ******************************************************************************/
 #ifndef SWIFT_VERSION_H
 #define SWIFT_VERSION_H
 
-/**
- * @file version.h
- * @brief Package version, git revision sha and compiler info.
- */
-
-#define PACKAGE_VERSION "@PACKAGE_VERSION@"
-#define GIT_REVISION "@GIT_REVISION@"
-#define GIT_BRANCH "@GIT_BRANCH@"
-
 const char* package_description(void);
 const char* package_version(void);
 const char* git_revision(void);
@@ -36,8 +27,8 @@ const char* git_branch(void);
 const char* compiler_name(void);
 const char* compiler_version(void);
 const char* mpi_version(void);
-const char *hdf5_version(void);
-const char *metis_version(void);
+const char* hdf5_version(void);
+const char* metis_version(void);
 void greetings(void);
 
 #endif /* SWIFT_VERSION_H */
diff --git a/src/version_string.h.in b/src/version_string.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..0bc282e0a3b6ba6fe5bab773c10c12e6d9277c2c
--- /dev/null
+++ b/src/version_string.h.in
@@ -0,0 +1,32 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ * Copyright (c) 2015 Peter W. Draper (p.w.draper@durham.ac.uk).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_VERSION_STRING_H
+#define SWIFT_VERSION_STRING_H
+
+/**
+ * @file version_string.h
+ * @brief Package version, git revision sha and git branch strings.
+ */
+
+#define PACKAGE_VERSION "@PACKAGE_VERSION@"
+#define GIT_REVISION "@GIT_REVISION@"
+#define GIT_BRANCH "@GIT_BRANCH@"
+
+#endif /* SWIFT_VERSION_STRING_H */
diff --git a/tests/Makefile.am b/tests/Makefile.am
index b53a08615c5a8c7c2c31475bf7207522f8b9a58c..d0c132ad1b6dadd749a389fb71b873120b48139a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -22,7 +22,7 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS)
 
 # List of programs and scripts to run in the test suite
 TESTS = testGreetings testReading.sh testSingle testPair.sh testPairPerturbed.sh \
-	test27cells.sh test27cellsPerturbed.sh testParser.sh
+	test27cells.sh test27cellsPerturbed.sh testParser.sh testKernel
 
 # List of test programs to compile
 check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \
diff --git a/tests/test27cells.c b/tests/test27cells.c
index 7915511eed50a229a94eda6bb338607099303421..3e2f11c7aabac0a7ff2df19f2c48f9f81ea55df5 100644
--- a/tests/test27cells.c
+++ b/tests/test27cells.c
@@ -18,9 +18,9 @@
  ******************************************************************************/
 
 #include <fenv.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stdio.h>
 #include <unistd.h>
 #include "swift.h"
 
@@ -31,14 +31,6 @@ enum velocity_types {
   velocity_rotating
 };
 
-/**
- * @brief Returns a random number (uniformly distributed) in [a,b[
- */
-double random_uniform(double a, double b) {
-  return (rand() / (double)RAND_MAX) * (b - a) + a;
-}
-
-
 /**
  * @brief Constructs a cell and all of its particle in a valid state prior to
  * a DOPAIR or DOSELF calcuation.
@@ -46,10 +38,12 @@ double random_uniform(double a, double b) {
  * @param n The cube root of the number of particles.
  * @param offset The position of the cell offset from (0,0,0).
  * @param size The cell size.
- * @param h The smoothing length of the particles in units of the inter-particle separation.
+ * @param h The smoothing length of the particles in units of the inter-particle
+ *separation.
  * @param density The density of the fluid.
  * @param partId The running counter of IDs.
- * @param pert The perturbation to apply to the particles in the cell in units of the inter-particle separation.
+ * @param pert The perturbation to apply to the particles in the cell in units
+ *of the inter-particle separation.
  * @param vel The type of velocity field (0, random, divergent, rotating)
  */
 struct cell *make_cell(size_t n, double *offset, double size, double h,
@@ -127,10 +121,12 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
   cell->ti_end_min = 1;
   cell->ti_end_max = 1;
 
+  shuffle_particles(cell->parts, cell->count);
+
   cell->sorted = 0;
   cell->sort = NULL;
   cell->sortsize = 0;
-  runner_dosort(NULL, cell, 0x1FFF, 0);
+  runner_do_sort(NULL, cell, 0x1FFF, 0);
 
   return cell;
 }
@@ -145,7 +141,6 @@ void clean_up(struct cell *ci) {
  * @brief Initializes all particles field to be ready for a density calculation
  */
 void zero_particle_fields(struct cell *c) {
-
   for (size_t pid = 0; pid < c->count; pid++) {
     c->parts[pid].rho = 0.f;
     c->parts[pid].rho_dh = 0.f;
@@ -157,7 +152,6 @@ void zero_particle_fields(struct cell *c) {
  * @brief Ends the loop by adding the appropriate coefficients
  */
 void end_calculation(struct cell *c) {
-
   for (size_t pid = 0; pid < c->count; pid++) {
     hydro_end_density(&c->parts[pid], 1);
   }
@@ -168,7 +162,6 @@ void end_calculation(struct cell *c) {
  */
 void dump_particle_fields(char *fileName, struct cell *main_cell,
                           struct cell **cells) {
-
   FILE *file = fopen(fileName, "w");
 
   /* Write header */
@@ -205,7 +198,6 @@ void dump_particle_fields(char *fileName, struct cell *main_cell,
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 3; ++k) {
-
         struct cell *cj = cells[i * 9 + j * 3 + k];
         if (cj == main_cell) continue;
 
@@ -242,7 +234,6 @@ void runner_doself1_density(struct runner *r, struct cell *ci);
 
 /* And go... */
 int main(int argc, char *argv[]) {
-
   size_t runs = 0, particles = 0;
   double h = 1.2348, size = 1., rho = 1.;
   double perturbation = 0.;
@@ -310,7 +301,8 @@ int main(int argc, char *argv[]) {
   /* Help users... */
   message("Smoothing length: h = %f", h * size);
   message("Kernel:               %s", kernel_name);
-  message("Neighbour target: N = %f", h * h * h * kernel_nwneigh / 1.88273);
+  message("Neighbour target: N = %f",
+          h * h * h * 4.0 * M_PI * kernel_gamma3 / 3.0);
   message("Density target: rho = %f", rho);
   message("div_v target:   div = %f", vel == 2 ? 3.f : 0.f);
   message("curl_v target: curl = [0., 0., %f]", vel == 3 ? -2.f : 0.f);
@@ -336,7 +328,6 @@ int main(int argc, char *argv[]) {
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 3; ++k) {
-
         double offset[3] = {i * size, j * size, k * size};
         cells[i * 9 + j * 3 + k] = make_cell(particles, offset, size, h, rho,
                                              &partId, perturbation, vel);
@@ -349,7 +340,6 @@ int main(int argc, char *argv[]) {
 
   ticks time = 0;
   for (size_t i = 0; i < runs; ++i) {
-
     /* Zero the fields */
     for (int j = 0; j < 27; ++j) zero_particle_fields(cells[j]);
 
diff --git a/tests/testKernel.c b/tests/testKernel.c
index 5ad9cc81ea92e6ef9487489c5d560abf414e38df..182bae5334e1a5061e584212a31186dc4e7f0818 100644
--- a/tests/testKernel.c
+++ b/tests/testKernel.c
@@ -1,6 +1,7 @@
 /*******************************************************************************
  * This file is part of SWIFT.
- * Copyright (C) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ * Copyright (C) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *                    James Willis (james.s.willis@durham.ac.uk)
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
@@ -17,21 +18,71 @@
  *
  ******************************************************************************/
 
-#include "swift.h"
+#define NO__AVX__
+#include "kernel_hydro.h"
+#include "vector.h"
+
+#include <stdlib.h>
+#include <strings.h>
+
+#define numPoints (1 << 4)
 
 int main() {
 
-  const float h = const_eta_kernel;
-  const int numPoints = 30;
+  const float h = 1.2348f;
+
+  float u[numPoints] = {0.f};
+  float W[numPoints] = {0.f};
+  float dW[numPoints] = {0.f};
+
+  printf("\nSerial Output\n");
+  printf("-------------\n");
+  const float numPoints_inv = 1. / numPoints;
 
   for (int i = 0; i < numPoints; ++i) {
+    u[i] = i * 2.5f * numPoints_inv / h;
+  }
+
+  for (int i = 0; i < numPoints; ++i) {
+
+    kernel_deval(u[i], &W[i], &dW[i]);
+
+    printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i, h,
+           h * kernel_gamma, u[i] * h, W[i], dW[i]);
+  }
+
+  printf("\nVector Output for VEC_SIZE=%d\n", VEC_SIZE);
+  printf("-------------\n");
+  for (int i = 0; i < numPoints; i += VEC_SIZE) {
+
+    vector vx, vx_h;
+    vector W_vec, dW_vec;
+
+    for (int j = 0; j < VEC_SIZE; j++) {
+      vx.f[j] = (i + j) * 2.5f / numPoints;
+    }
+
+    vx_h.v = vx.v / vec_set1(h);
+
+    kernel_deval_vec(&vx_h, &W_vec, &dW_vec);
 
-    const float x = i * 3.f / numPoints;
-    float W, dW;
-    kernel_deval(x / h, &W, &dW);
+    for (int j = 0; j < VEC_SIZE; j++) {
+      printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i + j, h,
+             h * kernel_gamma, vx.f[j], W_vec.f[j], dW_vec.f[j]);
 
-    printf("h= %f H= %f x=%f W(x,h)=%f\n", h, h * kernel_gamma, x, W);
+      if (fabsf(W_vec.f[j] - W[i + j]) > 2e-7) {
+        printf("Invalid value ! scalar= %e, vector= %e\n", W[i + j],
+               W_vec.f[j]);
+        return 1;
+      }
+      if (fabsf(dW_vec.f[j] - dW[i + j]) > 2e-7) {
+        printf("Invalid value ! scalar= %e, vector= %e\n", dW[i + j],
+               dW_vec.f[j]);
+        return 1;
+      }
+    }
   }
 
+  printf("\nAll values are consistent\n");
   return 0;
 }
diff --git a/tests/testPair.c b/tests/testPair.c
index 6e46b577ca63a8d3c2edce888a7485af0949813d..f9539fc1a444828c65b39e56618eb7bb98bd67de 100644
--- a/tests/testPair.c
+++ b/tests/testPair.c
@@ -18,19 +18,12 @@
  ******************************************************************************/
 
 #include <fenv.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stdio.h>
 #include <unistd.h>
 #include "swift.h"
 
-/**
- * Returns a random number (uniformly distributed) in [a,b[
- */
-double random_uniform(double a, double b) {
-  return (rand() / (double)RAND_MAX) * (b - a) + a;
-}
-
 /* n is both particles per axis and box size:
  * particles are generated on a mesh with unit spacing
  */
@@ -93,10 +86,12 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
   cell->ti_end_min = 1;
   cell->ti_end_max = 1;
 
+  shuffle_particles(cell->parts, cell->count);
+
   cell->sorted = 0;
   cell->sort = NULL;
   cell->sortsize = 0;
-  runner_dosort(NULL, cell, 0x1FFF, 0);
+  runner_do_sort(NULL, cell, 0x1FFF, 0);
 
   return cell;
 }
@@ -111,7 +106,6 @@ void clean_up(struct cell *ci) {
  * @brief Initializes all particles field to be ready for a density calculation
  */
 void zero_particle_fields(struct cell *c) {
-
   for (size_t pid = 0; pid < c->count; pid++) {
     c->parts[pid].rho = 0.f;
     c->parts[pid].rho_dh = 0.f;
@@ -123,7 +117,6 @@ void zero_particle_fields(struct cell *c) {
  * @brief Dump all the particles to a file
  */
 void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
-
   FILE *file = fopen(fileName, "w");
 
   /* Write header */
@@ -254,7 +247,6 @@ int main(int argc, char *argv[]) {
 
   time = 0;
   for (size_t i = 0; i < runs; ++i) {
-
     /* Zero the fields */
     zero_particle_fields(ci);
     zero_particle_fields(cj);
diff --git a/tests/testParser.c b/tests/testParser.c
index 0b08d20c9e2d48de1858877cf186eaa9d0ac84c0..f1211199924df728dfe57376781dc07fe862cec7 100644
--- a/tests/testParser.c
+++ b/tests/testParser.c
@@ -17,11 +17,11 @@
  *
  ******************************************************************************/
 
-#include "parser.h"
 #include <assert.h>
-#include <string.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include "parser.h"
 
 int main(int argc, char *argv[]) {
   const char *input_file = argv[1];
@@ -50,14 +50,17 @@ int main(int argc, char *argv[]) {
       parser_get_param_double(&param_file, "Simulation:start_time");
   const int kernel = parser_get_param_int(&param_file, "kernel");
 
+  const int optional = parser_get_opt_param_int(&param_file, "optional", 1);
+
   char ic_file[PARSER_MAX_LINE_SIZE];
   parser_get_param_string(&param_file, "IO:ic_file", ic_file);
 
   /* Print the variables to check their values are correct. */
   printf(
       "no_of_threads: %d, no_of_time_steps: %d, max_h: %f, start_time: %lf, "
-      "ic_file: %s, kernel: %d\n",
-      no_of_threads, no_of_time_steps, max_h, start_time, ic_file, kernel);
+      "ic_file: %s, kernel: %d optional: %d\n",
+      no_of_threads, no_of_time_steps, max_h, start_time, ic_file, kernel,
+      optional);
 
   assert(no_of_threads == 16);
   assert(no_of_time_steps == 10);
@@ -65,6 +68,7 @@ int main(int argc, char *argv[]) {
   assert(fabs(start_time - 1.23456789) < 0.00001);
   assert(strcmp(ic_file, "ic_file.ini") == 0); /*strcmp returns 0 if correct.*/
   assert(kernel == 4);
+  assert(optional == 1);
 
   return 0;
 }
diff --git a/tests/testReading.c b/tests/testReading.c
index 33aeb5095ba499bc0fd18ba15b513e351692432e..2fa88855a70a12265f180cd97528dda855322d1d 100644
--- a/tests/testReading.c
+++ b/tests/testReading.c
@@ -17,9 +17,12 @@
  *
  ******************************************************************************/
 
-#include "swift.h"
+/* Some standard headers. */
 #include <stdlib.h>
 
+/* Includes. */
+#include "swift.h"
+
 int main() {
 
   size_t Ngas = 0, Ngpart = 0;
diff --git a/tests/testSPHStep.c b/tests/testSPHStep.c
index 223078ecb637e64d94e37cdf8c0f60a86bdd5ff7..3af0c6ad1afdeab749a378153fd1a8e016f29659 100644
--- a/tests/testSPHStep.c
+++ b/tests/testSPHStep.c
@@ -141,7 +141,7 @@ int main() {
 
   /* Compute density */
   runner_doself1_density(&r, ci);
-  runner_doghost(&r, ci);
+  runner_do_ghost(&r, ci);
 
   message("h=%f rho=%f N_ngb=%f", p->h, p->rho, p->density.wcount);
   message("c=%f", p->force.c);
diff --git a/tests/testSingle.c b/tests/testSingle.c
index eb49a570b93b14734c9e6af37d3d8a2b90d04078..d37f20908a28fb2e1098043dd7fbbcf76bd8c247 100644
--- a/tests/testSingle.c
+++ b/tests/testSingle.c
@@ -22,15 +22,15 @@
 #include "../config.h"
 
 /* Some standard headers. */
+#include <fenv.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <unistd.h>
 #include <string.h>
-#include <pthread.h>
-#include <math.h>
-#include <float.h>
-#include <limits.h>
-#include <fenv.h>
+#include <unistd.h>
 
 /* Conditional headers. */
 #ifdef HAVE_LIBZ
diff --git a/tests/testTimeIntegration.c b/tests/testTimeIntegration.c
index f3802888bccc40a424d659cde2605d12c9268e47..03893daf3530df040e5a5630bc6dc1d930ddcd1b 100644
--- a/tests/testTimeIntegration.c
+++ b/tests/testTimeIntegration.c
@@ -115,10 +115,10 @@ int main() {
     c.parts[0].a_hydro[1] = -(G * M_sun * c.parts[0].x[1] / r * r * r);
 
     /* Kick... */
-    runner_dokick(&run, &c, 0);
+    runner_do_kick(&run, &c, 0);
 
     /* Drift... */
-    runner_dodrift(&run, &c, 0);
+    runner_do_drift(&run, &c, 0);
   }
 
   /* Clean-up */
diff --git a/theory/paper_pasc/pasc_paper.tex b/theory/paper_pasc/pasc_paper.tex
index 87868a74d69023ee48d793cb2d0e29e71af777d6..a247192c9446475e86e0d24fff78a7a5fa557717 100644
--- a/theory/paper_pasc/pasc_paper.tex
+++ b/theory/paper_pasc/pasc_paper.tex
@@ -36,58 +36,48 @@
 
 \begin{document}
 
-%Conference
-\conferenceinfo{PASC '16}{June 8--10, 2016, Lausanne, Switzerland}
+\CopyrightYear{2016}
+\setcopyright{acmlicensed}
+\conferenceinfo{PASC '16,}{June 08 - 10, 2016, Lausanne, Switzerland}
+\isbn{978-1-4503-4126-4/16/06}\acmPrice{\$15.00}
+\doi{http://dx.doi.org/10.1145/2929908.2929916}
+
 
 \title{{\ttlit SWIFT}: Using task-based parallelism, fully asynchronous
 communication, and graph partition-based domain decomposition for
 strong scaling on more than 100\,000 cores.}
-% \title{{\ttlit SWIFT}: A task-based hybrid-parallel strongly scalable code for
-%   particle-based cosmological simulations}
 
-\numberofauthors{6}
+\numberofauthors{4}
   
 \author{
 \alignauthor
-       Main~Author\\
-       \affaddr{Institute}\\
-       \affaddr{Department}\\
-       \affaddr{University}\\
-       \affaddr{City Postal code, Country}\\
-       \email{\footnotesize \url{main.author@university.country}}
-% \alignauthor
-%        Matthieu~Schaller\\
-%        \affaddr{Institute for Computational Cosmology (ICC)}\\
-%        \affaddr{Department of Physics}\\
-%        \affaddr{Durham University}\\
-%        \affaddr{Durham DH1 3LE, UK}\\
-%        \email{\footnotesize \url{matthieu.schaller@durham.ac.uk}}
-% \alignauthor
-%        Pedro~Gonnet\\
-%        \affaddr{School of Engineering and Computing Sciences}\\
-%        \affaddr{Durham University}\\
-%        \affaddr{Durham DH1 3LE, UK}\\
-% \alignauthor
-%        Aidan~B.~G.~Chalk\\
-%        \affaddr{School of Engineering and Computing Sciences}\\
-%        \affaddr{Durham University}\\
-%        \affaddr{Durham DH1 3LE, UK}\\
-% \and
-% \alignauthor
-%        Peter~W.~Draper\\
-%        \affaddr{Institute for Computational Cosmology (ICC)}\\
-%        \affaddr{Department of Physics}\\
-%        \affaddr{Durham University}\\
-%        \affaddr{Durham DH1 3LE, UK}\\
-%        %% \alignauthor
-%        %% Tom Theuns\\
-%        %% \affaddr{Institute for Computational Cosmology}\\
-%        %% \affaddr{Department of Physics}\\
-%        %% \affaddr{Durham University}\\
-%        %% \affaddr{Durham DH1 3LE, UK}      
+       Matthieu~Schaller\\
+       \affaddr{Institute for Computational Cosmology (ICC)}\\
+       \affaddr{Department of Physics}\\
+       \affaddr{Durham University}\\
+       \affaddr{Durham DH1 3LE, UK}\\
+       \email{\footnotesize \url{matthieu.schaller@durham.ac.uk}}
+       \alignauthor
+      Pedro~Gonnet\\
+      \affaddr{School of Engineering and Computing Sciences}\\
+      \affaddr{Durham University}\\
+      \affaddr{Durham DH1 3LE, UK}\\ \vspace{1ex}
+      \affaddr{Google Switzerland GmbH}\\
+      \affaddr{8002 Z\"urich, Switzerland}\\
+\and
+\alignauthor
+       Aidan~B.~G.~Chalk\\
+       \affaddr{School of Engineering and Computing Sciences}\\
+       \affaddr{Durham University}\\
+       \affaddr{Durham DH1 3LE, UK}\\
+\alignauthor
+       Peter~W.~Draper\\
+       \affaddr{Institute for Computational Cosmology (ICC)}\\
+       \affaddr{Department of Physics}\\
+       \affaddr{Durham University}\\
+       \affaddr{Durham DH1 3LE, UK}\\
 }
 
-
 \date{\today}
 
 \maketitle
@@ -124,28 +114,12 @@ strong scaling on more than 100\,000 cores.}
 
   \end{itemize}
   
-  %% These three main aspects alongside improved cache-efficient
-  %% algorithms for neighbour finding allow the code to be 40x faster on
-  %% the same architecture than the standard code Gadget-2 widely used by
-  %% researchers.
-
   In order to use these approaches, the code had to be re-written from
   scratch, and the algorithms therein adapted to the task-based paradigm.
   As a result, we can show upwards of 60\% parallel efficiency for 
   moderate-sized problems when increasing the number of cores 512-fold,
   on both x86-based and Power8-based architectures.
-  
-  %% As a result, our code present excellent \emph{strong}
-  %% scaling on a variety of architectures, ranging from x86 Tier-2 systems to the
-  %% largest Tier-0 machines currently available. It displays, for instance, a
-  %% \emph{strong} scaling parallel efficiency of more than 60\% when going from
-  %% 512 to 131072 cores on a Blue Gene architecture. Similar results are obtained
-  %% on standard clusters of x86 CPUs.
-  
-  %% The task-based library, \qs, used as the backbone of the code is
-  %% itself also freely available and can be used in a wide variety of
-  %% other numerical problems.
-  
+    
 \end{abstract}
 
 
@@ -356,12 +330,20 @@ SMP~Superscalar \cite{ref:SMPSuperscalar}, OpenMP~3.0 \cite{ref:Duran2009},
 Intel's TBB \cite{ref:Reinders2007}, and, to some extent,
 Charm++ \cite{ref:Kale1993}.
 
-For convenience, and to make experimenting with different scheduling
-techniques easier, we chose to implement our own task scheduler
+Since none of these existing task-based libraries provided the flexibility
+required to experiment with different scheduling and communication
+techniques (\swift is an interdisciplinary effort between
+Computer Science and Astrophysics to study not only cosmological
+phenomena, but also novel simulation algorithms and parallel computing techniques),
+we chose to implement our own task scheduler
 in \swift, which has since been back-ported as the general-purpose
 \qs task scheduler \cite{gonnet2013quicksched}.
+In \qs and \swift, task dependencies are specified explicitly,
+as opposed to being implicitly derived from data dependencies,
+allowing us to more easily build complex task hierarchies.
 This also allowed us to extend the scheduler with the concept of
-task conflicts.
+task conflicts and integrate the asynchronous communication
+scheme described further on.
 
 Despite its advantages, and the variety of implementations,
 task-based parallelism is rarely used in
@@ -374,9 +356,19 @@ which is usually not an option for large and complex codebases.
 
 Since we were re-implementing \swift from scratch, this was not an issue.
 The tree-based neighbour-finding described above was replaced with a more
-task-friendly approach as described in \cite{ref:Gonnet2015}.
-Particle interactions are computed within, and between pairs, of
-hierarchical {\em cells} containing one or more particles.
+task-friendly approach as described in \cite{ref:Gonnet2015}, in which
+the domain is first decomposed into a grid of {\em cells} of edge length
+larger than or equal to the largest particle radius.
+An initial set of interaction tasks is then defined over all cells and
+pairs of neighbouring cells, such that if two particles are close enough to interact,
+they are either in the same cell or they span a pair of neighbouring cells.
+These initial interaction tasks are then refined by recursively
+splitting cells that contain more than a certain number of particles
+and replacing tasks that span a pair of split cells with tasks
+spanning the neighbouring sub-cells.
+The resulting refined set of tasks contains all the cells and pairs of cells
+over which particle interactions must be computed.
+
 The dependencies between the tasks are set following
 equations \eqn{rho}, \eqn{dvdt}, and \eqn{dudt}, i.e.~such that for any cell,
 all the tasks computing the particle densities therein must have
@@ -416,7 +408,10 @@ cores of a shared-memory machine \cite{ref:Gonnet2015}.
   and purple. Although the data for the yellow cell resides on
   Node~2, it is required for some tasks on Node~1, and thus needs
   to be copied over  during
-  the computation using {\tt send}/{\tt recv} tasks (diamond-shaped).}
+  the computation using {\tt send}/{\tt recv} tasks
+  (diamond-shaped). \newline 
+  Figure adapted from \cite{ref:Gonnet2015}.
+  }
 \label{tasks}
 \end{figure}  
 
@@ -871,7 +866,6 @@ Union's ERC Grant agreements 267291 ``Cosmiway'', and by {\sc intel}
 through establishment of the ICC as an {\sc intel} parallel computing
 centre (IPCC).
 
-\nocite{*}
 \bibliographystyle{abbrv}
 \bibliography{biblio}
 
diff --git a/theory/paper_pasc/run.sh b/theory/paper_pasc/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b24e1b76a60858520ec2bc03b926ced642dc29e0
--- /dev/null
+++ b/theory/paper_pasc/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+pdflatex pasc_paper.tex
+bibtex pasc_paper.aux
+pdflatex pasc_paper.tex
+pdflatex pasc_paper.tex