diff --git a/configure.ac b/configure.ac
index 9f50ba4563cad932314a9f5e7e0bd8cf13b88b7c..ab9bdfad58711eef1dabcc27c8371ccc38bdc7c0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -407,6 +407,24 @@ if test "$enable_memuse_reports" = "yes"; then
    AC_DEFINE([SWIFT_MEMUSE_REPORTS],1,[Enable memory usage reports])
 fi
 
+# The system memory report depends on the presence of /proc/self/statm, i.e.
+# a linux OS, check for that.
+if test -f "/proc/self/statm"; then
+   AC_DEFINE([SWIFT_MEMUSE_STATM],1,[Have /proc/self/statm capability])
+fi
+
+# Check if we want to make the dumper thread active.
+AC_ARG_ENABLE([dumper],
+   [AS_HELP_STRING([--enable-dumper],
+     [Dump active tasks and memory use (if configured)@<:@yes/no@:>@]
+   )],
+   [enable_dumper="$enableval"],
+   [enable_dumper="no"]
+)
+if test "$enable_dumper" = "yes"; then
+   AC_DEFINE([SWIFT_DUMPER_THREAD],1,[Enable dumper thread])
+fi
+
 # See if we want mpi reporting.
 AC_ARG_ENABLE([mpiuse-reports],
    [AS_HELP_STRING([--enable-mpiuse-reports],
@@ -1208,6 +1226,17 @@ fi
 AC_SUBST([VELOCIRAPTOR_LIBS])
 AM_CONDITIONAL([HAVEVELOCIRAPTOR],[test -n "$VELOCIRAPTOR_LIBS"])
 
+# Now that we found VELOCIraptor, let's check how it was compiled.
+if test "$have_velociraptor" == "yes"; then
+    AC_CHECK_LIB(
+       [velociraptor],
+       [VR_NOMASS],
+       [AC_DEFINE([HAVE_VELOCIRAPTOR_WITH_NOMASS],1,[The VELOCIraptor library has been compiled with the NOMASS option. Only useful if running a uniform box.])],
+       [AC_MSG_RESULT(VELOCIraptor not compiled to so as to *not* store masses per particle.)],
+       [$VELOCIRAPTOR_LIBS $HDF5_LDFLAGS $HDF5_LIBS $GSL_LIBS]
+    )
+fi
+
 # Check for dummy VELOCIraptor.
 AC_ARG_ENABLE([dummy-velociraptor],
     [AS_HELP_STRING([--enable-dummy-velociraptor],
@@ -1899,7 +1928,7 @@ esac
 # Feedback model
 AC_ARG_WITH([feedback],
    [AS_HELP_STRING([--with-feedback=<model>],
-      [Feedback model to use @<:@none, EAGLE, debug default: none@:>@]
+      [Feedback model to use @<:@none, EAGLE, GEAR default: none@:>@]
    )],
    [with_feedback="$withval"],
    [with_feedback="none"]
@@ -1932,7 +1961,7 @@ esac
 # Black hole model.
 AC_ARG_WITH([black-holes],
    [AS_HELP_STRING([--with-black-holes=<model>],
-      [Black holes model to use @<:@none, default: none@:>@]
+      [Black holes model to use @<:@none, EAGLE default: none@:>@]
    )],
    [with_black_holes="$withval"],
    [with_black_holes="none"]
@@ -1961,14 +1990,14 @@ esac
 # Task order
 AC_ARG_WITH([task-order],
 [AS_HELP_STRING([--with-task-order=<model>],
-[Task order to use @<:@none, EAGLE, GEAR default: none@:>@]
+[Task order to use @<:@default, EAGLE, GEAR default: default@:>@]
 )],
 [with_task_order="$withval"],
-[with_task_order="none"]
+[with_task_order="default"]
 )
 
 if test "$with_subgrid" != "none"; then
-if test "$with_task_order" != "none"; then
+if test "$with_task_order" != "default"; then
 AC_MSG_ERROR([Cannot provide with-subgrid and with-task-order together])
 else
 with_task_order="$with_subgrid_task_order"
@@ -1979,8 +2008,8 @@ case "$with_task_order" in
 EAGLE)
 AC_DEFINE([TASK_ORDER_EAGLE], [1], [EAGLE task order])
 ;;
-none)
-AC_DEFINE([TASK_ORDER_NONE], [1], [Default task order])
+default)
+AC_DEFINE([TASK_ORDER_NONE], [1], [Default (i.e. EAGLE/OWLS) task order])
 ;;
 GEAR)
 AC_DEFINE([TASK_ORDER_GEAR], [1], [GEAR task order])
diff --git a/doc/RTD/source/EquationOfState/index.rst b/doc/RTD/source/EquationOfState/index.rst
index 11e6069130a84ba54dec01e9892464574d9c2c6b..c4468a837efaddebc6a44f8e4e72c8fe056dfae7 100644
--- a/doc/RTD/source/EquationOfState/index.rst
+++ b/doc/RTD/source/EquationOfState/index.rst
@@ -7,18 +7,24 @@
 Equations of State
 ==================
 
-Currently (if the documentation was well updated), we have two different gas
-equations of state (EoS) implemented: ideal and isothermal; as well as a variety  
-of EoS for "planetary" materials. 
-The EoS describe the relations between our main thermodynamical variables: 
-the internal energy (\\(u\\)), the density (\\(\\rho\\)), the entropy (\\(A\\)) 
-and the pressure (\\(P\\)).
+Currently, SWIFT offers two different gas equations of state (EoS)
+implemented: ``ideal`` and ``isothermal``; as well as a variety of EoS for
+"planetary" materials.  The EoS describe the relations between our
+main thermodynamical variables: the internal energy per unit mass
+(\\(u\\)), the mass density (\\(\\rho\\)), the entropy (\\(A\\)) and
+the pressure (\\(P\\)).
 
 Gas EoS
 -------
 
-In the following section, the variables not yet defined are: \\(\\gamma\\) for
-the adiabatic index and \\( c_s \\) for the speed of sound.
+We write the adiabatic index as \\(\\gamma \\) and \\( c_s \\) denotes
+the speed of sound. The adiabatic index can be changed at configure
+time by choosing one of the allowed values of the option
+``--with-adiabatic-index``. The default value is \\(\\gamma = 5/3 \\).
+
+The tables below give the expression for the thermodynamic quantities
+on each row entry as a function of the gas density and the
+thermodynamical quantity given in the header of each column.
 
 .. csv-table:: Ideal Gas
    :header: "Variable", "A", "u", "P"
@@ -38,7 +44,11 @@ the adiabatic index and \\( c_s \\) for the speed of sound.
    "P", "", "\\(\\left( \\gamma - 1\\right) u \\rho \\)", ""
    "\\( c_s\\)", "", "\\(\\sqrt{ u \\gamma \\left( \\gamma - 1 \\right) } \\)", ""
 
-
+Note that when running with an isothermal equation of state, the value
+of the tracked thermodynamic variable (e.g. the entropy in a
+density-entropy scheme or the internal enegy in a density-energy SPH
+formulation) written to the snapshots is meaningless. The pressure,
+however, is always correct in all scheme.
 
 Planetary EoS
 -------------
diff --git a/doc/RTD/source/ParameterFiles/parameter_description.rst b/doc/RTD/source/ParameterFiles/parameter_description.rst
index 726631b7e0de1e2ed134fbba456f0a7f3288ea38..4abe0bf913e7721ba6d2436f47bd0f56941db55f 100644
--- a/doc/RTD/source/ParameterFiles/parameter_description.rst
+++ b/doc/RTD/source/ParameterFiles/parameter_description.rst
@@ -986,26 +986,34 @@ should contain. The type of partitioning attempted is controlled by the::
   DomainDecomposition:
     initial_type:
 
-parameter. Which can have the values *memory*, *region*, *grid* or
+parameter. Which can have the values *memory*, *edgememory*, *region*, *grid* or
 *vectorized*:
 
+    * *edgememory*
+
+    This is the default if METIS or ParMETIS is available. It performs a
+    partition based on the memory use of all the particles in each cell.
+    The total memory per cell is used to weight the cell vertex and all the
+    associated edges. This attempts to equalize the memory used by all the
+    ranks but with some consideration given to the need to not cut dense
+    regions (by also minimizing the edge cut). How successful this
+    attempt is depends on the granularity of cells and particles and the
+    number of ranks, clearly if most of the particles are in one cell, or a
+    small region of the volume, balance is impossible or difficult. Having
+    more top-level cells makes it easier to calculate a good distribution
+    (but this comes at the cost of greater overheads).
 
     * *memory*
 
-    This is the default if METIS or ParMETIS is available. It performs a
-    partition based on the memory use of all the particles in each cell,
-    attempting to equalize the memory used by all the ranks.
-    How successful this attempt is depends on the granularity of cells and particles
-    and the number of ranks, clearly if most of the particles are in one cell,
-    or a small region of the volume, balance is impossible or
-    difficult. Having more top-level cells makes it easier to calculate a
-    good distribution (but this comes at the cost of greater overheads).
+    This is like *edgememory*, but doesn't include any edge weights, it should
+    balance the particle memory use per rank more exactly (but note effects
+    like the numbers of cells per rank will also have an effect, as that
+    changes the need for foreign cells).
 
     * *region*
 
     The one other METIS/ParMETIS option is "region". This attempts to assign equal
-    numbers of cells to each rank, with the surface area of the regions minimised
-    (so we get blobs, rather than rectangular volumes of cells).
+    numbers of cells to each rank, with the surface area of the regions minimised.
 
 If ParMETIS and METIS are not available two other options are possible, but
 will give a poorer partition:
diff --git a/examples/GravityTests/DiscPatch/HydroStatic/plotSolution.py b/examples/GravityTests/DiscPatch/HydroStatic/plotSolution.py
index 681f7d8ab3f2320b5de75e688edcb92efef9d883..b00b6176554f2d5d2aa5e87cceba90c02f2a7e54 100644
--- a/examples/GravityTests/DiscPatch/HydroStatic/plotSolution.py
+++ b/examples/GravityTests/DiscPatch/HydroStatic/plotSolution.py
@@ -62,11 +62,9 @@ def get_analytic_pressure(x):
 def get_data(name):
   file = h5py.File(name, "r")
   coords = np.array(file["/PartType0/Coordinates"])
-  rho = np.array(file["/PartType0/Density"])
-  u = np.array(file["/PartType0/InternalEnergy"])
+  rho = np.array(file["/PartType0/Densities"])
   v = np.array(file["/PartType0/Velocities"])
-
-  P = (gamma - 1.) * rho * u
+  P = np.array(file["/PartType0/Pressures"])
 
   vtot = np.sqrt( v[:,0]**2 + v[:,1]**2 + v[:,2]**2 )
 
@@ -79,7 +77,7 @@ for f in sorted(glob.glob("Disc-Patch_*.hdf5")):
   if num < start or num > stop:
     continue
 
-  print "processing", f, "..."
+  print("processing", f, "...")
 
   xrange = np.linspace(0., 2. * x_disc, 1000)
   time, x, rho, P, v = get_data(f)
diff --git a/examples/HydroTests/GreshoVortex_2D/plotSolution.py b/examples/HydroTests/GreshoVortex_2D/plotSolution.py
index 2d4697b6ffaac0639da67ee90d824c75791ea573..fd63e22ba5c995b4ec3ab9b50b9b6f69750a08b0 100644
--- a/examples/HydroTests/GreshoVortex_2D/plotSolution.py
+++ b/examples/HydroTests/GreshoVortex_2D/plotSolution.py
@@ -1,61 +1,41 @@
 ###############################################################################
- # This file is part of SWIFT.
- # Copyright (c) 2016  Matthieu Schaller (matthieu.schaller@durham.ac.uk)
- # 
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published
- # by the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- # 
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- # GNU General Public License for more details.
- # 
- # You should have received a copy of the GNU Lesser General Public License
- # along with this program.  If not, see <http://www.gnu.org/licenses/>.
- # 
- ##############################################################################
+# This file is part of SWIFT.
+# Copyright (c) 2016  Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+##############################################################################
 
 # Computes the analytical solution of the Gresho-Chan vortex and plots the SPH answer
 
 # Parameters
-gas_gamma = 5./3.     # Gas adiabatic index
-rho0 = 1          # Gas density
-P0 = 0.           # Constant additional pressure (should have no impact on the dynamics)
+gas_gamma = 5.0 / 3.0  # Gas adiabatic index
+rho0 = 1  # Gas density
+P0 = 0.0  # Constant additional pressure (should have no impact on the dynamics)
 
 # ---------------------------------------------------------------
 # Don't touch anything after this.
 # ---------------------------------------------------------------
 
 import matplotlib
+
 matplotlib.use("Agg")
 from pylab import *
 from scipy import stats
 import h5py
 
-# Plot parameters
-params = {'axes.labelsize': 10,
-'axes.titlesize': 10,
-'font.size': 12,
-'legend.fontsize': 12,
-'xtick.labelsize': 10,
-'ytick.labelsize': 10,
-'text.usetex': True,
- 'figure.figsize' : (9.90,6.45),
-'figure.subplot.left'    : 0.045,
-'figure.subplot.right'   : 0.99,
-'figure.subplot.bottom'  : 0.05,
-'figure.subplot.top'     : 0.99,
-'figure.subplot.wspace'  : 0.15,
-'figure.subplot.hspace'  : 0.12,
-'lines.markersize' : 6,
-'lines.linewidth' : 3.,
-'text.latex.unicode': True
-}
-rcParams.update(params)
-rc('font',**{'family':'sans-serif','sans-serif':['Times']})
-
+style.use("../../../tools/stylesheets/mnras.mplstyle")
 
 snap = int(sys.argv[1])
 
@@ -69,21 +49,27 @@ solution_v_r = zeros(N)
 
 for i in range(N):
     if solution_r[i] < 0.2:
-        solution_P[i] = P0 + 5. + 12.5*solution_r[i]**2
-        solution_v_phi[i] = 5.*solution_r[i]
+        solution_P[i] = P0 + 5.0 + 12.5 * solution_r[i] ** 2
+        solution_v_phi[i] = 5.0 * solution_r[i]
     elif solution_r[i] < 0.4:
-        solution_P[i] = P0 + 9. + 12.5*solution_r[i]**2 - 20.*solution_r[i] + 4.*log(solution_r[i]/0.2)
-        solution_v_phi[i] = 2. -5.*solution_r[i]
+        solution_P[i] = (
+            P0
+            + 9.0
+            + 12.5 * solution_r[i] ** 2
+            - 20.0 * solution_r[i]
+            + 4.0 * log(solution_r[i] / 0.2)
+        )
+        solution_v_phi[i] = 2.0 - 5.0 * solution_r[i]
     else:
-        solution_P[i] = P0 + 3. + 4.*log(2.)
-        solution_v_phi[i] = 0.
+        solution_P[i] = P0 + 3.0 + 4.0 * log(2.0)
+        solution_v_phi[i] = 0.0
 
 solution_rho = ones(N) * rho0
-solution_s = solution_P / solution_rho**gas_gamma
-solution_u = solution_P /((gas_gamma - 1.)*solution_rho)
+solution_s = solution_P / solution_rho ** gas_gamma
+solution_u = solution_P / ((gas_gamma - 1.0) * solution_rho)
 
 # Read the simulation data
-sim = h5py.File("gresho_%04d.hdf5"%snap, "r")
+sim = h5py.File("gresho_%04d.hdf5" % snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
@@ -92,133 +78,153 @@ neighbours = sim["/HydroScheme"].attrs["Kernel target N_ngb"]
 eta = sim["/HydroScheme"].attrs["Kernel eta"]
 git = sim["Code"].attrs["Git Revision"]
 
-pos = sim["/PartType0/Coordinates"][:,:]
-x = pos[:,0] - boxSize / 2
-y = pos[:,1] - boxSize / 2
-vel = sim["/PartType0/Velocities"][:,:]
-r = sqrt(x**2 + y**2)
-v_r = (x * vel[:,0] + y * vel[:,1]) / r
-v_phi = (-y * vel[:,0] + x * vel[:,1]) / r
-v_norm = sqrt(vel[:,0]**2 + vel[:,1]**2)
+pos = sim["/PartType0/Coordinates"][:, :]
+x = pos[:, 0] - boxSize / 2
+y = pos[:, 1] - boxSize / 2
+vel = sim["/PartType0/Velocities"][:, :]
+r = sqrt(x ** 2 + y ** 2)
+v_r = (x * vel[:, 0] + y * vel[:, 1]) / r
+v_phi = (-y * vel[:, 0] + x * vel[:, 1]) / r
+v_norm = sqrt(vel[:, 0] ** 2 + vel[:, 1] ** 2)
 rho = sim["/PartType0/Densities"][:]
 u = sim["/PartType0/InternalEnergies"][:]
 S = sim["/PartType0/Entropies"][:]
 P = sim["/PartType0/Pressures"][:]
 
 # Bin te data
-r_bin_edge = np.arange(0., 1., 0.02)
-r_bin = 0.5*(r_bin_edge[1:] + r_bin_edge[:-1])
-rho_bin,_,_ = stats.binned_statistic(r, rho, statistic='mean', bins=r_bin_edge)
-v_bin,_,_ = stats.binned_statistic(r, v_phi, statistic='mean', bins=r_bin_edge)
-P_bin,_,_ = stats.binned_statistic(r, P, statistic='mean', bins=r_bin_edge)
-S_bin,_,_ = stats.binned_statistic(r, S, statistic='mean', bins=r_bin_edge)
-u_bin,_,_ = stats.binned_statistic(r, u, statistic='mean', bins=r_bin_edge)
-rho2_bin,_,_ = stats.binned_statistic(r, rho**2, statistic='mean', bins=r_bin_edge)
-v2_bin,_,_ = stats.binned_statistic(r, v_phi**2, statistic='mean', bins=r_bin_edge)
-P2_bin,_,_ = stats.binned_statistic(r, P**2, statistic='mean', bins=r_bin_edge)
-S2_bin,_,_ = stats.binned_statistic(r, S**2, statistic='mean', bins=r_bin_edge)
-u2_bin,_,_ = stats.binned_statistic(r, u**2, statistic='mean', bins=r_bin_edge)
-rho_sigma_bin = np.sqrt(rho2_bin - rho_bin**2)
-v_sigma_bin = np.sqrt(v2_bin - v_bin**2)
-P_sigma_bin = np.sqrt(P2_bin - P_bin**2)
-S_sigma_bin = np.sqrt(S2_bin - S_bin**2)
-u_sigma_bin = np.sqrt(u2_bin - u_bin**2)
+r_bin_edge = np.arange(0.0, 1.0, 0.02)
+r_bin = 0.5 * (r_bin_edge[1:] + r_bin_edge[:-1])
+rho_bin, _, _ = stats.binned_statistic(r, rho, statistic="mean", bins=r_bin_edge)
+v_bin, _, _ = stats.binned_statistic(r, v_phi, statistic="mean", bins=r_bin_edge)
+P_bin, _, _ = stats.binned_statistic(r, P, statistic="mean", bins=r_bin_edge)
+S_bin, _, _ = stats.binned_statistic(r, S, statistic="mean", bins=r_bin_edge)
+u_bin, _, _ = stats.binned_statistic(r, u, statistic="mean", bins=r_bin_edge)
+rho2_bin, _, _ = stats.binned_statistic(r, rho ** 2, statistic="mean", bins=r_bin_edge)
+v2_bin, _, _ = stats.binned_statistic(r, v_phi ** 2, statistic="mean", bins=r_bin_edge)
+P2_bin, _, _ = stats.binned_statistic(r, P ** 2, statistic="mean", bins=r_bin_edge)
+S2_bin, _, _ = stats.binned_statistic(r, S ** 2, statistic="mean", bins=r_bin_edge)
+u2_bin, _, _ = stats.binned_statistic(r, u ** 2, statistic="mean", bins=r_bin_edge)
+rho_sigma_bin = np.sqrt(rho2_bin - rho_bin ** 2)
+v_sigma_bin = np.sqrt(v2_bin - v_bin ** 2)
+P_sigma_bin = np.sqrt(P2_bin - P_bin ** 2)
+S_sigma_bin = np.sqrt(S2_bin - S_bin ** 2)
+u_sigma_bin = np.sqrt(u2_bin - u_bin ** 2)
 
 
 # Plot the interesting quantities
-figure()
+figure(figsize=(7, 7 / 1.6))
+
+line_color = "C4"
+binned_color = "C2"
+binned_marker_size = 4
 
+scatter_props = dict(
+    marker=".",
+    ms=1,
+    markeredgecolor="none",
+    alpha=0.5,
+    zorder=-1,
+    rasterized=True,
+    linestyle="none",
+)
+
+errorbar_props = dict(color=binned_color, ms=binned_marker_size, fmt=".", lw=1.2)
 
 # Azimuthal velocity profile -----------------------------
 subplot(231)
 
-plot(r, v_phi, '.', color='r', ms=0.5)
-plot(solution_r, solution_v_phi, '--', color='k', alpha=0.8, lw=1.2)
-errorbar(r_bin, v_bin, yerr=v_sigma_bin, fmt='.', ms=8.0, color='b', lw=1.2)
-plot([0.2, 0.2], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-plot([0.4, 0.4], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-xlabel("${\\rm{Radius}}~r$", labelpad=0)
-ylabel("${\\rm{Azimuthal~velocity}}~v_\\phi$", labelpad=0)
-xlim(0,R_max)
+plot(r, v_phi, **scatter_props)
+plot(solution_r, solution_v_phi, "--", color=line_color, alpha=0.8, lw=1.2)
+errorbar(r_bin, v_bin, yerr=v_sigma_bin, **errorbar_props)
+plot([0.2, 0.2], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+plot([0.4, 0.4], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+xlabel("Radius $r$")
+ylabel("Azimuthal velocity $v_\\phi$")
+xlim(0, R_max)
 ylim(-0.1, 1.2)
 
 # Radial density profile --------------------------------
 subplot(232)
 
-plot(r, rho, '.', color='r', ms=0.5)
-plot(solution_r, solution_rho, '--', color='k', alpha=0.8, lw=1.2)
-errorbar(r_bin, rho_bin, yerr=rho_sigma_bin, fmt='.', ms=8.0, color='b', lw=1.2)
-plot([0.2, 0.2], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-plot([0.4, 0.4], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-xlabel("${\\rm{Radius}}~r$", labelpad=0)
-ylabel("${\\rm{Density}}~\\rho$", labelpad=0)
-xlim(0,R_max)
-ylim(rho0-0.3, rho0 + 0.3)
-#yticks([-0.2, -0.1, 0., 0.1, 0.2])
+plot(r, rho, **scatter_props)
+plot(solution_r, solution_rho, "--", color=line_color, alpha=0.8, lw=1.2)
+errorbar(r_bin, rho_bin, yerr=rho_sigma_bin, **errorbar_props)
+plot([0.2, 0.2], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+plot([0.4, 0.4], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+xlabel("Radius $r$")
+ylabel("Density $\\rho$")
+xlim(0, R_max)
+ylim(rho0 - 0.3, rho0 + 0.3)
+# yticks([-0.2, -0.1, 0., 0.1, 0.2])
 
 # Radial pressure profile --------------------------------
 subplot(233)
 
-plot(r, P, '.', color='r', ms=0.5)
-plot(solution_r, solution_P, '--', color='k', alpha=0.8, lw=1.2)
-errorbar(r_bin, P_bin, yerr=P_sigma_bin, fmt='.', ms=8.0, color='b', lw=1.2)
-plot([0.2, 0.2], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-plot([0.4, 0.4], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-xlabel("${\\rm{Radius}}~r$", labelpad=0)
-ylabel("${\\rm{Pressure}}~P$", labelpad=0)
+plot(r, P, **scatter_props)
+plot(solution_r, solution_P, "--", color=line_color, alpha=0.8, lw=1.2)
+errorbar(r_bin, P_bin, yerr=P_sigma_bin, **errorbar_props)
+plot([0.2, 0.2], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+plot([0.4, 0.4], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+xlabel("Radius $r$")
+ylabel("Pressure $P$")
 xlim(0, R_max)
 ylim(4.9 + P0, P0 + 6.1)
 
 # Internal energy profile --------------------------------
 subplot(234)
 
-plot(r, u, '.', color='r', ms=0.5)
-plot(solution_r, solution_u, '--', color='k', alpha=0.8, lw=1.2)
-errorbar(r_bin, u_bin, yerr=u_sigma_bin, fmt='.', ms=8.0, color='b', lw=1.2)
-plot([0.2, 0.2], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-plot([0.4, 0.4], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-xlabel("${\\rm{Radius}}~r$", labelpad=0)
-ylabel("${\\rm{Internal~Energy}}~u$", labelpad=0)
-xlim(0,R_max)
+plot(r, u, **scatter_props)
+plot(solution_r, solution_u, "--", color=line_color, alpha=0.8, lw=1.2)
+errorbar(r_bin, u_bin, yerr=u_sigma_bin, **errorbar_props)
+plot([0.2, 0.2], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+plot([0.4, 0.4], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+xlabel("$Radius $r$")
+ylabel("Internal Energy $u$")
+xlim(0, R_max)
 ylim(7.3, 9.1)
 
 
 # Radial entropy profile --------------------------------
 subplot(235)
 
-plot(r, S, '.', color='r', ms=0.5)
-plot(solution_r, solution_s, '--', color='k', alpha=0.8, lw=1.2)
-errorbar(r_bin, S_bin, yerr=S_sigma_bin, fmt='.', ms=8.0, color='b', lw=1.2)
-plot([0.2, 0.2], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-plot([0.4, 0.4], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-xlabel("${\\rm{Radius}}~r$", labelpad=0)
-ylabel("${\\rm{Entropy}}~S$", labelpad=0)
+plot(r, S, **scatter_props)
+plot(solution_r, solution_s, "--", color=line_color, alpha=0.8, lw=1.2)
+errorbar(r_bin, S_bin, yerr=S_sigma_bin, **errorbar_props)
+plot([0.2, 0.2], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+plot([0.4, 0.4], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+xlabel("Radius $r$")
+ylabel("Entropy $S$")
 xlim(0, R_max)
 ylim(4.9 + P0, P0 + 6.1)
 
-# Image --------------------------------------------------
-#subplot(234)
-#scatter(pos[:,0], pos[:,1], c=v_norm, cmap="PuBu", edgecolors='face', s=4, vmin=0, vmax=1)
-#text(0.95, 0.95, "$|v|$", ha="right", va="top")
-#xlim(0,1)
-#ylim(0,1)
-#xlabel("$x$", labelpad=0)
-#ylabel("$y$", labelpad=0)
-
 # Information -------------------------------------
 subplot(236, frameon=False)
 
-text(-0.49, 0.9, "Gresho-Chan vortex with  $\\gamma=%.3f$ at $t=%.2f$"%(gas_gamma,time), fontsize=10)
-text(-0.49, 0.8, "Background $\\rho_0=%.3f$"%rho0, fontsize=10)
-text(-0.49, 0.7, "Background $P_0=%.3f$"%P0, fontsize=10)
-plot([-0.49, 0.1], [0.62, 0.62], 'k-', lw=1)
-text(-0.49, 0.5, "$\\textsc{Swift}$ %s"%git, fontsize=10)
-text(-0.49, 0.4, scheme, fontsize=10)
-text(-0.49, 0.3, kernel, fontsize=10)
-text(-0.49, 0.2, "$%.2f$ neighbours ($\\eta=%.3f$)"%(neighbours, eta), fontsize=10)
+text_fontsize = 5
+
+text(
+    -0.49,
+    0.9,
+    "Gresho-Chan vortex (2D) with $\\gamma=%.3f$ at $t=%.2f$" % (gas_gamma, time),
+    fontsize=text_fontsize,
+)
+text(-0.49, 0.8, "Background $\\rho_0=%.3f$" % rho0, fontsize=text_fontsize)
+text(-0.49, 0.7, "Background $P_0=%.3f$" % P0, fontsize=text_fontsize)
+plot([-0.49, 0.1], [0.62, 0.62], "k-", lw=1)
+text(-0.49, 0.5, "SWIFT %s" % git.decode("utf-8"), fontsize=text_fontsize)
+text(-0.49, 0.4, scheme.decode("utf-8"), fontsize=text_fontsize)
+text(-0.49, 0.3, kernel.decode("utf-8"), fontsize=text_fontsize)
+text(
+    -0.49,
+    0.2,
+    "$%.2f$ neighbours ($\\eta=%.3f$)" % (neighbours, eta),
+    fontsize=text_fontsize,
+)
 xlim(-0.5, 0.5)
 ylim(0, 1)
 xticks([])
 yticks([])
 
-savefig("GreshoVortex.png", dpi=200)
+tight_layout()
+
+savefig("GreshoVortex.png")
diff --git a/examples/HydroTests/GreshoVortex_3D/plotSolution.py b/examples/HydroTests/GreshoVortex_3D/plotSolution.py
index 20beab7514759c764f5ca7c379183506b764a819..d3be5a404f7011d3dea24c992ef4ca93b4c4988c 100644
--- a/examples/HydroTests/GreshoVortex_3D/plotSolution.py
+++ b/examples/HydroTests/GreshoVortex_3D/plotSolution.py
@@ -22,43 +22,23 @@
 # answer
 
 # Parameters
-gas_gamma = 5./3. # Gas adiabatic index
-rho0 = 1          # Gas density
-P0 = 0.           # Constant additional pressure (should have no impact on the
-                  # dynamics)
+gas_gamma = 5.0 / 3.0  # Gas adiabatic index
+rho0 = 1  # Gas density
+P0 = 0.0  # Constant additional pressure (should have no impact on the
+# dynamics)
 
 # ---------------------------------------------------------------
 # Don't touch anything after this.
 # ---------------------------------------------------------------
 
 import matplotlib
+
 matplotlib.use("Agg")
 from pylab import *
 from scipy import stats
 import h5py
 
-# Plot parameters
-params = {'axes.labelsize': 10,
-'axes.titlesize': 10,
-'font.size': 12,
-'legend.fontsize': 12,
-'xtick.labelsize': 10,
-'ytick.labelsize': 10,
-'text.usetex': True,
- 'figure.figsize' : (9.90,6.45),
-'figure.subplot.left'    : 0.045,
-'figure.subplot.right'   : 0.99,
-'figure.subplot.bottom'  : 0.05,
-'figure.subplot.top'     : 0.99,
-'figure.subplot.wspace'  : 0.15,
-'figure.subplot.hspace'  : 0.12,
-'lines.markersize' : 6,
-'lines.linewidth' : 3.,
-'text.latex.unicode': True
-}
-rcParams.update(params)
-rc('font',**{'family':'sans-serif','sans-serif':['Times']})
-
+style.use("../../../tools/stylesheets/mnras.mplstyle")
 
 snap = int(sys.argv[1])
 
@@ -72,21 +52,27 @@ solution_v_r = zeros(N)
 
 for i in range(N):
     if solution_r[i] < 0.2:
-        solution_P[i] = P0 + 5. + 12.5*solution_r[i]**2
-        solution_v_phi[i] = 5.*solution_r[i]
+        solution_P[i] = P0 + 5.0 + 12.5 * solution_r[i] ** 2
+        solution_v_phi[i] = 5.0 * solution_r[i]
     elif solution_r[i] < 0.4:
-        solution_P[i] = P0 + 9. + 12.5*solution_r[i]**2 - 20.*solution_r[i] + 4.*log(solution_r[i]/0.2)
-        solution_v_phi[i] = 2. -5.*solution_r[i]
+        solution_P[i] = (
+            P0
+            + 9.0
+            + 12.5 * solution_r[i] ** 2
+            - 20.0 * solution_r[i]
+            + 4.0 * log(solution_r[i] / 0.2)
+        )
+        solution_v_phi[i] = 2.0 - 5.0 * solution_r[i]
     else:
-        solution_P[i] = P0 + 3. + 4.*log(2.)
-        solution_v_phi[i] = 0.
+        solution_P[i] = P0 + 3.0 + 4.0 * log(2.0)
+        solution_v_phi[i] = 0.0
 
 solution_rho = ones(N) * rho0
-solution_s = solution_P / solution_rho**gas_gamma
-solution_u = solution_P /((gas_gamma - 1.)*solution_rho)
+solution_s = solution_P / solution_rho ** gas_gamma
+solution_u = solution_P / ((gas_gamma - 1.0) * solution_rho)
 
 # Read the simulation data
-sim = h5py.File("gresho_%04d.hdf5"%snap, "r")
+sim = h5py.File("gresho_%04d.hdf5" % snap, "r")
 boxSize = sim["/Header"].attrs["BoxSize"][0]
 time = sim["/Header"].attrs["Time"][0]
 scheme = sim["/HydroScheme"].attrs["Scheme"]
@@ -95,14 +81,14 @@ neighbours = sim["/HydroScheme"].attrs["Kernel target N_ngb"]
 eta = sim["/HydroScheme"].attrs["Kernel eta"]
 git = sim["Code"].attrs["Git Revision"]
 
-pos = sim["/PartType0/Coordinates"][:,:]
-x = pos[:,0] - boxSize / 2
-y = pos[:,1] - boxSize / 2
-vel = sim["/PartType0/Velocities"][:,:]
-r = sqrt(x**2 + y**2)
-v_r = (x * vel[:,0] + y * vel[:,1]) / r
-v_phi = (-y * vel[:,0] + x * vel[:,1]) / r
-v_norm = sqrt(vel[:,0]**2 + vel[:,1]**2)
+pos = sim["/PartType0/Coordinates"][:, :]
+x = pos[:, 0] - boxSize / 2
+y = pos[:, 1] - boxSize / 2
+vel = sim["/PartType0/Velocities"][:, :]
+r = sqrt(x ** 2 + y ** 2)
+v_r = (x * vel[:, 0] + y * vel[:, 1]) / r
+v_phi = (-y * vel[:, 0] + x * vel[:, 1]) / r
+v_norm = sqrt(vel[:, 0] ** 2 + vel[:, 1] ** 2)
 rho = sim["/PartType0/Densities"][:]
 u = sim["/PartType0/InternalEnergies"][:]
 S = sim["/PartType0/Entropies"][:]
@@ -121,144 +107,183 @@ except:
     plot_viscosity = False
 
 # Bin te data
-r_bin_edge = np.arange(0., 1., 0.02)
-r_bin = 0.5*(r_bin_edge[1:] + r_bin_edge[:-1])
-rho_bin,_,_ = stats.binned_statistic(r, rho, statistic='mean', bins=r_bin_edge)
-v_bin,_,_ = stats.binned_statistic(r, v_phi, statistic='mean', bins=r_bin_edge)
-P_bin,_,_ = stats.binned_statistic(r, P, statistic='mean', bins=r_bin_edge)
-S_bin,_,_ = stats.binned_statistic(r, S, statistic='mean', bins=r_bin_edge)
-u_bin,_,_ = stats.binned_statistic(r, u, statistic='mean', bins=r_bin_edge)
-rho2_bin,_,_ = stats.binned_statistic(r, rho**2, statistic='mean', bins=r_bin_edge)
-v2_bin,_,_ = stats.binned_statistic(r, v_phi**2, statistic='mean', bins=r_bin_edge)
-P2_bin,_,_ = stats.binned_statistic(r, P**2, statistic='mean', bins=r_bin_edge)
-S2_bin,_,_ = stats.binned_statistic(r, S**2, statistic='mean', bins=r_bin_edge)
-u2_bin,_,_ = stats.binned_statistic(r, u**2, statistic='mean', bins=r_bin_edge)
-rho_sigma_bin = np.sqrt(rho2_bin - rho_bin**2)
-v_sigma_bin = np.sqrt(v2_bin - v_bin**2)
-P_sigma_bin = np.sqrt(P2_bin - P_bin**2)
-S_sigma_bin = np.sqrt(S2_bin - S_bin**2)
-u_sigma_bin = np.sqrt(u2_bin - u_bin**2)
+r_bin_edge = np.arange(0.0, 1.0, 0.02)
+r_bin = 0.5 * (r_bin_edge[1:] + r_bin_edge[:-1])
+rho_bin, _, _ = stats.binned_statistic(r, rho, statistic="mean", bins=r_bin_edge)
+v_bin, _, _ = stats.binned_statistic(r, v_phi, statistic="mean", bins=r_bin_edge)
+P_bin, _, _ = stats.binned_statistic(r, P, statistic="mean", bins=r_bin_edge)
+S_bin, _, _ = stats.binned_statistic(r, S, statistic="mean", bins=r_bin_edge)
+u_bin, _, _ = stats.binned_statistic(r, u, statistic="mean", bins=r_bin_edge)
+rho2_bin, _, _ = stats.binned_statistic(r, rho ** 2, statistic="mean", bins=r_bin_edge)
+v2_bin, _, _ = stats.binned_statistic(r, v_phi ** 2, statistic="mean", bins=r_bin_edge)
+P2_bin, _, _ = stats.binned_statistic(r, P ** 2, statistic="mean", bins=r_bin_edge)
+S2_bin, _, _ = stats.binned_statistic(r, S ** 2, statistic="mean", bins=r_bin_edge)
+u2_bin, _, _ = stats.binned_statistic(r, u ** 2, statistic="mean", bins=r_bin_edge)
+rho_sigma_bin = np.sqrt(rho2_bin - rho_bin ** 2)
+v_sigma_bin = np.sqrt(v2_bin - v_bin ** 2)
+P_sigma_bin = np.sqrt(P2_bin - P_bin ** 2)
+S_sigma_bin = np.sqrt(S2_bin - S_bin ** 2)
+u_sigma_bin = np.sqrt(u2_bin - u_bin ** 2)
 
 if plot_diffusion:
-    alpha_diff_bin,_,_ = stats.binned_statistic(r, diffusion, statistic='mean', bins=r_bin_edge)
-    alpha2_diff_bin,_,_ = stats.binned_statistic(r, diffusion**2, statistic='mean', bins=r_bin_edge)
-    alpha_diff_sigma_bin = np.sqrt(alpha2_diff_bin - alpha_diff_bin**2)
+    alpha_diff_bin, _, _ = stats.binned_statistic(
+        r, diffusion, statistic="mean", bins=r_bin_edge
+    )
+    alpha2_diff_bin, _, _ = stats.binned_statistic(
+        r, diffusion ** 2, statistic="mean", bins=r_bin_edge
+    )
+    alpha_diff_sigma_bin = np.sqrt(alpha2_diff_bin - alpha_diff_bin ** 2)
 
 if plot_viscosity:
-    alpha_visc_bin,_,_ = stats.binned_statistic(r, viscosity, statistic='mean', bins=r_bin_edge)
-    alpha2_visc_bin,_,_ = stats.binned_statistic(r, viscosity**2, statistic='mean', bins=r_bin_edge)
-    alpha_visc_sigma_bin = np.sqrt(alpha2_visc_bin - alpha_visc_bin**2)
+    alpha_visc_bin, _, _ = stats.binned_statistic(
+        r, viscosity, statistic="mean", bins=r_bin_edge
+    )
+    alpha2_visc_bin, _, _ = stats.binned_statistic(
+        r, viscosity ** 2, statistic="mean", bins=r_bin_edge
+    )
+    alpha_visc_sigma_bin = np.sqrt(alpha2_visc_bin - alpha_visc_bin ** 2)
 
 # Plot the interesting quantities
-figure()
+figure(figsize=(7, 7 / 1.6))
+
+line_color = "C4"
+binned_color = "C2"
+binned_marker_size = 4
+
+scatter_props = dict(
+    marker=".",
+    ms=1,
+    markeredgecolor="none",
+    alpha=0.1,
+    zorder=-1,
+    rasterized=True,
+    linestyle="none",
+)
+
+errorbar_props = dict(color=binned_color, ms=binned_marker_size, fmt=".", lw=1.2)
 
 
 # Azimuthal velocity profile -----------------------------
 subplot(231)
 
-plot(r, v_phi, '.', color='r', ms=0.5)
-plot(solution_r, solution_v_phi, '--', color='k', alpha=0.8, lw=1.2)
-errorbar(r_bin, v_bin, yerr=v_sigma_bin, fmt='.', ms=8.0, color='b', lw=1.2)
-plot([0.2, 0.2], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-plot([0.4, 0.4], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-xlabel("${\\rm{Radius}}~r$", labelpad=0)
-ylabel("${\\rm{Azimuthal~velocity}}~v_\\phi$", labelpad=0)
-xlim(0,R_max)
+plot(r, v_phi, **scatter_props)
+plot(solution_r, solution_v_phi, "--", color=line_color, alpha=0.8, lw=1.2)
+errorbar(r_bin, v_bin, yerr=v_sigma_bin, **errorbar_props)
+plot([0.2, 0.2], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+plot([0.4, 0.4], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+xlabel("Radius $r$")
+ylabel("Azimuthal velocity $v_\\phi$")
+xlim(0, R_max)
 ylim(-0.1, 1.2)
 
 # Radial density profile --------------------------------
 subplot(232)
 
-plot(r, rho, '.', color='r', ms=0.5)
-plot(solution_r, solution_rho, '--', color='k', alpha=0.8, lw=1.2)
-errorbar(r_bin, rho_bin, yerr=rho_sigma_bin, fmt='.', ms=8.0, color='b', lw=1.2)
-plot([0.2, 0.2], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-plot([0.4, 0.4], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-xlabel("${\\rm{Radius}}~r$", labelpad=0)
-ylabel("${\\rm{Density}}~\\rho$", labelpad=0)
-xlim(0,R_max)
-ylim(rho0-0.3, rho0 + 0.3)
-#yticks([-0.2, -0.1, 0., 0.1, 0.2])
+plot(r, rho, **scatter_props)
+plot(solution_r, solution_rho, "--", color=line_color, alpha=0.8, lw=1.2)
+errorbar(r_bin, rho_bin, yerr=rho_sigma_bin, **errorbar_props)
+plot([0.2, 0.2], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+plot([0.4, 0.4], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+xlabel("Radius $r$")
+ylabel("Density $\\rho$")
+xlim(0, R_max)
+ylim(rho0 - 0.3, rho0 + 0.3)
+# yticks([-0.2, -0.1, 0., 0.1, 0.2])
 
 # Radial pressure profile --------------------------------
 subplot(233)
 
-plot(r, P, '.', color='r', ms=0.5)
-plot(solution_r, solution_P, '--', color='k', alpha=0.8, lw=1.2)
-errorbar(r_bin, P_bin, yerr=P_sigma_bin, fmt='.', ms=8.0, color='b', lw=1.2)
-plot([0.2, 0.2], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-plot([0.4, 0.4], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-xlabel("${\\rm{Radius}}~r$", labelpad=0)
-ylabel("${\\rm{Pressure}}~P$", labelpad=0)
+plot(r, P, **scatter_props)
+plot(solution_r, solution_P, "--", color=line_color, alpha=0.8, lw=1.2)
+errorbar(r_bin, P_bin, yerr=P_sigma_bin, **errorbar_props)
+plot([0.2, 0.2], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+plot([0.4, 0.4], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+xlabel("Radius $r$")
+ylabel("Pressure $P$")
 xlim(0, R_max)
 ylim(4.9 + P0, P0 + 6.1)
 
 # Internal energy profile --------------------------------
 subplot(234)
 
-plot(r, u, '.', color='r', ms=0.5)
-plot(solution_r, solution_u, '--', color='k', alpha=0.8, lw=1.2)
-errorbar(r_bin, u_bin, yerr=u_sigma_bin, fmt='.', ms=8.0, color='b', lw=1.2)
-plot([0.2, 0.2], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-plot([0.4, 0.4], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-xlabel("${\\rm{Radius}}~r$", labelpad=0)
-ylabel("${\\rm{Internal~Energy}}~u$", labelpad=0)
-xlim(0,R_max)
+plot(r, u, **scatter_props)
+plot(solution_r, solution_u, "--", color=line_color, alpha=0.8, lw=1.2)
+errorbar(r_bin, u_bin, yerr=u_sigma_bin, **errorbar_props)
+plot([0.2, 0.2], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+plot([0.4, 0.4], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+xlabel("Radius $r$")
+ylabel("Internal Energy $u$")
+xlim(0, R_max)
 ylim(7.3, 9.1)
 
 
 # Radial entropy profile --------------------------------
 subplot(235)
-xlabel("${\\rm{Radius}}~r$", labelpad=0)
-
+xlabel("Radius $r$")
 xlim(0, R_max)
 
-xlabel("${\\rm{Radius}}~r$", labelpad=0)
-
 
 if plot_diffusion or plot_viscosity:
     if plot_diffusion:
-        plot(r, diffusion, ".", color='r', ms=0.5, alpha=0.2)
-        errorbar(r_bin, alpha_diff_bin, yerr=alpha_diff_sigma_bin, fmt=".", ms=8.0, color='b', lw=1.2, label="Diffusion")
+        plot(r, diffusion, **scatter_props)
+        errorbar(
+            r_bin,
+            alpha_diff_bin,
+            yerr=alpha_diff_sigma_bin,
+            **errorbar_props,
+            label="Diffusion",
+        )
 
     if plot_viscosity:
-        plot(r, viscosity, ".", color='g', ms=0.5, alpha=0.2)
-        errorbar(r_bin, alpha_visc_bin, yerr=alpha_visc_sigma_bin, fmt=".", ms=8.0, color='y', lw=1.2, label="Viscosity")
-
-    ylabel("${\\rm{Rate~Coefficient}}~\\alpha$", labelpad=0)
+        plot(r, viscosity, **{**scatter_props, "color": "C3"})
+        errorbar(
+            r_bin,
+            alpha_visc_bin,
+            yerr=alpha_visc_sigma_bin,
+            **{**errorbar_props, "color": "C4"},
+            label="Viscosity",
+        )
+
+    ylabel("Rate Coefficient $\\alpha$")
     legend()
 else:
-    plot(r, S, '.', color='r', ms=0.5)
-    plot(solution_r, solution_s, '--', color='k', alpha=0.8, lw=1.2)
-    errorbar(r_bin, S_bin, yerr=S_sigma_bin, fmt='.', ms=8.0, color='b', lw=1.2)
-    plot([0.2, 0.2], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-    plot([0.4, 0.4], [-100, 100], ':', color='k', alpha=0.4, lw=1.2)
-    ylabel("${\\rm{Entropy}}~S$", labelpad=0)
+    plot(r, S, **scatter_props)
+    plot(solution_r, solution_s, "--", color=line_color, alpha=0.8, lw=1.2)
+    errorbar(r_bin, S_bin, yerr=S_sigma_bin, **errorbar_props)
+    plot([0.2, 0.2], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+    plot([0.4, 0.4], [-100, 100], ":", color=line_color, alpha=0.4, lw=1.2)
+    ylabel("Entropy $S$")
     ylim(4.9 + P0, P0 + 6.1)
 
-# Image --------------------------------------------------
-#subplot(234)
-#scatter(pos[:,0], pos[:,1], c=v_norm, cmap="PuBu", edgecolors='face', s=4, vmin=0, vmax=1)
-#text(0.95, 0.95, "$|v|$", ha="right", va="top")
-#xlim(0,1)
-#ylim(0,1)
-#xlabel("$x$", labelpad=0)
-#ylabel("$y$", labelpad=0)
 
 # Information -------------------------------------
 subplot(236, frameon=False)
 
-text(-0.49, 0.9, "Gresho-Chan vortex with  $\\gamma=%.3f$ at $t=%.2f$"%(gas_gamma,time), fontsize=10)
-text(-0.49, 0.8, "Background $\\rho_0=%.3f$"%rho0, fontsize=10)
-text(-0.49, 0.7, "Background $P_0=%.3f$"%P0, fontsize=10)
-plot([-0.49, 0.1], [0.62, 0.62], 'k-', lw=1)
-text(-0.49, 0.5, "$\\textsc{Swift}$ %s"%git, fontsize=10)
-text(-0.49, 0.4, scheme, fontsize=10)
-text(-0.49, 0.3, kernel, fontsize=10)
-text(-0.49, 0.2, "$%.2f$ neighbours ($\\eta=%.3f$)"%(neighbours, eta), fontsize=10)
+text_fontsize = 5
+
+text(
+    -0.49,
+    0.9,
+    "Gresho-Chan vortex (3D) with $\\gamma=%.3f$ at $t=%.2f$" % (gas_gamma, time),
+    fontsize=text_fontsize,
+)
+text(-0.49, 0.8, "Background $\\rho_0=%.3f$" % rho0, fontsize=text_fontsize)
+text(-0.49, 0.7, "Background $P_0=%.3f$" % P0, fontsize=text_fontsize)
+plot([-0.49, 0.1], [0.62, 0.62], "k-", lw=1)
+text(-0.49, 0.5, "SWIFT %s" % git.decode("utf-8"), fontsize=text_fontsize)
+text(-0.49, 0.4, scheme.decode("utf-8"), fontsize=text_fontsize)
+text(-0.49, 0.3, kernel.decode("utf-8"), fontsize=text_fontsize)
+text(
+    -0.49,
+    0.2,
+    "$%.2f$ neighbours ($\\eta=%.3f$)" % (neighbours, eta),
+    fontsize=text_fontsize,
+)
 xlim(-0.5, 0.5)
 ylim(0, 1)
 xticks([])
 yticks([])
 
-savefig("GreshoVortex.png", dpi=200)
+tight_layout()
+
+savefig("GreshoVortex.png")
diff --git a/examples/main.c b/examples/main.c
index 9a61f0bae1271f7a34d770a4b6868c2721e9db93..95784341ab3ad363b90c5a33bed9ffc0885499ce 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -1244,13 +1244,15 @@ int main(int argc, char *argv[]) {
     error("Failed to generate restart filename");
 
   /* dump the parameters as used. */
+  if (!restart && myrank == 0) {
 
-  /* used parameters */
-  parser_write_params_to_file(params, "used_parameters.yml", 1);
-  /* unused parameters */
-  parser_write_params_to_file(params, "unused_parameters.yml", 0);
+    /* used parameters */
+    parser_write_params_to_file(params, "used_parameters.yml", /*used=*/1);
+    /* unused parameters */
+    parser_write_params_to_file(params, "unused_parameters.yml", /*used=*/0);
+  }
 
-  /* Dump memory use report if collected for the 0 step. */
+    /* Dump memory use report if collected for the 0 step. */
 #ifdef SWIFT_MEMUSE_REPORTS
   {
     char dumpfile[40];
@@ -1296,7 +1298,7 @@ int main(int argc, char *argv[]) {
     }
 
     /* Did we exceed the maximal runtime? */
-    if (clocks_get_hours_since_start() > restart_max_hours_runtime) {
+    if (e.runtime > restart_max_hours_runtime) {
       force_stop = 1;
       message("Runtime limit reached, dumping restart files...");
       if (resubmit_after_max_hours) resubmit = 1;
diff --git a/examples/main_fof.c b/examples/main_fof.c
index 4f12cba5a792cc538c9c096cdd481b92d2613610..280d2d3998569a574b536105afe4a9c4696c2cb1 100644
--- a/examples/main_fof.c
+++ b/examples/main_fof.c
@@ -673,9 +673,9 @@ int main(int argc, char *argv[]) {
 #endif  // SWIFT_DEBUG_THREADPOOL
 
   /* used parameters */
-  parser_write_params_to_file(params, "fof_used_parameters.yml", 1);
+  parser_write_params_to_file(params, "fof_used_parameters.yml", /*used=*/1);
   /* unused parameters */
-  parser_write_params_to_file(params, "fof_unused_parameters.yml", 0);
+  parser_write_params_to_file(params, "fof_unused_parameters.yml", /*used=*/0);
 
   /* Dump memory use report */
 #ifdef SWIFT_MEMUSE_REPORTS
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 4d43ae30f5bcd0df4970168591f03f97c092b21e..cabc94d2bba38b462b1974c01f416c40df10ef9f 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -292,6 +292,7 @@ SineWavePotential:
   timestep_limit:   1.      # Time-step dimensionless pre-factor.
   growth_time:      0.      # (Optional) Time for the potential to grow to its final size.
 
+
 # Parameters related to entropy floors    ----------------------------------------------
 
 EAGLEEntropyFloor:
@@ -303,12 +304,14 @@ EAGLEEntropyFloor:
   Cool_over_density_threshold:     10.       # Overdensity above which the EAGLE Cool limiter entropy floor can kick in.
   Cool_temperature_norm_K:         8000      # Temperature of the EAGLE Cool limiter entropy floor at the density threshold expressed in Kelvin.
   Cool_gamma_effective:            1.        # Slope the of the EAGLE Cool limiter entropy floor
-  
+
+
 # Parameters related to pressure floors    ----------------------------------------------
 
 GEARPressureFloor:
   jeans_factor: 10.       # Number of particles required to suppose a resolved clump and avoid the pressure floor.
 
+
 # Parameters related to cooling function  ----------------------------------------------
 
 # Constant du/dt cooling function
@@ -326,6 +329,7 @@ LambdaCooling:
 EAGLECooling:
   dir_name:                  ./coolingtables/  # Location of the Wiersma+08 cooling tables
   H_reion_z:                 8.5               # Redshift of Hydrogen re-ionization
+  H_reion_eV_p_H:            2.0               # Energy inject by Hydrogen re-ionization in electron-volt per Hydrogen atom
   He_reion_z_centre:         3.5               # Redshift of the centre of the Helium re-ionization Gaussian
   He_reion_z_sigma:          0.5               # Spread in redshift of the  Helium re-ionization Gaussian
   He_reion_eV_p_H:           2.0               # Energy inject by Helium re-ionization in electron-volt per Hydrogen atom
@@ -336,16 +340,17 @@ EAGLECooling:
 # Cooling with Grackle 3.0
 GrackleCooling:
   cloudy_table: CloudyData_UVB=HM2012.h5       # Name of the Cloudy Table (available on the grackle bitbucket repository)
-  with_UV_background: 1                         # Enable or not the UV background
-  redshift: 0                                 # Redshift to use (-1 means time based redshift)
-  with_metal_cooling: 1                         # Enable or not the metal cooling
-  provide_volumetric_heating_rates: 0            # (optional) User provide volumetric heating rates
-  provide_specific_heating_rates: 0              # (optional) User provide specific heating rates
+  with_UV_background: 1                        # Enable or not the UV background
+  redshift: 0                                  # Redshift to use (-1 means time based redshift)
+  with_metal_cooling: 1                        # Enable or not the metal cooling
+  provide_volumetric_heating_rates: 0          # (optional) User provide volumetric heating rates
+  provide_specific_heating_rates: 0            # (optional) User provide specific heating rates
   max_steps: 10000                             # (optional) Max number of step when computing the initial composition
   convergence_limit: 1e-2                      # (optional) Convergence threshold (relative) for initial composition
   thermal_time_myr: 5                          # (optional) Time (in Myr) for adiabatic cooling after a feedback event.
-  self_shielding_method: -1                     # (optional) Grackle (1->3 for Grackle's ones, 0 for none and -1 for GEAR)
-  self_shielding_threshold_atom_per_cm3: 0.007  # Required only with GEAR's self shielding. Density threshold of the self shielding
+  self_shielding_method: -1                    # (optional) Grackle (1->3 for Grackle's ones, 0 for none and -1 for GEAR)
+  self_shielding_threshold_atom_per_cm3: 0.007 # Required only with GEAR's self shielding. Density threshold of the self shielding
+
 
 # Parameters related to chemistry models  -----------------------------------------------
 
@@ -365,16 +370,11 @@ EAGLEChemistry:
 # GEAR chemistry model (Revaz and Jablonka 2018)
 GEARChemistry:
   initial_metallicity: 1         # Initial metallicity of the gas (mass fraction)
-  scale_initial_metallicity: 1    # Should we scale the initial metallicity with the solar one?
+  scale_initial_metallicity: 1   # Should we scale the initial metallicity with the solar one?
 
 
 # Parameters related to star formation models  -----------------------------------------------
 
-# GEAR star formation model (Revaz and Jablonka 2018)
-GEARStarFormation:
-  star_formation_efficiency: 0.01   # star formation efficiency (c_*)
-  maximal_temperature:  3e4         # Upper limit to the temperature of a star forming particle
-
 # EAGLE star formation model (Schaye and Dalla Vecchia 2008)
 EAGLEStarFormation:
   EOS_density_norm_H_p_cm3:          0.1       # Physical density used for the normalisation of the EOS assumed for the star-forming gas in Hydrogen atoms per cm^3.
@@ -392,7 +392,13 @@ EAGLEStarFormation:
   threshold_Z0:                      0.002     # Reference metallicity (metal mass fraction) for the metal-dependant threshold for star formation.
   threshold_slope:                   -0.64     # Slope of the metal-dependant star formation threshold
   threshold_max_density_H_p_cm3:     10.0      # Maximal density of the metal-dependant density threshold for star formation in Hydrogen atoms per cm^3.  
-  
+
+# GEAR star formation model (Revaz and Jablonka 2018)
+GEARStarFormation:
+  star_formation_efficiency: 0.01   # star formation efficiency (c_*)
+  maximal_temperature:  3e4         # Upper limit to the temperature of a star forming particle
+
+
 # Parameters related to feedback models  -----------------------------------------------
 
 # EAGLE feedback model
@@ -438,9 +444,10 @@ EAGLEFeedback:
 
 # GEAR feedback model
 GEARFeedback:
-  supernovae_energy_erg: 0.1e51                      # Energy released by a single supernovae.
+  supernovae_energy_erg: 0.1e51                            # Energy released by a single supernovae.
   yields_table: chemistry-AGB+OMgSFeZnSrYBaEu-16072013.h5  # Table containing the yields.
-  discrete_yields: 0                                 # Should we use discrete yields or the IMF integrated one?
+  discrete_yields: 0                                       # Should we use discrete yields or the IMF integrated one?
+
 
 # Parameters related to AGN models  -----------------------------------------------
   
diff --git a/src/Makefile.am b/src/Makefile.am
index 7144496b68a9e6789d0eaf0fb0dd19ff014e6d4b..7678220bbf0a05fab3af60248245493c3be9886b 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -77,7 +77,7 @@ if HAVEGRACKLECOOLING
 GRACKLE_COOLING_SOURCES += cooling/grackle/cooling.c
 endif
 
-# source files for GRACKLE cooling
+# source files for GEAR feedback
 GEAR_FEEDBACK_SOURCES =
 if HAVEGEARFEEDBACK
 GEAR_FEEDBACK_SOURCES += feedback/GEAR/stellar_evolution.c feedback/GEAR/feedback.c \
@@ -102,8 +102,8 @@ AM_SOURCES = space.c runner_main.c runner_doiact_hydro.c runner_doiact_limiter.c
     chemistry.c cosmology.c restart.c mesh_gravity.c velociraptor_interface.c \
     outputlist.c velociraptor_dummy.c logger_io.c memuse.c mpiuse.c memuse_rnodes.c fof.c \
     hashmap.c pressure_floor.c \
-    $(EAGLE_COOLING_SOURCES) $(EAGLE_FEEDBACK_SOURCES) $(GRACKLE_COOLING_SOURCES) \
-		$(GEAR_FEEDBACK_SOURCES)
+    $(EAGLE_COOLING_SOURCES) $(EAGLE_FEEDBACK_SOURCES) \
+    $(GRACKLE_COOLING_SOURCES) $(GEAR_FEEDBACK_SOURCES)
 
 # Include files for distribution, not installation.
 nobase_noinst_HEADERS = align.h approx_math.h atomic.h barrier.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \
diff --git a/src/black_holes/Default/black_holes_io.h b/src/black_holes/Default/black_holes_io.h
index b2c3374aec6650a5cf29dc80a7485067de38fefb..6ca174b7fef2545af0f3191bba3037ed2b31805a 100644
--- a/src/black_holes/Default/black_holes_io.h
+++ b/src/black_holes/Default/black_holes_io.h
@@ -126,8 +126,9 @@ INLINE static void black_holes_write_particles(const struct bpart* bparts,
   list[2] = io_make_output_field("Masses", FLOAT, 1, UNIT_CONV_MASS, 0.f,
                                  bparts, mass, "Masses of the particles");
 
-  list[3] = io_make_output_field("ParticleIDs", LONGLONG, 1, UNIT_CONV_NO_UNITS,
-                                 0.f, bparts, id, "Unique ID of the particles");
+  list[3] =
+      io_make_output_field("ParticleIDs", ULONGLONG, 1, UNIT_CONV_NO_UNITS, 0.f,
+                           bparts, id, "Unique ID of the particles");
 
   list[4] = io_make_output_field(
       "SmoothingLengths", FLOAT, 1, UNIT_CONV_LENGTH, 1.f, bparts, h,
diff --git a/src/black_holes/EAGLE/black_holes_io.h b/src/black_holes/EAGLE/black_holes_io.h
index 7461d16958436e9b529779797f7b3a0b88c0bfaf..690c76d38a573ba2bdb3834d573a523a324772d8 100644
--- a/src/black_holes/EAGLE/black_holes_io.h
+++ b/src/black_holes/EAGLE/black_holes_io.h
@@ -130,8 +130,9 @@ INLINE static void black_holes_write_particles(const struct bpart* bparts,
       io_make_output_field("DynamicalMasses", FLOAT, 1, UNIT_CONV_MASS, 0.f,
                            bparts, mass, "Dynamical masses of the particles");
 
-  list[3] = io_make_output_field("ParticleIDs", LONGLONG, 1, UNIT_CONV_NO_UNITS,
-                                 0.f, bparts, id, "Unique ID of the particles");
+  list[3] =
+      io_make_output_field("ParticleIDs", ULONGLONG, 1, UNIT_CONV_NO_UNITS, 0.f,
+                           bparts, id, "Unique ID of the particles");
 
   list[4] = io_make_output_field(
       "SmoothingLengths", FLOAT, 1, UNIT_CONV_LENGTH, 1.f, bparts, h,
diff --git a/src/black_holes_struct.h b/src/black_holes_struct.h
index 5535caf23eb9d6eaed3774faa96e75f6f9f73625..fdc70ebb7e73dab16bcf86b099a8ea3b61f51e6a 100644
--- a/src/black_holes_struct.h
+++ b/src/black_holes_struct.h
@@ -27,6 +27,8 @@
 /* Config parameters. */
 #include "../config.h"
 
+#include "inline.h"
+
 /* Import the right black holes definition */
 #if defined(BLACK_HOLES_NONE)
 #include "./black_holes/Default/black_holes_struct.h"
diff --git a/src/cell.c b/src/cell.c
index b6d998ee39bfb053d5c6667e20d83c4be382534a..d5b0499d0005ca389e98a9ef44461e9a2c0ee126 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -409,6 +409,37 @@ int cell_link_foreign_gparts(struct cell *c, struct gpart *gparts) {
 #endif
 }
 
+/**
+ * @brief Recursively nullify all the particle pointers in a cell hierarchy.
+ *
+ * Should only be used on foreign cells!
+ *
+ * This will make any task or action running on these cells likely crash.
+ * Recreating the foreign links will be necessary.
+ *
+ * @param c The #cell to act on.
+ */
+void cell_unlink_foreign_particles(struct cell *c) {
+
+#ifdef SWIFT_DEBUG_CHECKS
+  if (c->nodeID == engine_rank)
+    error("Unlinking foreign particles in a local cell!");
+#endif
+
+  c->grav.parts = NULL;
+  c->hydro.parts = NULL;
+  c->stars.parts = NULL;
+  c->black_holes.parts = NULL;
+
+  if (c->split) {
+    for (int k = 0; k < 8; k++) {
+      if (c->progeny[k] != NULL) {
+        cell_unlink_foreign_particles(c->progeny[k]);
+      }
+    }
+  }
+}
+
 /**
  * @brief Recursively count the number of #part in foreign cells that
  * are in cells with hydro-related tasks.
diff --git a/src/cell.h b/src/cell.h
index df6949643b35455e55125474acfc9dfc5c13b3fc..b4242bf1f5812135f87d37d30f5ac7c3a405def1 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -94,8 +94,11 @@ struct pcell {
   /*! Hydro variables */
   struct {
 
+    /*! Number of #part in this cell. */
+    int count;
+
     /*! Maximal smoothing length. */
-    double h_max;
+    float h_max;
 
     /*! Minimal integer end-of-timestep in this cell for hydro tasks */
     integertime_t ti_end_min;
@@ -109,9 +112,6 @@ struct pcell {
     /*! Integer time of the last drift of the #part in this cell */
     integertime_t ti_old_part;
 
-    /*! Number of #part in this cell. */
-    int count;
-
   } hydro;
 
   /*! Gravity variables */
@@ -159,7 +159,7 @@ struct pcell {
     int count;
 
     /*! Maximal smoothing length. */
-    double h_max;
+    float h_max;
 
     /*! Minimal integer end-of-timestep in this cell for stars tasks */
     integertime_t ti_end_min;
@@ -179,7 +179,7 @@ struct pcell {
     int count;
 
     /*! Maximal smoothing length. */
-    double h_max;
+    float h_max;
 
     /*! Minimal integer end-of-timestep in this cell for black hole tasks */
     integertime_t ti_end_min;
@@ -386,9 +386,6 @@ struct cell {
     /*! Task for sorting the stars again after a SF event */
     struct task *stars_resort;
 
-    /*! Max smoothing length in this cell. */
-    double h_max;
-
     /*! Last (integer) time the cell's part were drifted forward in time. */
     integertime_t ti_old_part;
 
@@ -405,6 +402,9 @@ struct cell {
     /*! Spin lock for various uses (#part case). */
     swift_lock_type lock;
 
+    /*! Max smoothing length in this cell. */
+    float h_max;
+
     /*! Maximum part movement in this cell since last construction. */
     float dx_max_part;
 
@@ -569,9 +569,6 @@ struct cell {
     /*! Implicit tasks marking the exit of the stellar physics block of tasks */
     struct task *stars_out;
 
-    /*! Max smoothing length in this cell. */
-    double h_max;
-
     /*! Last (integer) time the cell's spart were drifted forward in time. */
     integertime_t ti_old_part;
 
@@ -587,6 +584,9 @@ struct cell {
     /*! Nr of #spart this cell can hold after addition of new #spart. */
     int count_total;
 
+    /*! Max smoothing length in this cell. */
+    float h_max;
+
     /*! Values of h_max before the drifts, used for sub-cell tasks. */
     float h_max_old;
 
@@ -678,9 +678,6 @@ struct cell {
     /*! Linked list of the tasks computing this cell's BH feedback. */
     struct link *feedback;
 
-    /*! Max smoothing length in this cell. */
-    double h_max;
-
     /*! Last (integer) time the cell's bpart were drifted forward in time. */
     integertime_t ti_old_part;
 
@@ -693,6 +690,9 @@ struct cell {
     /*! Nr of #bpart this cell can hold after addition of new #bpart. */
     int count_total;
 
+    /*! Max smoothing length in this cell. */
+    float h_max;
+
     /*! Values of h_max before the drifts, used for sub-cell tasks. */
     float h_max_old;
 
@@ -860,6 +860,7 @@ int cell_link_sparts(struct cell *c, struct spart *sparts);
 int cell_link_bparts(struct cell *c, struct bpart *bparts);
 int cell_link_foreign_parts(struct cell *c, struct part *parts);
 int cell_link_foreign_gparts(struct cell *c, struct gpart *gparts);
+void cell_unlink_foreign_particles(struct cell *c);
 int cell_count_parts_for_tasks(const struct cell *c);
 int cell_count_gparts_for_tasks(const struct cell *c);
 void cell_clean_links(struct cell *c, void *data);
diff --git a/src/clocks.c b/src/clocks.c
index 16af01938d8f4e6cb21490af3288fd64e1a93876..0ac2d97e18658195f9140857f8ee6fef4bd1fe1d 100644
--- a/src/clocks.c
+++ b/src/clocks.c
@@ -276,22 +276,24 @@ double clocks_get_hours_since_start(void) {
 }
 
 /**
- * @brief return the cpu time used.
+ * @brief return the cpu times used.
  *
- * Uses the times(2) function to access the user cpu times and returns the sum
- * of these for the process tree, i.e. current process plus "waited-for"
- * children. This may be pthread implementation specific as to what that
- * exactly means. Note we do not include the system time as that includes
- * spin times and we don't want to give credit for that.
+ * Uses the times(2) function to access the user and system cpu times and
+ * returns the sum of these for the process tree, i.e. current process plus
+ * "waited-for" children. This may be pthread implementation specific as to
+ * what that exactly means.
  *
- * @result cpu time used in sysconf(_SC_CLK_TCK) ticks, usually 100/s not our
- *         usual ticks.
+ * Note cpu times are reported in sysconf(_SC_CLK_TCK) ticks, usually 100/s
+ * not our usual ticks.
+ *
+ * @param usertime the user time.
+ * @param systime the system time.
  */
-double clocks_get_cputime_used(void) {
-
+void clocks_get_cputimes_used(double *usertime, double *systime) {
   struct tms tmstic;
   times(&tmstic);
-  return (double)(tmstic.tms_utime + tmstic.tms_cutime);
+  *usertime = (tmstic.tms_utime + tmstic.tms_cutime);
+  *systime = (tmstic.tms_stime + tmstic.tms_cstime);
 }
 
 /**
diff --git a/src/clocks.h b/src/clocks.h
index d306268674fc85c722e71a6bf8c0095341ba4e1a..c9c23807587e1ad4b519651911e1c37c9c1a1d6b 100644
--- a/src/clocks.h
+++ b/src/clocks.h
@@ -52,7 +52,7 @@ double clocks_diff_ticks(ticks tic, ticks toc);
 const char *clocks_get_timesincestart(void);
 double clocks_get_hours_since_start(void);
 
-double clocks_get_cputime_used(void);
+void clocks_get_cputimes_used(double *usertime, double *systime);
 int clocks_random_seed(void);
 
 #endif /* SWIFT_CLOCKS_H */
diff --git a/src/collectgroup.c b/src/collectgroup.c
index 6b096416a3f334617a4567e51b4be04c92432eef..6aa120a5708df486229773c8ff9dde4f24180ccf 100644
--- a/src/collectgroup.c
+++ b/src/collectgroup.c
@@ -56,6 +56,7 @@ struct mpicollectgroup1 {
   long long total_nr_tasks;
   float tasks_per_cell_max;
   struct star_formation_history sfh;
+  float runtime;
 };
 
 /* Forward declarations. */
@@ -125,6 +126,8 @@ void collectgroup1_apply(const struct collectgroup1 *grp1, struct engine *e) {
   e->tasks_per_cell_max = grp1->tasks_per_cell_max;
 
   star_formation_logger_add_to_accumulator(&e->sfh, &grp1->sfh);
+
+  e->runtime = grp1->runtime;
 }
 
 /**
@@ -174,6 +177,7 @@ void collectgroup1_apply(const struct collectgroup1 *grp1, struct engine *e) {
  * @param total_nr_tasks total number of tasks on rank.
  * @param tasks_per_cell the used number of tasks per cell.
  * @param sfh The star formation history logger
+ * @param runtime The runtime of rank in hours.
  */
 void collectgroup1_init(
     struct collectgroup1 *grp1, size_t updated, size_t g_updated,
@@ -186,7 +190,7 @@ void collectgroup1_init(
     integertime_t ti_black_holes_end_min, integertime_t ti_black_holes_end_max,
     integertime_t ti_black_holes_beg_max, int forcerebuild,
     long long total_nr_cells, long long total_nr_tasks, float tasks_per_cell,
-    const struct star_formation_history sfh) {
+    const struct star_formation_history sfh, float runtime) {
 
   grp1->updated = updated;
   grp1->g_updated = g_updated;
@@ -213,6 +217,7 @@ void collectgroup1_init(
   grp1->total_nr_tasks = total_nr_tasks;
   grp1->tasks_per_cell_max = tasks_per_cell;
   grp1->sfh = sfh;
+  grp1->runtime = runtime;
 }
 
 /**
@@ -254,6 +259,7 @@ void collectgroup1_reduce(struct collectgroup1 *grp1) {
   mpigrp11.total_nr_tasks = grp1->total_nr_tasks;
   mpigrp11.tasks_per_cell_max = grp1->tasks_per_cell_max;
   mpigrp11.sfh = grp1->sfh;
+  mpigrp11.runtime = grp1->runtime;
 
   struct mpicollectgroup1 mpigrp12;
   if (MPI_Allreduce(&mpigrp11, &mpigrp12, 1, mpicollectgroup1_type,
@@ -286,6 +292,7 @@ void collectgroup1_reduce(struct collectgroup1 *grp1) {
   grp1->total_nr_tasks = mpigrp12.total_nr_tasks;
   grp1->tasks_per_cell_max = mpigrp12.tasks_per_cell_max;
   grp1->sfh = mpigrp12.sfh;
+  grp1->runtime = mpigrp12.runtime;
 
 #endif
 }
@@ -357,6 +364,9 @@ static void doreduce1(struct mpicollectgroup1 *mpigrp11,
 
   /* Star formation history */
   star_formation_logger_add(&mpigrp11->sfh, &mpigrp12->sfh);
+
+  /* Use the maximum runtime as the global runtime. */
+  mpigrp11->runtime = max(mpigrp11->runtime, mpigrp12->runtime);
 }
 
 /**
diff --git a/src/collectgroup.h b/src/collectgroup.h
index 9e973142953beec552e8c76fdbb789dc6325587a..fcb49fe0596b23125525c944fa9bfa80418f5b80 100644
--- a/src/collectgroup.h
+++ b/src/collectgroup.h
@@ -60,6 +60,9 @@ struct collectgroup1 {
 
   /* Maximum value of actual tasks per cell across all ranks. */
   float tasks_per_cell_max;
+
+  /* Global runtime of application in hours. */
+  float runtime;
 };
 
 void collectgroup_init(void);
@@ -75,7 +78,7 @@ void collectgroup1_init(
     integertime_t ti_black_holes_end_min, integertime_t ti_black_holes_end_max,
     integertime_t ti_black_holes_beg_max, int forcerebuild,
     long long total_nr_cells, long long total_nr_tasks, float tasks_per_cell,
-    const struct star_formation_history sfh);
+    const struct star_formation_history sfh, float runtime);
 void collectgroup1_reduce(struct collectgroup1 *grp1);
 #ifdef WITH_MPI
 void mpicollect_free_MPI_type(void);
diff --git a/src/common_io.c b/src/common_io.c
index 460fda26699a3f4f45944f18c1fe65b2e9274dfa..458fe8229321f9fbd40b1b2c5a5bc70ebfe3417f 100644
--- a/src/common_io.c
+++ b/src/common_io.c
@@ -1580,7 +1580,7 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
     /* Copy the whole thing into a buffer */
     threadpool_map((struct threadpool*)&e->threadpool, io_copy_mapper, temp_c,
-                   N, copySize, 0, (void*)&props);
+                   N, copySize, threadpool_auto_chunk_size, (void*)&props);
 
   } else { /* Converting particle to data */
 
@@ -1593,8 +1593,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_part_f_mapper, temp_f, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_part_f_mapper, temp_f, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_part_i != NULL) {
 
@@ -1605,8 +1605,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_part_i_mapper, temp_i, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_part_i_mapper, temp_i, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_part_d != NULL) {
 
@@ -1617,8 +1617,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_part_d_mapper, temp_d, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_part_d_mapper, temp_d, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_part_l != NULL) {
 
@@ -1629,8 +1629,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_part_l_mapper, temp_l, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_part_l_mapper, temp_l, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_gpart_f != NULL) {
 
@@ -1641,8 +1641,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_gpart_f_mapper, temp_f, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_gpart_f_mapper, temp_f, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_gpart_i != NULL) {
 
@@ -1653,8 +1653,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_gpart_i_mapper, temp_i, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_gpart_i_mapper, temp_i, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_gpart_d != NULL) {
 
@@ -1665,8 +1665,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_gpart_d_mapper, temp_d, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_gpart_d_mapper, temp_d, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_gpart_l != NULL) {
 
@@ -1677,8 +1677,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_gpart_l_mapper, temp_l, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_gpart_l_mapper, temp_l, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_spart_f != NULL) {
 
@@ -1689,8 +1689,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_spart_f_mapper, temp_f, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_spart_f_mapper, temp_f, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_spart_i != NULL) {
 
@@ -1701,8 +1701,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_spart_i_mapper, temp_i, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_spart_i_mapper, temp_i, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_spart_d != NULL) {
 
@@ -1713,8 +1713,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_spart_d_mapper, temp_d, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_spart_d_mapper, temp_d, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_spart_l != NULL) {
 
@@ -1725,8 +1725,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_spart_l_mapper, temp_l, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_spart_l_mapper, temp_l, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_bpart_f != NULL) {
 
@@ -1737,8 +1737,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_bpart_f_mapper, temp_f, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_bpart_f_mapper, temp_f, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_bpart_i != NULL) {
 
@@ -1749,8 +1749,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_bpart_i_mapper, temp_i, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_bpart_i_mapper, temp_i, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_bpart_d != NULL) {
 
@@ -1761,8 +1761,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_bpart_d_mapper, temp_d, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_bpart_d_mapper, temp_d, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else if (props.convert_bpart_l != NULL) {
 
@@ -1773,8 +1773,8 @@ void io_copy_temp_buffer(void* temp, const struct engine* e,
 
       /* Copy the whole thing into a buffer */
       threadpool_map((struct threadpool*)&e->threadpool,
-                     io_convert_bpart_l_mapper, temp_l, N, copySize, 0,
-                     (void*)&props);
+                     io_convert_bpart_l_mapper, temp_l, N, copySize,
+                     threadpool_auto_chunk_size, (void*)&props);
 
     } else {
       error("Missing conversion function");
@@ -1832,7 +1832,7 @@ void io_prepare_dm_gparts(struct threadpool* tp, struct gpart* const gparts,
                           size_t Ndm) {
 
   threadpool_map(tp, io_prepare_dm_gparts_mapper, gparts, Ndm,
-                 sizeof(struct gpart), 0, NULL);
+                 sizeof(struct gpart), threadpool_auto_chunk_size, NULL);
 }
 
 void io_prepare_dm_background_gparts_mapper(void* restrict data, int Ndm,
@@ -1868,7 +1868,7 @@ void io_prepare_dm_background_gparts(struct threadpool* tp,
                                      struct gpart* const gparts, size_t Ndm) {
 
   threadpool_map(tp, io_prepare_dm_background_gparts_mapper, gparts, Ndm,
-                 sizeof(struct gpart), 0, NULL);
+                 sizeof(struct gpart), threadpool_auto_chunk_size, NULL);
 }
 
 size_t io_count_dm_background_gparts(const struct gpart* const gparts,
@@ -1950,7 +1950,7 @@ void io_duplicate_hydro_gparts(struct threadpool* tp, struct part* const parts,
   data.Ndm = Ndm;
 
   threadpool_map(tp, io_duplicate_hydro_gparts_mapper, parts, Ngas,
-                 sizeof(struct part), 0, &data);
+                 sizeof(struct part), threadpool_auto_chunk_size, &data);
 }
 
 void io_duplicate_stars_gparts_mapper(void* restrict data, int Nstars,
@@ -2008,7 +2008,7 @@ void io_duplicate_stars_gparts(struct threadpool* tp,
   data.Ndm = Ndm;
 
   threadpool_map(tp, io_duplicate_stars_gparts_mapper, sparts, Nstars,
-                 sizeof(struct spart), 0, &data);
+                 sizeof(struct spart), threadpool_auto_chunk_size, &data);
 }
 
 void io_duplicate_black_holes_gparts_mapper(void* restrict data,
@@ -2068,7 +2068,8 @@ void io_duplicate_black_holes_gparts(struct threadpool* tp,
   data.Ndm = Ndm;
 
   threadpool_map(tp, io_duplicate_black_holes_gparts_mapper, bparts,
-                 Nblackholes, sizeof(struct bpart), 0, &data);
+                 Nblackholes, sizeof(struct bpart), threadpool_auto_chunk_size,
+                 &data);
 }
 
 /**
diff --git a/src/cooling/Compton/cooling.h b/src/cooling/Compton/cooling.h
index a594cdca6916c361b737763e0360ed32d459ba99..78318560220781fedf700dfab07b4930e536a6e8 100644
--- a/src/cooling/Compton/cooling.h
+++ b/src/cooling/Compton/cooling.h
@@ -136,9 +136,10 @@ __attribute__((always_inline)) INLINE static double Compton_cooling_rate_cgs(
  * @param cooling The #cooling_function_data used in the run.
  * @param p Pointer to the particle data.
  * @param xp Pointer to the particle' extended data.
- * @param time The current time.
  * @param dt The time-step of this particle.
  * @param dt_therm The time-step operator used for thermal quantities.
+ * @param time Time since Big Bang (or start of the simulation) in internal
+ * units.
  */
 __attribute__((always_inline)) INLINE static void cooling_cool_part(
     const struct phys_const* restrict phys_const,
@@ -147,8 +148,8 @@ __attribute__((always_inline)) INLINE static void cooling_cool_part(
     const struct hydro_props* hydro_props,
     const struct entropy_floor_properties* floor_props,
     const struct cooling_function_data* restrict cooling,
-    struct part* restrict p, struct xpart* restrict xp, const double time,
-    const float dt, const float dt_therm) {
+    struct part* restrict p, struct xpart* restrict xp, const float dt,
+    const float dt_therm, const double time) {
 
   /* Nothing to do here? */
   if (dt == 0.) return;
diff --git a/src/cooling/EAGLE/cooling.c b/src/cooling/EAGLE/cooling.c
index 3a6fb523b83ba34d7fea2ff981a5c899e9f04a1c..9d5a5786230d661dd39877154358bce094d9c6a6 100644
--- a/src/cooling/EAGLE/cooling.c
+++ b/src/cooling/EAGLE/cooling.c
@@ -372,10 +372,10 @@ INLINE static double bisection_iter(
  * @param cooling The #cooling_function_data used in the run.
  * @param p Pointer to the particle data.
  * @param xp Pointer to the extended particle data.
- * @param time The current time (since the Big Bang or start of the run) in
- * internal units.
  * @param dt The cooling time-step of this particle.
  * @param dt_therm The hydro time-step of this particle.
+ * @param time The current time (since the Big Bang or start of the run) in
+ * internal units.
  */
 void cooling_cool_part(const struct phys_const *phys_const,
                        const struct unit_system *us,
@@ -384,8 +384,8 @@ void cooling_cool_part(const struct phys_const *phys_const,
                        const struct entropy_floor_properties *floor_props,
                        const struct cooling_function_data *cooling,
                        struct part *restrict p, struct xpart *restrict xp,
-                       const double time, const float dt,
-                       const float dt_therm) {
+                       const float dt, const float dt_therm,
+                       const double time) {
 
   /* No cooling happens over zero time */
   if (dt == 0.) return;
diff --git a/src/cooling/EAGLE/cooling.h b/src/cooling/EAGLE/cooling.h
index 90c1c01e82c7eec6d6615546866083afdfc1e328..760e455a05e518027bb8ffd66954531d406591be 100644
--- a/src/cooling/EAGLE/cooling.h
+++ b/src/cooling/EAGLE/cooling.h
@@ -44,7 +44,7 @@ void cooling_cool_part(const struct phys_const *phys_const,
                        const struct entropy_floor_properties *floor_props,
                        const struct cooling_function_data *cooling,
                        struct part *restrict p, struct xpart *restrict xp,
-                       const double time, const float dt, const float dt_therm);
+                       const float dt, const float dt_therm, const double time);
 
 float cooling_timestep(const struct cooling_function_data *restrict cooling,
                        const struct phys_const *restrict phys_const,
diff --git a/src/cooling/const_du/cooling.h b/src/cooling/const_du/cooling.h
index de5164e59498a1a17c62fecfed0b0f61bcf2a2c2..69424fa331b60964fd7448b08ecc59d7751a492d 100644
--- a/src/cooling/const_du/cooling.h
+++ b/src/cooling/const_du/cooling.h
@@ -76,9 +76,10 @@ INLINE static void cooling_update(const struct cosmology* cosmo,
  * @param cooling The #cooling_function_data used in the run.
  * @param p Pointer to the particle data.
  * @param xp Pointer to the extended particle data.
- * @param time The current time.
  * @param dt The time-step of this particle.
  * @param dt_therm The time-step operator used for thermal quantities.
+ * @param time Time since Big Bang (or start of the simulation) in internal
+ * units.
  */
 __attribute__((always_inline)) INLINE static void cooling_cool_part(
     const struct phys_const* restrict phys_const,
@@ -87,8 +88,8 @@ __attribute__((always_inline)) INLINE static void cooling_cool_part(
     const struct hydro_props* hydro_props,
     const struct entropy_floor_properties* floor_props,
     const struct cooling_function_data* restrict cooling,
-    struct part* restrict p, struct xpart* restrict xp, const double time,
-    const float dt, const float dt_therm) {
+    struct part* restrict p, struct xpart* restrict xp, const float dt,
+    const float dt_therm, const double time) {
 
   /* Internal energy floor */
   const float u_floor = cooling->min_energy;
diff --git a/src/cooling/const_lambda/cooling.h b/src/cooling/const_lambda/cooling.h
index ef56b6e3575d11f767af8061e303f6ffe9248d7a..1597687bf6feec49f88f8a2f2b4bba05ecbf5939 100644
--- a/src/cooling/const_lambda/cooling.h
+++ b/src/cooling/const_lambda/cooling.h
@@ -106,9 +106,10 @@ __attribute__((always_inline)) INLINE static double cooling_rate_cgs(
  * @param cooling The #cooling_function_data used in the run.
  * @param p Pointer to the particle data.
  * @param xp Pointer to the particle' extended data.
- * @param time The current time.
  * @param dt The time-step of this particle.
  * @param dt_therm The time-step operator used for thermal quantities.
+ * @param time Time since Big Bang (or start of the simulation) in internal
+ * units.
  */
 __attribute__((always_inline)) INLINE static void cooling_cool_part(
     const struct phys_const* restrict phys_const,
@@ -117,8 +118,8 @@ __attribute__((always_inline)) INLINE static void cooling_cool_part(
     const struct hydro_props* hydro_props,
     const struct entropy_floor_properties* floor_props,
     const struct cooling_function_data* restrict cooling,
-    struct part* restrict p, struct xpart* restrict xp, const double time,
-    const float dt, const float dt_therm) {
+    struct part* restrict p, struct xpart* restrict xp, const float dt,
+    const float dt_therm, const double time) {
 
   /* Nothing to do here? */
   if (dt == 0.) return;
diff --git a/src/cooling/grackle/cooling.c b/src/cooling/grackle/cooling.c
index 6b74b72aefa0bb64717f83d9b5a54d497e6ae260..b51afd83f0be3fee94e10f1e03ea24303b621a26 100644
--- a/src/cooling/grackle/cooling.c
+++ b/src/cooling/grackle/cooling.c
@@ -748,9 +748,10 @@ void cooling_apply(struct part* restrict p, struct xpart* restrict xp,
  * @param cooling The #cooling_function_data used in the run.
  * @param p Pointer to the particle data.
  * @param xp Pointer to the particle' extended data.
- * @param time The current time.
  * @param dt The time-step of this particle.
  * @param dt_therm The time-step operator used for thermal quantities.
+ * @param time The current time (since the Big Bang or start of the run) in
+ * internal units.
  */
 void cooling_cool_part(const struct phys_const* restrict phys_const,
                        const struct unit_system* restrict us,
@@ -759,8 +760,8 @@ void cooling_cool_part(const struct phys_const* restrict phys_const,
                        const struct entropy_floor_properties* floor_props,
                        const struct cooling_function_data* restrict cooling,
                        struct part* restrict p, struct xpart* restrict xp,
-                       const double time, const double dt,
-                       const double dt_therm) {
+                       const double dt, const double dt_therm,
+                       const double time) {
 
   /* Nothing to do here? */
   if (dt == 0.) return;
diff --git a/src/cooling/grackle/cooling.h b/src/cooling/grackle/cooling.h
index 209b986c6f39595a7f60b64539c97e3292e79571..c5230c077b29bfdce39b34931c9f0107c490ee14 100644
--- a/src/cooling/grackle/cooling.h
+++ b/src/cooling/grackle/cooling.h
@@ -115,8 +115,8 @@ void cooling_cool_part(const struct phys_const* restrict phys_const,
                        const struct entropy_floor_properties* floor_props,
                        const struct cooling_function_data* restrict cooling,
                        struct part* restrict p, struct xpart* restrict xp,
-                       const double time, const double dt,
-                       const double dt_therm);
+                       const double dt, const double dt_therm,
+                       const double time);
 
 float cooling_get_temperature(
     const struct phys_const* restrict phys_const,
diff --git a/src/cooling/none/cooling.h b/src/cooling/none/cooling.h
index 107ea4b88a24d2505b8cceb346672f855df7b9da..abd29bfb84d23ec1b701f19479dbd71eac3a1555 100644
--- a/src/cooling/none/cooling.h
+++ b/src/cooling/none/cooling.h
@@ -63,9 +63,10 @@ INLINE static void cooling_update(const struct cosmology* cosmo,
  * @param cooling The #cooling_function_data used in the run.
  * @param p Pointer to the particle data.
  * @param xp Pointer to the extended particle data.
- * @param time The current time.
  * @param dt The time-step of this particle.
  * @param dt_therm The time-step operator used for thermal quantities.
+ * @param time The current time (since the Big Bang or start of the run) in
+ * internal units.
  */
 __attribute__((always_inline)) INLINE static void cooling_cool_part(
     const struct phys_const* restrict phys_const,
@@ -74,8 +75,8 @@ __attribute__((always_inline)) INLINE static void cooling_cool_part(
     const struct hydro_props* hydro_props,
     const struct entropy_floor_properties* floor_props,
     const struct cooling_function_data* restrict cooling,
-    struct part* restrict p, struct xpart* restrict xp, const double time,
-    const float dt, const float dt_therm) {}
+    struct part* restrict p, struct xpart* restrict xp, const float dt,
+    const float dt_therm, const double time) {}
 
 /**
  * @brief Computes the cooling time-step.
diff --git a/src/debug.c b/src/debug.c
index fba8db46661e9624f9332c74d0d76c435cb9a6d2..1bfe5d2377ef6f6f3f8f4733d244a76635c60e6f 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -375,31 +375,11 @@ static void dumpCells_map(struct cell *c, void *data) {
   size_t *ldata = (size_t *)data;
   FILE *file = (FILE *)ldata[0];
   struct engine *e = (struct engine *)ldata[1];
-  float ntasks = c->nr_tasks;
   int super = (int)ldata[2];
   int active = (int)ldata[3];
   int mpiactive = (int)ldata[4];
   int pactive = (int)ldata[5];
 
-#if SWIFT_DEBUG_CHECKS
-  /* The c->nr_tasks field does not include all the tasks. So let's check this
-   * the hard way. Note pairs share the task 50/50 with the other cell. */
-  ntasks = 0.0f;
-  struct task *tasks = e->sched.tasks;
-  int nr_tasks = e->sched.nr_tasks;
-  for (int k = 0; k < nr_tasks; k++) {
-    if (tasks[k].cj == NULL) {
-      if (c == tasks[k].ci) {
-        ntasks = ntasks + 1.0f;
-      }
-    } else {
-      if (c == tasks[k].ci || c == tasks[k].cj) {
-        ntasks = ntasks + 0.5f;
-      }
-    }
-  }
-#endif
-
   /* Only cells with particles are dumped. */
   if (c->hydro.count > 0 || c->grav.count > 0 || c->stars.count > 0) {
 
@@ -429,6 +409,36 @@ static void dumpCells_map(struct cell *c, void *data) {
         (!super || ((super && c->super == c) || (c->parent == NULL))) &&
         active && mpiactive) {
 
+      /* The c->nr_tasks field does not include all the tasks. So let's check
+       * this the hard way. Note pairs share the task 50/50 with the other
+       * cell. Also accumulate all the time used by tasks of this cell and
+       * form some idea of the effective task depth. */
+      float ntasks = 0.0f;
+      struct task *tasks = e->sched.tasks;
+      int nr_tasks = e->sched.nr_tasks;
+      double ticsum = 0.0; /* Sum of work for this cell. */
+      double dsum = 0.0;
+      for (int k = 0; k < nr_tasks; k++) {
+        if (tasks[k].cj == NULL) {
+          if (tasks[k].ci != NULL) {
+            if (c == tasks[k].ci || c == tasks[k].ci->super) {
+              ntasks = ntasks + 1.0f;
+              ticsum += (tasks[k].toc - tasks[k].tic);
+              dsum += tasks[k].ci->depth;
+            }
+          }
+        } else {
+          if (c == tasks[k].ci || c == tasks[k].ci->super || c == tasks[k].cj ||
+              c == tasks[k].cj->super) {
+            ntasks = ntasks + 0.5f;
+            ticsum += 0.5 * (tasks[k].toc - tasks[k].tic);
+            if (tasks[k].ci != NULL) dsum += (tasks[k].ci->depth * 0.5);
+            dsum += (tasks[k].cj->depth * 0.5);
+          }
+        }
+      }
+      dsum /= (double)ntasks;
+
       /* If requested we work out how many particles are active in this cell. */
       int pactcount = 0;
       if (pactive) {
@@ -443,15 +453,16 @@ static void dumpCells_map(struct cell *c, void *data) {
           if (spart_is_active(&sparts[k], e)) pactcount++;
       }
 
-      fprintf(file,
-              "  %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d %6d %6d "
-              "%6.1f %20lld %6d %6d %6d %6d %6d %6d %6d\n",
-              c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1],
-              c->width[2], e->step, c->hydro.count, c->grav.count,
-              c->stars.count, pactcount, c->depth, ntasks, c->hydro.ti_end_min,
-              get_time_bin(c->hydro.ti_end_min), (c->super == c),
-              (c->parent == NULL), cell_is_active_hydro(c, e), c->nodeID,
-              c->nodeID == e->nodeID, ismpiactive);
+      fprintf(
+          file,
+          "  %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d %6d %6d %6d "
+          "%6.1f %20lld %6d %6d %6d %6d %6d %6d %6d %f %f\n",
+          c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1],
+          c->width[2], e->step, c->hydro.count, c->grav.count, c->stars.count,
+          pactcount, c->depth, c->maxdepth, ntasks, c->hydro.ti_end_min,
+          get_time_bin(c->hydro.ti_end_min), (c->super == c),
+          (c->parent == NULL), cell_is_active_hydro(c, e), c->nodeID,
+          c->nodeID == e->nodeID, ismpiactive, ticsum, dsum);
     }
   }
 }
@@ -483,11 +494,12 @@ void dumpCells(const char *prefix, int super, int active, int mpiactive,
 
   /* Header. */
   fprintf(file,
-          "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s "
-          "%20s %6s %6s %6s %6s %6s %6s %6s\n",
+          "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s "
+          "%20s %6s %6s %6s %6s %6s %6s %6s %6s %6s\n",
           "x", "y", "z", "xw", "yw", "zw", "step", "count", "gcount", "scount",
-          "actcount", "depth", "tasks", "ti_end_min", "timebin", "issuper",
-          "istop", "active", "rank", "local", "mpiactive");
+          "actcount", "depth", "maxdepth", "tasks", "ti_end_min", "timebin",
+          "issuper", "istop", "active", "rank", "local", "mpiactive", "ticsum",
+          "avedepth");
 
   size_t data[6];
   data[0] = (size_t)file;
diff --git a/src/engine.c b/src/engine.c
index c1e294a442b7a78091ba0bf0af6e9f965a57d024..4d7d054de41d19af2c16fc9cda5b10f637ac3729 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -65,6 +65,7 @@
 #include "equation_of_state.h"
 #include "error.h"
 #include "feedback.h"
+#include "fof.h"
 #include "gravity.h"
 #include "gravity_cache.h"
 #include "hydro.h"
@@ -73,6 +74,7 @@
 #include "map.h"
 #include "memuse.h"
 #include "minmax.h"
+#include "mpiuse.h"
 #include "outputlist.h"
 #include "parallel_io.h"
 #include "part.h"
@@ -210,8 +212,9 @@ void engine_repartition(struct engine *e) {
   /* Task arrays. */
   scheduler_free_tasks(&e->sched);
 
-  /* Foreign parts. */
-  space_free_foreign_parts(e->s);
+  /* Foreign parts. (no need to nullify the cell pointers as the cells
+   * will be regenerated) */
+  space_free_foreign_parts(e->s, /*clear_cell_pointers=*/0);
 
   /* Now comes the tricky part: Exchange particles between all nodes.
      This is done in two steps, first allreducing a matrix of
@@ -253,79 +256,169 @@ void engine_repartition_trigger(struct engine *e) {
 #ifdef WITH_MPI
 
   const ticks tic = getticks();
+  static int opened = 0;
 
   /* Do nothing if there have not been enough steps since the last repartition
    * as we don't want to repeat this too often or immediately after a
-   * repartition step. Also nothing to do when requested. */
-  if (e->step - e->last_repartition >= 2 &&
-      e->reparttype->type != REPART_NONE) {
+   * repartition step. We attempt all this even when we are not repartitioning
+   * as the balance logs can still be interesting. */
+  if (e->step - e->last_repartition >= 2) {
 
     /* If we have fixed costs available and this is step 2 or we are forcing
-     * repartitioning then we do a fixed costs one now. */
-    if (e->reparttype->trigger > 1 ||
-        (e->step == 2 && e->reparttype->use_fixed_costs)) {
+     * repartitioning then we do a forced fixed costs repartition regardless. */
+    int forced = 0;
+    if (e->reparttype->type != REPART_NONE) {
+      if (e->reparttype->trigger > 1 ||
+          (e->step == 2 && e->reparttype->use_fixed_costs)) {
+        if (e->reparttype->trigger > 1) {
+          if ((e->step % (int)e->reparttype->trigger) == 0) e->forcerepart = 1;
+        } else {
+          e->forcerepart = 1;
+        }
+        e->reparttype->use_ticks = 0;
+        forced = 1;
+      }
+    }
 
-      if (e->reparttype->trigger > 1) {
-        if ((e->step % (int)e->reparttype->trigger) == 0) e->forcerepart = 1;
+    /* We only check the CPU loads when we have processed a significant number
+     * of all particles as we require all tasks to have timings or are
+     * interested in the various balances logs. */
+    if ((e->updates > 1 &&
+         e->updates >= e->total_nr_parts * e->reparttype->minfrac) ||
+        (e->g_updates > 1 &&
+         e->g_updates >= e->total_nr_gparts * e->reparttype->minfrac)) {
+
+      /* Are we using the task tick timings or fixed costs? */
+      if (e->reparttype->use_fixed_costs > 1) {
+        e->reparttype->use_ticks = 0;
       } else {
-        e->forcerepart = 1;
+        e->reparttype->use_ticks = 1;
       }
-      e->reparttype->use_ticks = 0;
 
-    } else {
+      /* Get the resident size of the process for the memory logs. */
+      long size, resident, shared, text, library, data, dirty;
+      memuse_use(&size, &resident, &shared, &text, &data, &library, &dirty);
+
+      /* Gather it together with the CPU times used by the tasks in the last
+       * step. */
+      double timemem[3] = {e->usertime_last_step, e->systime_last_step,
+                           (double)resident};
+      double timemems[e->nr_nodes * 3];
+      MPI_Gather(&timemem, 3, MPI_DOUBLE, timemems, 3, MPI_DOUBLE, 0,
+                 MPI_COMM_WORLD);
+      if (e->nodeID == 0) {
+
+        /* Get the range and mean of the two CPU times and memory. */
+        double umintime = timemems[0];
+        double umaxtime = timemems[0];
+
+        double smintime = timemems[1];
+        double smaxtime = timemems[1];
+
+        double minmem = timemems[2];
+        double maxmem = timemems[2];
+
+        double tmintime = umintime + smintime;
+        double tmaxtime = umaxtime + smaxtime;
+
+        double usum = timemems[0];
+        double ssum = timemems[1];
+        double tsum = usum + ssum;
+
+        double msum = timemems[2];
+
+        for (int k = 3; k < e->nr_nodes * 3; k += 3) {
+          if (timemems[k] > umaxtime) umaxtime = timemems[k];
+          if (timemems[k] < umintime) umintime = timemems[k];
+
+          if (timemems[k + 1] > smaxtime) smaxtime = timemems[k + 1];
+          if (timemems[k + 1] < smintime) smintime = timemems[k + 1];
+
+          double total = timemems[k] + timemems[k + 1];
+          if (total > tmaxtime) tmaxtime = total;
+          if (total < tmintime) tmintime = total;
+
+          usum += timemems[k];
+          ssum += timemems[k + 1];
+          tsum += total;
+
+          if (timemems[k + 2] > maxmem) maxmem = timemems[k + 2];
+          if (timemems[k + 2] < minmem) minmem = timemems[k + 2];
+          msum += timemems[k + 2];
+        }
+        double umean = usum / (double)e->nr_nodes;
+        double smean = ssum / (double)e->nr_nodes;
+        double tmean = tsum / (double)e->nr_nodes;
+        double mmean = msum / (double)e->nr_nodes;
+
+        /* Are we out of balance and need to repartition? */
+        /* ---------------------------------------------- */
+        double abs_trigger = fabs(e->reparttype->trigger);
+        double balance = (umaxtime - umintime) / umean;
+        if (e->reparttype->type != REPART_NONE) {
+
+          /* When forced we don't care about the balance. */
+          if (!forced) {
+            if (balance > abs_trigger) {
+              if (e->verbose)
+                message("trigger fraction %.3f > %.3f will repartition",
+                        balance, abs_trigger);
+              e->forcerepart = 1;
+            } else {
+              if (e->verbose && e->reparttype->type != REPART_NONE)
+                message("trigger fraction %.3f =< %.3f will not repartition",
+                        balance, abs_trigger);
+            }
+          }
 
-      /* It is only worth checking the CPU loads when we have processed a
-       * significant number of all particles as we require all tasks to have
-       * timings. */
-      if ((e->updates > 1 &&
-           e->updates >= e->total_nr_parts * e->reparttype->minfrac) ||
-          (e->g_updates > 1 &&
-           e->g_updates >= e->total_nr_gparts * e->reparttype->minfrac)) {
-
-        /* Should we are use the task timings or fixed costs. */
-        if (e->reparttype->use_fixed_costs > 1) {
-          e->reparttype->use_ticks = 0;
         } else {
-          e->reparttype->use_ticks = 1;
+          /* Not repartitioning, would that have been done otherwise? */
+          if (e->verbose)
+            message("trigger fraction %.3f > %.3f would have repartitioned",
+                    balance, abs_trigger);
         }
 
-        /* Get CPU time used since the last call to this function. */
-        double elapsed_cputime =
-            clocks_get_cputime_used() - e->cputime_last_step;
-
-        /* Gather the elapsed CPU times from all ranks for the last step. */
-        double elapsed_cputimes[e->nr_nodes];
-        MPI_Gather(&elapsed_cputime, 1, MPI_DOUBLE, elapsed_cputimes, 1,
-                   MPI_DOUBLE, 0, MPI_COMM_WORLD);
-        if (e->nodeID == 0) {
-
-          /* Get the range and mean of cputimes. */
-          double mintime = elapsed_cputimes[0];
-          double maxtime = elapsed_cputimes[0];
-          double sum = elapsed_cputimes[0];
-          for (int k = 1; k < e->nr_nodes; k++) {
-            if (elapsed_cputimes[k] > maxtime) maxtime = elapsed_cputimes[k];
-            if (elapsed_cputimes[k] < mintime) mintime = elapsed_cputimes[k];
-            sum += elapsed_cputimes[k];
-          }
-          double mean = sum / (double)e->nr_nodes;
-
-          /* Are we out of balance? */
-          double abs_trigger = fabs(e->reparttype->trigger);
-          if (((maxtime - mintime) / mean) > abs_trigger) {
-            if (e->verbose)
-              message("trigger fraction %.3f > %.3f will repartition",
-                      (maxtime - mintime) / mean, abs_trigger);
-            e->forcerepart = 1;
-          } else {
-            if (e->verbose)
-              message("trigger fraction %.3f =< %.3f will not repartition",
-                      (maxtime - mintime) / mean, abs_trigger);
-          }
+        /* Keep logs of all CPU times and resident memory size for debugging
+         * load issues. */
+        FILE *timelog = NULL;
+        FILE *memlog = NULL;
+        if (!opened) {
+          timelog = fopen("rank_cpu_balance.log", "w");
+          fprintf(timelog, "# step rank user sys sum\n");
+
+          memlog = fopen("rank_memory_balance.log", "w");
+          fprintf(memlog, "# step rank resident\n");
+
+          opened = 1;
+        } else {
+          timelog = fopen("rank_cpu_balance.log", "a");
+          memlog = fopen("rank_memory_balance.log", "a");
+        }
+
+        for (int k = 0; k < e->nr_nodes * 3; k += 3) {
+
+          fprintf(timelog, "%d %d %f %f %f\n", e->step, k / 3, timemems[k],
+                  timemems[k + 1], timemems[k] + timemems[k + 1]);
+
+          fprintf(memlog, "%d %d %ld\n", e->step, k / 3, (long)timemems[k + 2]);
         }
+
+        fprintf(timelog, "# %d mean times: %f %f %f\n", e->step, umean, smean,
+                tmean);
+        if (abs_trigger > 1.f) abs_trigger = 0.f; /* Not relevant. */
+        fprintf(timelog,
+                "# %d balance: %f, expected: %f (sys: %f, total: %f)\n",
+                e->step, balance, abs_trigger, (smaxtime - smintime) / smean,
+                (tmaxtime - tmintime) / tmean);
+
+        fclose(timelog);
+
+        fprintf(memlog, "# %d mean resident memory: %f, balance: %f\n", e->step,
+                mmean, (maxmem - minmem) / mmean);
+        fclose(memlog);
       }
 
-      /* All nodes do this together. */
+      /* All nodes do this together, so send to other ranks. */
       MPI_Bcast(&e->forcerepart, 1, MPI_INT, 0, MPI_COMM_WORLD);
     }
 
@@ -333,10 +426,6 @@ void engine_repartition_trigger(struct engine *e) {
     if (e->forcerepart) e->last_repartition = e->step;
   }
 
-  /* We always reset CPU time for next check, unless it will not be used. */
-  if (e->reparttype->type != REPART_NONE)
-    e->cputime_last_step = clocks_get_cputime_used();
-
   if (e->verbose)
     message("took %.3f %s", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
@@ -1282,7 +1371,7 @@ void engine_print_task_counts(const struct engine *e) {
   for (int k = 0; k <= task_type_count; k++) counts[k] = 0;
   threadpool_map((struct threadpool *)&e->threadpool,
                  engine_do_tasks_count_mapper, (void *)tasks, nr_tasks,
-                 sizeof(struct task), 0, counts);
+                 sizeof(struct task), threadpool_auto_chunk_size, counts);
 
 #ifdef WITH_MPI
   printf("[%04i] %s engine_print_task_counts: task counts are [ %s=%i",
@@ -2340,11 +2429,29 @@ void engine_step(struct engine *e) {
     gravity_exact_force_compute(e->s, e);
 #endif
 
+    /* Get current CPU times.*/
+#ifdef WITH_MPI
+  double start_usertime = 0.0;
+  double start_systime = 0.0;
+  if (e->reparttype->type != REPART_NONE) {
+    clocks_get_cputimes_used(&start_usertime, &start_systime);
+  }
+#endif
+
   /* Start all the tasks. */
   TIMER_TIC;
   engine_launch(e, "tasks");
   TIMER_TOC(timer_runners);
 
+  /* Now record the CPU times used by the tasks. */
+#ifdef WITH_MPI
+  double end_usertime = 0.0;
+  double end_systime = 0.0;
+  clocks_get_cputimes_used(&end_usertime, &end_systime);
+  e->usertime_last_step = end_usertime - start_usertime;
+  e->systime_last_step = end_systime - start_systime;
+#endif
+
   /* Since the time-steps may have changed because of the limiter's
    * action, we need to communicate the new time-step sizes */
   if ((e->policy & engine_policy_timestep_sync) ||
@@ -2715,7 +2822,8 @@ void engine_reconstruct_multipoles(struct engine *e) {
 #endif
 
   threadpool_map(&e->threadpool, engine_do_reconstruct_multipoles_mapper,
-                 e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 0, e);
+                 e->s->cells_top, e->s->nr_cells, sizeof(struct cell),
+                 threadpool_auto_chunk_size, e);
 
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -3105,17 +3213,10 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   swift_free("gparts", s->gparts);
   s->gparts = gparts_new;
 
-  /* Re-link the parts. */
-  if (s->nr_parts > 0 && s->nr_gparts > 0)
-    part_relink_parts_to_gparts(s->gparts, s->nr_gparts, s->parts);
-
-  /* Re-link the sparts. */
-  if (s->nr_sparts > 0 && s->nr_gparts > 0)
-    part_relink_sparts_to_gparts(s->gparts, s->nr_gparts, s->sparts);
-
-  /* Re-link the bparts. */
-  if (s->nr_bparts > 0 && s->nr_gparts > 0)
-    part_relink_bparts_to_gparts(s->gparts, s->nr_gparts, s->bparts);
+  /* Re-link everything to the gparts. */
+  if (s->nr_gparts > 0)
+    part_relink_all_parts_to_gparts(s->gparts, s->nr_gparts, s->parts,
+                                    s->sparts, s->bparts, &e->threadpool);
 
 #ifdef SWIFT_DEBUG_CHECKS
 
@@ -3383,6 +3484,72 @@ void engine_unpin(void) {
 #endif
 }
 
+#ifdef SWIFT_DUMPER_THREAD
+/**
+ * @brief dumper thread action, checks got the existence of the .dump file
+ * every 5 seconds and does the dump if found.
+ *
+ * @param p the #engine
+ */
+static void *engine_dumper_poll(void *p) {
+  struct engine *e = (struct engine *)p;
+  while (1) {
+    if (access(".dump", F_OK) == 0) {
+
+      /* OK, do our work. */
+      message("Dumping engine tasks in step: %d", e->step);
+      task_dump_active(e);
+
+#ifdef SWIFT_MEMUSE_REPORTS
+      /* Dump the currently logged memory. */
+      message("Dumping memory use report");
+      memuse_log_dump_error(e->nodeID);
+#endif
+
+#if defined(SWIFT_MPIUSE_REPORTS) && defined(WITH_MPI)
+      /* Dump the MPI interactions in the step. */
+      mpiuse_log_dump_error(e->nodeID);
+#endif
+
+      /* Add more interesting diagnostics. */
+      scheduler_dump_queues(e);
+
+      /* Delete the file. */
+      unlink(".dump");
+      message("Dumping completed");
+      fflush(stdout);
+    }
+
+    /* Take a breath. */
+    sleep(5);
+  }
+  return NULL;
+}
+#endif /* SWIFT_DUMPER_THREAD */
+
+#ifdef SWIFT_DUMPER_THREAD
+/**
+ * @brief creates the dumper thread.
+ *
+ * This watches for the creation of a ".dump" file in the current directory
+ * and if found dumps the current state of the tasks and memory use (if also
+ * configured).
+ *
+ * @param e the #engine
+ *
+ */
+static void engine_dumper_init(struct engine *e) {
+  pthread_t dumper;
+
+  /* Make sure the .dump file is not present, that is bad when starting up. */
+  struct stat buf;
+  if (stat(".dump", &buf) == 0) unlink(".dump");
+
+  /* Thread does not exit, so nothing to do but create it. */
+  pthread_create(&dumper, NULL, &engine_dumper_poll, e);
+}
+#endif /* SWIFT_DUMPER_THREAD */
+
 /**
  * @brief init an engine struct with the necessary properties for the
  *        simulation.
@@ -3512,7 +3679,8 @@ void engine_init(struct engine *e, struct space *s, struct swift_params *params,
   e->parameter_file = params;
   e->stf_this_timestep = 0;
 #ifdef WITH_MPI
-  e->cputime_last_step = 0;
+  e->usertime_last_step = 0.0;
+  e->systime_last_step = 0.0;
   e->last_repartition = 0;
 #endif
   e->total_nr_cells = 0;
@@ -4283,6 +4451,12 @@ void engine_config(int restart, int fof, struct engine *e,
   }
 #endif
 
+#ifdef SWIFT_DUMPER_THREAD
+
+  /* Start the dumper thread.*/
+  engine_dumper_init(e);
+#endif
+
   /* Wait for the runner threads to be in place. */
   swift_barrier_wait(&e->wait_barrier);
 }
diff --git a/src/engine.h b/src/engine.h
index a30d8943b8476f91915687988b42b0c074572397..f4de5d6a76205bda533e97790e4954a1d801e08b 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -360,8 +360,9 @@ struct engine {
   ticks tic_step, toc_step;
 
 #ifdef WITH_MPI
-  /* CPU time of the last step. */
-  double cputime_last_step;
+  /* CPU times that the tasks used in the last step. */
+  double usertime_last_step;
+  double systime_last_step;
 
   /* Step of last repartition. */
   int last_repartition;
@@ -476,6 +477,9 @@ struct engine {
   /* Maximum number of tasks needed for restarting. */
   int restart_max_tasks;
 
+  /* The globally agreed runtime, in hours. */
+  float runtime;
+
   /* Label of the run */
   char run_name[PARSER_MAX_LINE_SIZE];
 
diff --git a/src/engine_collect_end_of_step.c b/src/engine_collect_end_of_step.c
index 458a0a7a40e9e433f0241b04ff67fa1fbc37bbf0..0973399c11146baef3b3be60b3635426021b23cb 100644
--- a/src/engine_collect_end_of_step.c
+++ b/src/engine_collect_end_of_step.c
@@ -44,6 +44,7 @@ struct end_of_step_data {
       ti_black_holes_beg_max;
   struct engine *e;
   struct star_formation_history sfh;
+  float runtime;
 };
 
 /**
@@ -455,13 +456,16 @@ void engine_collect_end_of_step(struct engine *e, int apply) {
   data.ti_black_holes_end_max = 0, data.ti_black_holes_beg_max = 0;
   data.e = e;
 
+  /* Need to use a consistent check of the hours since we started. */
+  data.runtime = clocks_get_hours_since_start();
+
   /* Initialize the total SFH of the simulation to zero */
   star_formation_logger_init(&data.sfh);
 
   /* Collect information from the local top-level cells */
   threadpool_map(&e->threadpool, engine_collect_end_of_step_mapper,
                  s->local_cells_with_tasks_top, s->nr_local_cells_with_tasks,
-                 sizeof(int), 0, &data);
+                 sizeof(int), threadpool_auto_chunk_size, &data);
 
   /* Get the number of inhibited particles from the space-wide counters
    * since these have been updated atomically during the time-steps. */
@@ -480,7 +484,8 @@ void engine_collect_end_of_step(struct engine *e, int apply) {
       data.ti_stars_beg_max, data.ti_black_holes_end_min,
       data.ti_black_holes_end_max, data.ti_black_holes_beg_max, e->forcerebuild,
       e->s->tot_cells, e->sched.nr_tasks,
-      (float)e->sched.nr_tasks / (float)e->s->tot_cells, data.sfh);
+      (float)e->sched.nr_tasks / (float)e->s->tot_cells, data.sfh,
+      data.runtime);
 
 /* Aggregate collective data from the different nodes for this step. */
 #ifdef WITH_MPI
diff --git a/src/engine_drift.c b/src/engine_drift.c
index fee9ed1f11de2d5e372bd70554bd053b8313ac80..26f8160200c5bdc51f7dab51b56e9a4d771d3d39 100644
--- a/src/engine_drift.c
+++ b/src/engine_drift.c
@@ -293,28 +293,28 @@ void engine_drift_all(struct engine *e, const int drift_mpoles) {
     if (e->s->nr_parts > 0) {
       threadpool_map(&e->threadpool, engine_do_drift_all_part_mapper,
                      e->s->local_cells_top, e->s->nr_local_cells, sizeof(int),
-                     /* default chunk */ 0, e);
+                     threadpool_auto_chunk_size, e);
     }
     if (e->s->nr_gparts > 0) {
       threadpool_map(&e->threadpool, engine_do_drift_all_gpart_mapper,
                      e->s->local_cells_top, e->s->nr_local_cells, sizeof(int),
-                     /* default chunk */ 0, e);
+                     threadpool_auto_chunk_size, e);
     }
     if (e->s->nr_sparts > 0) {
       threadpool_map(&e->threadpool, engine_do_drift_all_spart_mapper,
                      e->s->local_cells_top, e->s->nr_local_cells, sizeof(int),
-                     /* default chunk */ 0, e);
+                     threadpool_auto_chunk_size, e);
     }
     if (e->s->nr_bparts > 0) {
       threadpool_map(&e->threadpool, engine_do_drift_all_bpart_mapper,
                      e->s->local_cells_top, e->s->nr_local_cells, sizeof(int),
-                     /* default chunk */ 0, e);
+                     threadpool_auto_chunk_size, e);
     }
     if (drift_mpoles && (e->policy & engine_policy_self_gravity)) {
       threadpool_map(&e->threadpool, engine_do_drift_all_multipole_mapper,
                      e->s->local_cells_with_tasks_top,
                      e->s->nr_local_cells_with_tasks, sizeof(int),
-                     /* default chunk */ 0, e);
+                     threadpool_auto_chunk_size, e);
     }
 
   } else {
@@ -325,27 +325,27 @@ void engine_drift_all(struct engine *e, const int drift_mpoles) {
     if (e->s->nr_parts > 0) {
       threadpool_map(&e->threadpool, engine_do_drift_all_part_mapper,
                      e->s->cells_top, e->s->nr_cells, sizeof(struct cell),
-                     /* default chunk */ 0, e);
+                     threadpool_auto_chunk_size, e);
     }
     if (e->s->nr_sparts > 0) {
       threadpool_map(&e->threadpool, engine_do_drift_all_spart_mapper,
                      e->s->cells_top, e->s->nr_cells, sizeof(struct cell),
-                     /* default chunk */ 0, e);
+                     threadpool_auto_chunk_size, e);
     }
     if (e->s->nr_bparts > 0) {
       threadpool_map(&e->threadpool, engine_do_drift_all_bpart_mapper,
                      e->s->cells_top, e->s->nr_cells, sizeof(struct cell),
-                     /* default chunk */ 0, e);
+                     threadpool_auto_chunk_size, e);
     }
     if (e->s->nr_gparts > 0) {
       threadpool_map(&e->threadpool, engine_do_drift_all_gpart_mapper,
                      e->s->cells_top, e->s->nr_cells, sizeof(struct cell),
-                     /* default chunk */ 0, e);
+                     threadpool_auto_chunk_size, e);
     }
     if (e->policy & engine_policy_self_gravity) {
       threadpool_map(&e->threadpool, engine_do_drift_all_multipole_mapper,
                      e->s->cells_top, e->s->nr_cells, sizeof(struct cell),
-                     /* default chunk */ 0, e);
+                     threadpool_auto_chunk_size, e);
     }
   }
 
@@ -401,7 +401,8 @@ void engine_drift_top_multipoles(struct engine *e) {
   const ticks tic = getticks();
 
   threadpool_map(&e->threadpool, engine_do_drift_top_multipoles_mapper,
-                 e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 0, e);
+                 e->s->cells_top, e->s->nr_cells, sizeof(struct cell),
+                 threadpool_auto_chunk_size, e);
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Check that all cells have been drifted to the current time. */
diff --git a/src/engine_fof.c b/src/engine_fof.c
index 961b7e966ecc33250c3d005d3a31d59200f98fb0..bbecc54a9dd5a5aa9cbdb62b15a0fc621791ebb5 100644
--- a/src/engine_fof.c
+++ b/src/engine_fof.c
@@ -25,6 +25,9 @@
 /* This object's header. */
 #include "engine.h"
 
+/* Local headers. */
+#include "fof.h"
+
 /**
  * @brief Activate all the #gpart communications in preparation
  * fof a call to FOF.
diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c
index 5d8d3a5fed14330776ccd835664804427d1cb370..9f907f33b4b0173a32292c58e9b1a1fd8bad3312 100644
--- a/src/engine_maketasks.c
+++ b/src/engine_maketasks.c
@@ -3123,7 +3123,7 @@ void engine_make_fof_tasks(struct engine *e) {
   /* Construct a FOF loop over neighbours */
   if (e->policy & engine_policy_fof)
     threadpool_map(&e->threadpool, engine_make_fofloop_tasks_mapper, NULL,
-                   s->nr_cells, 1, 0, e);
+                   s->nr_cells, 1, threadpool_auto_chunk_size, e);
 
   if (e->verbose)
     message("Making FOF tasks took %.3f %s.",
@@ -3180,7 +3180,7 @@ void engine_maketasks(struct engine *e) {
   /* Construct the first hydro loop over neighbours */
   if (e->policy & engine_policy_hydro)
     threadpool_map(&e->threadpool, engine_make_hydroloop_tasks_mapper, NULL,
-                   s->nr_cells, 1, 0, e);
+                   s->nr_cells, 1, threadpool_auto_chunk_size, e);
 
   if (e->verbose)
     message("Making hydro tasks took %.3f %s.",
@@ -3191,7 +3191,7 @@ void engine_maketasks(struct engine *e) {
   /* Add the self gravity tasks. */
   if (e->policy & engine_policy_self_gravity) {
     threadpool_map(&e->threadpool, engine_make_self_gravity_tasks_mapper, NULL,
-                   s->nr_cells, 1, 0, e);
+                   s->nr_cells, 1, threadpool_auto_chunk_size, e);
   }
 
   if (e->verbose)
@@ -3242,7 +3242,8 @@ void engine_maketasks(struct engine *e) {
      store the density tasks in each cell, and make each sort
      depend on the sorts of its progeny. */
   threadpool_map(&e->threadpool, engine_count_and_link_tasks_mapper,
-                 sched->tasks, sched->nr_tasks, sizeof(struct task), 0, e);
+                 sched->tasks, sched->nr_tasks, sizeof(struct task),
+                 threadpool_auto_chunk_size, e);
 
   if (e->verbose)
     message("Counting and linking tasks took %.3f %s.",
@@ -3259,7 +3260,7 @@ void engine_maketasks(struct engine *e) {
   /* Now that the self/pair tasks are at the right level, set the super
    * pointers. */
   threadpool_map(&e->threadpool, cell_set_super_mapper, cells, nr_cells,
-                 sizeof(struct cell), 0, e);
+                 sizeof(struct cell), threadpool_auto_chunk_size, e);
 
   if (e->verbose)
     message("Setting super-pointers took %.3f %s.",
@@ -3267,7 +3268,7 @@ void engine_maketasks(struct engine *e) {
 
   /* Append hierarchical tasks to each cell. */
   threadpool_map(&e->threadpool, engine_make_hierarchical_tasks_mapper, cells,
-                 nr_cells, sizeof(struct cell), 0, e);
+                 nr_cells, sizeof(struct cell), threadpool_auto_chunk_size, e);
 
   tic2 = getticks();
 
@@ -3276,7 +3277,8 @@ void engine_maketasks(struct engine *e) {
      of its super-cell. */
   if (e->policy & engine_policy_hydro)
     threadpool_map(&e->threadpool, engine_make_extra_hydroloop_tasks_mapper,
-                   sched->tasks, sched->nr_tasks, sizeof(struct task), 0, e);
+                   sched->tasks, sched->nr_tasks, sizeof(struct task),
+                   threadpool_auto_chunk_size, e);
 
   if (e->verbose)
     message("Making extra hydroloop tasks took %.3f %s.",
@@ -3325,8 +3327,8 @@ void engine_maketasks(struct engine *e) {
 
     threadpool_map(&e->threadpool, engine_addtasks_send_mapper,
                    send_cell_type_pairs, num_send_cells,
-                   sizeof(struct cell_type_pair),
-                   /*chunk=*/0, e);
+                   sizeof(struct cell_type_pair), threadpool_auto_chunk_size,
+                   e);
 
     free(send_cell_type_pairs);
 
@@ -3366,8 +3368,8 @@ void engine_maketasks(struct engine *e) {
     }
     threadpool_map(&e->threadpool, engine_addtasks_recv_mapper,
                    recv_cell_type_pairs, num_recv_cells,
-                   sizeof(struct cell_type_pair),
-                   /*chunk=*/0, e);
+                   sizeof(struct cell_type_pair), threadpool_auto_chunk_size,
+                   e);
     free(recv_cell_type_pairs);
 
     if (e->verbose)
diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c
index e7da143404849799911d7ae4f20fd8532d04b5b8..d3d17bcdd6dbf09fb6586cd584c7fdfd6ebe21fd 100644
--- a/src/engine_marktasks.c
+++ b/src/engine_marktasks.c
@@ -1004,7 +1004,7 @@ int engine_marktasks(struct engine *e) {
   /* Run through the tasks and mark as skip or not. */
   size_t extra_data[3] = {(size_t)e, (size_t)rebuild_space, (size_t)&e->sched};
   threadpool_map(&e->threadpool, engine_marktasks_mapper, s->tasks, s->nr_tasks,
-                 sizeof(struct task), 0, extra_data);
+                 sizeof(struct task), threadpool_auto_chunk_size, extra_data);
   rebuild_space = extra_data[1];
 
   if (e->verbose)
diff --git a/src/engine_redistribute.c b/src/engine_redistribute.c
index e2c3ea056e57e5097785a9014e4e0d2500d52a52..9e9ce4842a88a65f446f12a74de5039912247516 100644
--- a/src/engine_redistribute.c
+++ b/src/engine_redistribute.c
@@ -664,7 +664,8 @@ void engine_redistribute(struct engine *e) {
   redist_data.base = (void *)parts;
 
   threadpool_map(&e->threadpool, engine_redistribute_dest_mapper_part, parts,
-                 nr_parts, sizeof(struct part), 0, &redist_data);
+                 nr_parts, sizeof(struct part), threadpool_auto_chunk_size,
+                 &redist_data);
 
   /* Sort the particles according to their cell index. */
   if (nr_parts > 0)
@@ -711,7 +712,8 @@ void engine_redistribute(struct engine *e) {
     savelink_data.parts = (void *)parts;
     savelink_data.nodeID = nodeID;
     threadpool_map(&e->threadpool, engine_redistribute_savelink_mapper_part,
-                   nodes, nr_nodes, sizeof(int), 0, &savelink_data);
+                   nodes, nr_nodes, sizeof(int), threadpool_auto_chunk_size,
+                   &savelink_data);
   }
   swift_free("dest", dest);
 
@@ -729,7 +731,8 @@ void engine_redistribute(struct engine *e) {
   redist_data.base = (void *)sparts;
 
   threadpool_map(&e->threadpool, engine_redistribute_dest_mapper_spart, sparts,
-                 nr_sparts, sizeof(struct spart), 0, &redist_data);
+                 nr_sparts, sizeof(struct spart), threadpool_auto_chunk_size,
+                 &redist_data);
 
   /* Sort the particles according to their cell index. */
   if (nr_sparts > 0)
@@ -775,7 +778,8 @@ void engine_redistribute(struct engine *e) {
     savelink_data.parts = (void *)sparts;
     savelink_data.nodeID = nodeID;
     threadpool_map(&e->threadpool, engine_redistribute_savelink_mapper_spart,
-                   nodes, nr_nodes, sizeof(int), 0, &savelink_data);
+                   nodes, nr_nodes, sizeof(int), threadpool_auto_chunk_size,
+                   &savelink_data);
   }
   swift_free("s_dest", s_dest);
 
@@ -793,7 +797,8 @@ void engine_redistribute(struct engine *e) {
   redist_data.base = (void *)bparts;
 
   threadpool_map(&e->threadpool, engine_redistribute_dest_mapper_bpart, bparts,
-                 nr_bparts, sizeof(struct bpart), 0, &redist_data);
+                 nr_bparts, sizeof(struct bpart), threadpool_auto_chunk_size,
+                 &redist_data);
 
   /* Sort the particles according to their cell index. */
   if (nr_bparts > 0)
@@ -839,7 +844,8 @@ void engine_redistribute(struct engine *e) {
     savelink_data.parts = (void *)bparts;
     savelink_data.nodeID = nodeID;
     threadpool_map(&e->threadpool, engine_redistribute_savelink_mapper_bpart,
-                   nodes, nr_nodes, sizeof(int), 0, &savelink_data);
+                   nodes, nr_nodes, sizeof(int), threadpool_auto_chunk_size,
+                   &savelink_data);
   }
   swift_free("b_dest", b_dest);
 
@@ -857,7 +863,8 @@ void engine_redistribute(struct engine *e) {
   redist_data.base = (void *)gparts;
 
   threadpool_map(&e->threadpool, engine_redistribute_dest_mapper_gpart, gparts,
-                 nr_gparts, sizeof(struct gpart), 0, &redist_data);
+                 nr_gparts, sizeof(struct gpart), threadpool_auto_chunk_size,
+                 &redist_data);
 
   /* Sort the gparticles according to their cell index. */
   if (nr_gparts > 0)
diff --git a/src/engine_split_particles.c b/src/engine_split_particles.c
index 58d7f905588c79c6339d9c2e1ac264ec63095c3c..2a9fae7efb5d55dd0f3308af85829241c5031a7a 100644
--- a/src/engine_split_particles.c
+++ b/src/engine_split_particles.c
@@ -36,6 +36,201 @@
 const int particle_split_factor = 2;
 const double displacement_factor = 0.2;
 
+/**
+ * @brief Data structure used by the counter mapper function
+ */
+struct data_count {
+  const struct engine *const e;
+  const float mass_threshold;
+  size_t counter;
+};
+
+/**
+ * @brief Data structure used by the split mapper function
+ */
+struct data_split {
+  const struct engine *const e;
+  const float mass_threshold;
+  size_t *const k_parts;
+  size_t *const k_gparts;
+  swift_lock_type lock;
+};
+
+/**
+ * @brief Mapper function to count the number of #part above the mass threshold
+ * for splitting.
+ */
+void engine_split_gas_particle_count_mapper(void *restrict map_data, int count,
+                                            void *restrict extra_data) {
+
+  /* Unpack the data */
+  struct part *parts = (struct part *)map_data;
+  struct data_count *data = (struct data_count *)extra_data;
+  const struct engine *e = data->e;
+  const float mass_threshold = data->mass_threshold;
+
+  size_t counter = 0;
+
+  for (int i = 0; i < count; ++i) {
+
+    const struct part *p = &parts[i];
+
+    /* Ignore inhibited particles */
+    if (part_is_inhibited(p, e)) continue;
+
+    /* Is the mass of this particle larger than the threshold? */
+    const float gas_mass = hydro_get_mass(p);
+    if (gas_mass > mass_threshold) ++counter;
+  }
+
+  /* Increment the global counter */
+  atomic_add(&data->counter, counter);
+}
+
+/**
+ * @brief Mapper function to split the #part above the mass threshold.
+ */
+void engine_split_gas_particle_split_mapper(void *restrict map_data, int count,
+                                            void *restrict extra_data) {
+
+  /* Unpack the data */
+  struct part *parts = (struct part *)map_data;
+  struct data_split *data = (struct data_split *)extra_data;
+  const float mass_threshold = data->mass_threshold;
+  const struct engine *e = data->e;
+  const int with_gravity = (e->policy & engine_policy_self_gravity) ||
+                           (e->policy & engine_policy_external_gravity);
+
+  /* Additional thread-global particle arrays */
+  const struct space *s = e->s;
+  struct part *global_parts = s->parts;
+  struct xpart *global_xparts = s->xparts;
+  struct gpart *global_gparts = s->gparts;
+
+  /* xpart array corresponding to the thread-local particle array */
+  const ptrdiff_t offset = parts - global_parts;
+  struct xpart *xparts = global_xparts + offset;
+
+  /* Loop over the chunk of the part array assigned to this thread */
+  for (int i = 0; i < count; ++i) {
+
+    /* Is the mass of this particle larger than the threshold? */
+    struct part *p = &parts[i];
+
+    /* Ignore inhibited particles */
+    if (part_is_inhibited(p, e)) continue;
+
+    const float gas_mass = hydro_get_mass(p);
+    const float h = p->h;
+
+    /* Found a particle to split */
+    if (gas_mass > mass_threshold) {
+
+      /* Make sure only one thread is here */
+      lock_lock(&data->lock);
+
+      /* Where should we put the new particle in the array? */
+      const size_t k_parts = *data->k_parts;
+      const size_t k_gparts = *data->k_gparts;
+
+      /* Make the next slot available for the next particle */
+      (*data->k_parts)++;
+      if (with_gravity) (*data->k_gparts)++;
+
+      /* Release the lock and continue in parallel */
+      if (lock_unlock(&data->lock) != 0)
+        error("Impossible to unlock particle splitting");
+
+      /* We now know where to put the new particle we are going to create. */
+
+      /* Current other fields associated to this particle */
+      struct xpart *xp = &xparts[i];
+      struct gpart *gp = p->gpart;
+
+      /* Start by copying over the particles */
+      memcpy(&global_parts[k_parts], p, sizeof(struct part));
+      memcpy(&global_xparts[k_parts], xp, sizeof(struct xpart));
+
+      if (with_gravity) {
+        memcpy(&global_gparts[k_gparts], gp, sizeof(struct gpart));
+      }
+
+      /* Update the IDs */
+      global_parts[k_parts].id += 2;
+
+      /* Re-link everything */
+      if (with_gravity) {
+        global_parts[k_parts].gpart = &global_gparts[k_gparts];
+        global_gparts[k_gparts].id_or_neg_offset = -k_parts;
+      }
+
+      /* Displacement unit vector */
+      const double delta_x = random_unit_interval(p->id, e->ti_current,
+                                                  (enum random_number_type)0);
+      const double delta_y = random_unit_interval(p->id, e->ti_current,
+                                                  (enum random_number_type)1);
+      const double delta_z = random_unit_interval(p->id, e->ti_current,
+                                                  (enum random_number_type)2);
+
+      /* Displace the old particle */
+      p->x[0] += delta_x * displacement_factor * h;
+      p->x[1] += delta_y * displacement_factor * h;
+      p->x[2] += delta_z * displacement_factor * h;
+
+      if (with_gravity) {
+        gp->x[0] += delta_x * displacement_factor * h;
+        gp->x[1] += delta_y * displacement_factor * h;
+        gp->x[2] += delta_z * displacement_factor * h;
+      }
+
+      /* Displace the new particle */
+      global_parts[k_parts].x[0] -= delta_x * displacement_factor * h;
+      global_parts[k_parts].x[1] -= delta_y * displacement_factor * h;
+      global_parts[k_parts].x[2] -= delta_z * displacement_factor * h;
+
+      if (with_gravity) {
+        global_gparts[k_gparts].x[0] -= delta_x * displacement_factor * h;
+        global_gparts[k_gparts].x[1] -= delta_y * displacement_factor * h;
+        global_gparts[k_gparts].x[2] -= delta_z * displacement_factor * h;
+      }
+
+      /* Divide the mass */
+      const double new_mass = gas_mass * 0.5;
+      hydro_set_mass(p, new_mass);
+      hydro_set_mass(&global_parts[k_parts], new_mass);
+
+      if (with_gravity) {
+        gp->mass = new_mass;
+        global_gparts[k_gparts].mass = new_mass;
+      }
+
+      /* Split the chemistry fields */
+      chemistry_split_part(p, particle_split_factor);
+      chemistry_split_part(&global_parts[k_parts], particle_split_factor);
+
+      /* Split the cooling fields */
+      cooling_split_part(p, xp, particle_split_factor);
+      cooling_split_part(&global_parts[k_parts], &global_xparts[k_parts],
+                         particle_split_factor);
+
+      /* Split the star formation fields */
+      star_formation_split_part(p, xp, particle_split_factor);
+      star_formation_split_part(&global_parts[k_parts], &global_xparts[k_parts],
+                                particle_split_factor);
+
+      /* Split the tracer fields */
+      tracers_split_part(p, xp, particle_split_factor);
+      tracers_split_part(&global_parts[k_parts], &global_xparts[k_parts],
+                         particle_split_factor);
+
+      /* Mark the particles as not having been swallowed */
+      black_holes_mark_part_as_not_swallowed(&p->black_holes_data);
+      black_holes_mark_part_as_not_swallowed(
+          &global_parts[k_parts].black_holes_data);
+    }
+  }
+}
+
 /**
  * @brief Identify all the gas particles above a given mass threshold
  * and split them into 2.
@@ -45,12 +240,15 @@ const double displacement_factor = 0.2;
  * the local array of particles. In case of reallocations, it may
  * also have to loop over the gravity and other arrays.
  *
+ * This is done on a node-by-node basis. No MPI required here.
+ *
  * @param e The #engine.
  */
 void engine_split_gas_particles(struct engine *e) {
 
   /* Abort if we are not doing any splitting */
   if (!e->hydro_properties->particle_splitting) return;
+  if (e->s->nr_parts == 0) return;
 
   /* Time this */
   const ticks tic = getticks();
@@ -59,8 +257,9 @@ void engine_split_gas_particles(struct engine *e) {
   struct space *s = e->s;
   const int with_gravity = (e->policy & engine_policy_self_gravity) ||
                            (e->policy & engine_policy_external_gravity);
-  const double mass_threshold =
+  const float mass_threshold =
       e->hydro_properties->particle_splitting_mass_threshold;
+  const size_t nr_parts_old = s->nr_parts;
 
   /* Quick check to avoid problems */
   if (particle_split_factor != 2) {
@@ -68,23 +267,12 @@ void engine_split_gas_particles(struct engine *e) {
         "Invalid splitting factor. Can currently only split particles into 2!");
   }
 
-  /* Start by counting how many particles are above the threshold for splitting
-   */
-
-  size_t counter = 0;
-  const size_t nr_parts_old = s->nr_parts;
-  const struct part *parts_old = s->parts;
-  for (size_t i = 0; i < nr_parts_old; ++i) {
-
-    const struct part *p = &parts_old[i];
-
-    /* Ignore inhibited particles */
-    if (part_is_inhibited(p, e)) continue;
-
-    /* Is the mass of this particle larger than the threshold? */
-    const float gas_mass = hydro_get_mass(p);
-    if (gas_mass > mass_threshold) ++counter;
-  }
+  /* Start by counting how many particles are above the threshold
+   * for splitting (this is done in parallel over the threads) */
+  struct data_count data_count = {e, mass_threshold, 0};
+  threadpool_map(&e->threadpool, engine_split_gas_particle_count_mapper,
+                 s->parts, nr_parts_old, sizeof(struct part), 0, &data_count);
+  const size_t counter = data_count.counter;
 
   /* Early abort? */
   if (counter == 0) {
@@ -109,17 +297,29 @@ void engine_split_gas_particles(struct engine *e) {
 
     if (e->verbose) message("Reallocating the part array!");
 
+    /* Allocate a larger array and copy over */
     struct part *parts_new = NULL;
     if (swift_memalign("parts", (void **)&parts_new, part_align,
                        sizeof(struct part) * s->size_parts) != 0)
       error("Failed to allocate new part data.");
+
+    /* Tell the compiler that the arrays are aligned */
+    swift_align_information(struct part, s->parts, part_align);
+    swift_align_information(struct part, parts_new, part_align);
+
     memcpy(parts_new, s->parts, sizeof(struct part) * s->nr_parts);
     swift_free("parts", s->parts);
 
+    /* Allocate a larger array and copy over */
     struct xpart *xparts_new = NULL;
     if (swift_memalign("xparts", (void **)&xparts_new, xpart_align,
                        sizeof(struct xpart) * s->size_parts) != 0)
       error("Failed to allocate new part data.");
+
+    /* Tell the compiler that the arrays are aligned */
+    swift_align_information(struct xpart, s->xparts, xpart_align);
+    swift_align_information(struct xpart, xparts_new, xpart_align);
+
     memcpy(xparts_new, s->xparts, sizeof(struct xpart) * s->nr_parts);
     swift_free("xparts", s->xparts);
 
@@ -135,18 +335,23 @@ void engine_split_gas_particles(struct engine *e) {
 
     if (e->verbose) message("Reallocating the gpart array!");
 
+    /* Allocate a larger array and copy over */
     struct gpart *gparts_new = NULL;
     if (swift_memalign("gparts", (void **)&gparts_new, gpart_align,
                        sizeof(struct gpart) * s->size_gparts) != 0)
       error("Failed to allocate new gpart data.");
 
+    /* Tell the compiler that the arrays are aligned */
+    swift_align_information(struct gpart, s->gparts, gpart_align);
+    swift_align_information(struct gpart, gparts_new, gpart_align);
+
     /* Copy the particles */
     memcpy(gparts_new, s->gparts, sizeof(struct gpart) * s->nr_gparts);
     swift_free("gparts", s->gparts);
 
     /* We now need to correct all the pointers of the other particle arrays */
     part_relink_all_parts_to_gparts(gparts_new, s->nr_gparts, s->parts,
-                                    s->sparts, s->bparts);
+                                    s->sparts, s->bparts, &e->threadpool);
     s->gparts = gparts_new;
   }
 
@@ -157,115 +362,18 @@ void engine_split_gas_particles(struct engine *e) {
                     s->nr_gparts, s->nr_sparts, s->nr_bparts, e->verbose);
 #endif
 
+  /* We now have enough memory in the part array to accomodate the new
+   * particles. We can start the splitting procedure */
+
   size_t k_parts = s->nr_parts;
   size_t k_gparts = s->nr_gparts;
 
   /* Loop over the particles again to split them */
-  struct part *parts = s->parts;
-  struct xpart *xparts = s->xparts;
-  struct gpart *gparts = s->gparts;
-  for (size_t i = 0; i < nr_parts_old; ++i) {
-
-    /* Is the mass of this particle larger than the threshold? */
-    struct part *p = &parts[i];
-
-    /* Ignore inhibited particles */
-    if (part_is_inhibited(p, e)) continue;
-
-    const float gas_mass = hydro_get_mass(p);
-    const float h = p->h;
-
-    /* Found a particle to split */
-    if (gas_mass > mass_threshold) {
-
-      struct xpart *xp = &xparts[i];
-      struct gpart *gp = p->gpart;
-
-      /* Start by copying over the particles */
-      memcpy(&parts[k_parts], p, sizeof(struct part));
-      memcpy(&xparts[k_parts], xp, sizeof(struct xpart));
-
-      if (with_gravity) {
-        memcpy(&gparts[k_gparts], gp, sizeof(struct gpart));
-      }
-
-      /* Update the IDs */
-      parts[k_parts].id += 2;
-
-      /* Re-link everything */
-      if (with_gravity) {
-        parts[k_parts].gpart = &gparts[k_gparts];
-        gparts[k_gparts].id_or_neg_offset = -k_parts;
-      }
-
-      /* Displacement unit vector */
-      const double delta_x = random_unit_interval(p->id, e->ti_current,
-                                                  (enum random_number_type)0);
-      const double delta_y = random_unit_interval(p->id, e->ti_current,
-                                                  (enum random_number_type)1);
-      const double delta_z = random_unit_interval(p->id, e->ti_current,
-                                                  (enum random_number_type)2);
-
-      /* Displace the old particle */
-      p->x[0] += delta_x * displacement_factor * h;
-      p->x[1] += delta_y * displacement_factor * h;
-      p->x[2] += delta_z * displacement_factor * h;
-
-      if (with_gravity) {
-        gp->x[0] += delta_x * displacement_factor * h;
-        gp->x[1] += delta_y * displacement_factor * h;
-        gp->x[2] += delta_z * displacement_factor * h;
-      }
-
-      /* Displace the new particle */
-      parts[k_parts].x[0] -= delta_x * displacement_factor * h;
-      parts[k_parts].x[1] -= delta_y * displacement_factor * h;
-      parts[k_parts].x[2] -= delta_z * displacement_factor * h;
-
-      if (with_gravity) {
-        gparts[k_gparts].x[0] -= delta_x * displacement_factor * h;
-        gparts[k_gparts].x[1] -= delta_y * displacement_factor * h;
-        gparts[k_gparts].x[2] -= delta_z * displacement_factor * h;
-      }
-
-      /* Divide the mass */
-      const double new_mass = gas_mass * 0.5;
-      hydro_set_mass(p, new_mass);
-      hydro_set_mass(&parts[k_parts], new_mass);
-
-      if (with_gravity) {
-        gp->mass = new_mass;
-        gparts[k_gparts].mass = new_mass;
-      }
-
-      /* Split the chemistry fields */
-      chemistry_split_part(p, particle_split_factor);
-      chemistry_split_part(&parts[k_parts], particle_split_factor);
-
-      /* Split the cooling fields */
-      cooling_split_part(p, xp, particle_split_factor);
-      cooling_split_part(&parts[k_parts], &xparts[k_parts],
-                         particle_split_factor);
-
-      /* Split the star formation fields */
-      star_formation_split_part(p, xp, particle_split_factor);
-      star_formation_split_part(&parts[k_parts], &xparts[k_parts],
-                                particle_split_factor);
-
-      /* Split the tracer fields */
-      tracers_split_part(p, xp, particle_split_factor);
-      tracers_split_part(&parts[k_parts], &xparts[k_parts],
-                         particle_split_factor);
-
-      /* Mark the particles as not having been swallowed */
-      black_holes_mark_part_as_not_swallowed(&p->black_holes_data);
-      black_holes_mark_part_as_not_swallowed(&parts[k_parts].black_holes_data);
-
-      /* Move to the next free slot */
-      k_parts++;
-      if (with_gravity) k_gparts++;
-    }
-  }
+  struct data_split data_split = {e, mass_threshold, &k_parts, &k_gparts, 0};
+  lock_init(&data_split.lock);
+  threadpool_map(&e->threadpool, engine_split_gas_particle_split_mapper,
+                 s->parts, nr_parts_old, sizeof(struct part), 0, &data_split);
+  if (lock_destroy(&data_split.lock) != 0) error("Error destroying lock");
 
   /* Update the local counters */
   s->nr_parts = k_parts;
diff --git a/src/engine_unskip.c b/src/engine_unskip.c
index 22bc55f34e0bc51711f770d4d7a0d2b2a629f778..9142ff447ac023d0428d466029110233cef37149 100644
--- a/src/engine_unskip.c
+++ b/src/engine_unskip.c
@@ -383,7 +383,8 @@ void engine_unskip(struct engine *e) {
 
   /* Activate all the regular tasks */
   threadpool_map(&e->threadpool, engine_do_unskip_mapper, local_active_cells,
-                 num_active_cells * multiplier, sizeof(int), 1, &data);
+                 num_active_cells * multiplier, sizeof(int), /*chunk=*/1,
+                 &data);
 
 #ifdef WITH_PROFILER
   ProfilerStop();
@@ -444,7 +445,8 @@ void engine_unskip_timestep_communications(struct engine *e) {
 
   /* Activate all the part and gpart ti_end tasks */
   threadpool_map(&e->threadpool, engine_unskip_timestep_communications_mapper,
-                 tasks, nr_tasks, sizeof(struct task), 0, s);
+                 tasks, nr_tasks, sizeof(struct task),
+                 threadpool_auto_chunk_size, s);
 
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
diff --git a/src/fof.c b/src/fof.c
index 51f35227d4441971e25f46dbc6e57ccfaddb43b7..ff15262bab336f336af47d3cc2e3710b1bee355a 100644
--- a/src/fof.c
+++ b/src/fof.c
@@ -324,8 +324,8 @@ void fof_allocate(const struct space *s, const long long total_nr_DM_particles,
 
   /* Set initial group index */
   threadpool_map(&s->e->threadpool, fof_set_initial_group_index_mapper,
-                 props->group_index, s->nr_gparts, sizeof(size_t), 0,
-                 props->group_index);
+                 props->group_index, s->nr_gparts, sizeof(size_t),
+                 threadpool_auto_chunk_size, props->group_index);
 
   if (verbose)
     message("Setting initial group index took: %.3f %s.",
@@ -335,7 +335,8 @@ void fof_allocate(const struct space *s, const long long total_nr_DM_particles,
 
   /* Set initial group sizes */
   threadpool_map(&s->e->threadpool, fof_set_initial_group_size_mapper,
-                 props->group_size, s->nr_gparts, sizeof(size_t), 0, NULL);
+                 props->group_size, s->nr_gparts, sizeof(size_t),
+                 threadpool_auto_chunk_size, NULL);
 
   if (verbose)
     message("Setting initial group sizes took: %.3f %s.",
@@ -1759,7 +1760,8 @@ void fof_calc_group_mass(struct fof_props *props, const struct space *s,
 
   /* Increment the group mass for groups above min_group_size. */
   threadpool_map(&s->e->threadpool, fof_calc_group_mass_mapper, gparts,
-                 nr_gparts, sizeof(struct gpart), 0, (struct space *)s);
+                 nr_gparts, sizeof(struct gpart), threadpool_auto_chunk_size,
+                 (struct space *)s);
 
   /* Loop over particles and find the densest particle in each group. */
   /* JSW TODO: Parallelise with threadpool*/
@@ -2278,7 +2280,8 @@ void fof_search_foreign_cells(struct fof_props *props, const struct space *s) {
   data.nr_gparts = nr_gparts;
   data.space_gparts = s->gparts;
   threadpool_map(&e->threadpool, fof_set_outgoing_root_mapper, local_cells,
-                 num_cells_out, sizeof(struct cell **), 0, &data);
+                 num_cells_out, sizeof(struct cell **),
+                 threadpool_auto_chunk_size, &data);
 
   if (verbose)
     message(
@@ -2636,7 +2639,8 @@ void fof_search_tree(struct fof_props *props,
   ticks tic_calc_group_size = getticks();
 
   threadpool_map(&s->e->threadpool, fof_calc_group_size_mapper, gparts,
-                 nr_gparts, sizeof(struct gpart), 0, s);
+                 nr_gparts, sizeof(struct gpart), threadpool_auto_chunk_size,
+                 s);
   if (verbose)
     message("FOF calc group size took (FOF SCALING): %.3f %s.",
             clocks_from_ticks(getticks() - tic_calc_group_size),
@@ -2778,7 +2782,7 @@ void fof_search_tree(struct fof_props *props,
 
   /* Set default group ID for all particles */
   threadpool_map(&s->e->threadpool, fof_set_initial_group_id_mapper, s->gparts,
-                 s->nr_gparts, sizeof(struct gpart), 0,
+                 s->nr_gparts, sizeof(struct gpart), threadpool_auto_chunk_size,
                  (void *)&group_id_default);
 
   if (verbose)
diff --git a/src/gravity.c b/src/gravity.c
index 53ab6b816f964e4e2b071df1e3192d972c5567de..da68796e68f748cfdcdf37688911aa03ca11f003 100644
--- a/src/gravity.c
+++ b/src/gravity.c
@@ -36,6 +36,7 @@
 /* Local headers. */
 #include "active.h"
 #include "error.h"
+#include "threadpool.h"
 #include "version.h"
 
 struct exact_force_data {
@@ -615,7 +616,8 @@ void gravity_exact_force_compute(struct space *s, const struct engine *e) {
   data.const_G = e->physical_constants->const_newton_G;
 
   threadpool_map(&s->e->threadpool, gravity_exact_force_compute_mapper,
-                 s->gparts, s->nr_gparts, sizeof(struct gpart), 0, &data);
+                 s->gparts, s->nr_gparts, sizeof(struct gpart),
+                 threadpool_auto_chunk_size, &data);
 
   message("Computed exact gravity for %d gparts (took %.3f %s). ",
           data.counter_global, clocks_from_ticks(getticks() - tic),
diff --git a/src/logger_io.c b/src/logger_io.c
index e227db3b169767ad4a88514d136a9593b476b64d..8921304c062b75b67344d06ef3c4203a931eba13 100644
--- a/src/logger_io.c
+++ b/src/logger_io.c
@@ -54,6 +54,7 @@
 #include "serial_io.h"
 #include "single_io.h"
 #include "stars_io.h"
+#include "threadpool.h"
 #include "tracers_io.h"
 #include "units.h"
 #include "version.h"
@@ -128,7 +129,7 @@ void writeIndexArray(const struct engine* e, FILE* f, struct io_props* props,
 
   /* Copy the whole thing into a buffer */
   threadpool_map((struct threadpool*)&e->threadpool, logger_io_copy_mapper,
-                 temp, N, typeSize, 0, props);
+                 temp, N, typeSize, threadpool_auto_chunk_size, props);
 
   /* Write data to file */
   fwrite(temp, typeSize, num_elements, f);
diff --git a/src/memuse.c b/src/memuse.c
index 00a2a5f879a994d96e0747eba55c56a3161a6b86..c3eefbf045b84918ace42d917e8a9bccb3a0f02a 100644
--- a/src/memuse.c
+++ b/src/memuse.c
@@ -34,6 +34,11 @@
 #include <sys/types.h>
 #include <unistd.h>
 
+#ifndef SWIFT_MEMUSE_STATM
+#include <sys/resource.h>
+#include <sys/time.h>
+#endif
+
 /* Local defines. */
 #include "memuse.h"
 
@@ -408,7 +413,8 @@ void memuse_log_dump_error(int rank) {
 
 /**
  * @brief parse the process /proc/self/statm file to get the process
- *        memory use (in KB). Top field in ().
+ *        memory use (in KB). Top field in (). If this file is not
+ *        available only the resident field will be returned.
  *
  * @param size     total virtual memory (VIRT/VmSize)
  * @param resident resident non-swapped memory (RES/VmRSS)
@@ -423,6 +429,8 @@ void memuse_log_dump_error(int rank) {
 void memuse_use(long *size, long *resident, long *shared, long *text,
                 long *data, long *library, long *dirty) {
 
+#ifdef SWIFT_MEMUSE_STATM
+
   /* Open the file. */
   FILE *file = fopen("/proc/self/statm", "r");
   if (file != NULL) {
@@ -432,29 +440,38 @@ void memuse_use(long *size, long *resident, long *shared, long *text,
     if (nscan == 7) {
       /* Convert pages into bytes. Usually 4096, but could be 512 on some
        * systems so take care in conversion to KB. */
-      long sz = sysconf(_SC_PAGESIZE);
-      *size *= sz;
-      *resident *= sz;
-      *shared *= sz;
-      *text *= sz;
-      *library *= sz;
-      *data *= sz;
-      *dirty *= sz;
-
-      *size /= 1024;
-      *resident /= 1024;
-      *shared /= 1024;
-      *text /= 1024;
-      *library /= 1024;
-      *data /= 1024;
-      *dirty /= 1024;
+      uint64_t sz = sysconf(_SC_PAGESIZE);
+      *size = (*size) * sz / 1024;
+      *resident = (*resident) * sz / 1024;
+      *shared = (*shared) * sz / 1024;
+      *text = (*text) * sz / 1024;
+      *library = (*library) * sz / 1024;
+      *data = (*data) * sz / 1024;
+      *dirty = (*dirty) * sz / 1024;
+
     } else {
       error("Failed to read sufficient fields from /proc/self/statm");
     }
     fclose(file);
+
   } else {
     error("Failed to open /proc/self/statm");
   }
+#else
+
+  /* Not a Linux compatible OS, try to use very limited POSIX call instead.
+   * Linux only claims to support ru_maxrss, and POSIX only ru_utime and
+   * ru_stime, so this may still fail. */
+  struct rusage usage;
+  getrusage(RUSAGE_SELF, &usage);
+  *size = 0;
+  *resident = usage.ru_maxrss;
+  *shared = usage.ru_ixrss;
+  *text = 0;
+  *library = 0;
+  *data = usage.ru_isrss;
+  *dirty = 0;
+#endif
 }
 
 /**
@@ -469,13 +486,8 @@ void memuse_use(long *size, long *resident, long *shared, long *text,
  */
 const char *memuse_process(int inmb) {
   static char buffer[256];
-  long size;
-  long resident;
-  long shared;
-  long text;
-  long library;
-  long data;
-  long dirty;
+
+  long size, resident, shared, text, library, data, dirty;
   memuse_use(&size, &resident, &shared, &text, &data, &library, &dirty);
 
   if (inmb) {
diff --git a/src/mesh_gravity.c b/src/mesh_gravity.c
index 9efd8c03966cdfb230f48bedaeb2e4cfab3c1368..0efec0bc80fc381d9bd6ff733cc546e1885c1d9e 100644
--- a/src/mesh_gravity.c
+++ b/src/mesh_gravity.c
@@ -37,6 +37,7 @@
 #include "part.h"
 #include "runner.h"
 #include "space.h"
+#include "threadpool.h"
 
 #ifdef HAVE_FFTW
 
@@ -51,10 +52,9 @@
  * @param k Index along z.
  * @param N Size of the array along one axis.
  */
-__attribute__((always_inline)) INLINE static int row_major_id_periodic(int i,
-                                                                       int j,
-                                                                       int k,
-                                                                       int N) {
+__attribute__((always_inline, const)) INLINE static int row_major_id_periodic(
+    const int i, const int j, const int k, const int N) {
+
   return (((i + N) % N) * N * N + ((j + N) % N) * N + ((k + N) % N));
 }
 
@@ -72,9 +72,10 @@ __attribute__((always_inline)) INLINE static int row_major_id_periodic(int i,
  * @param dy Second CIC coefficient along y
  * @param dz Second CIC coefficient along z
  */
-__attribute__((always_inline)) INLINE static double CIC_get(
-    double mesh[6][6][6], int i, int j, int k, double tx, double ty, double tz,
-    double dx, double dy, double dz) {
+__attribute__((always_inline, const)) INLINE static double CIC_get(
+    double mesh[6][6][6], const int i, const int j, const int k,
+    const double tx, const double ty, const double tz, const double dx,
+    const double dy, const double dz) {
 
   double temp;
   temp = mesh[i + 0][j + 0][k + 0] * tx * ty * tz;
@@ -106,8 +107,9 @@ __attribute__((always_inline)) INLINE static double CIC_get(
  * @param value The value to interpolate.
  */
 __attribute__((always_inline)) INLINE static void CIC_set(
-    double* mesh, int N, int i, int j, int k, double tx, double ty, double tz,
-    double dx, double dy, double dz, double value) {
+    double* mesh, const int N, const int i, const int j, const int k,
+    const double tx, const double ty, const double tz, const double dx,
+    const double dy, const double dz, const double value) {
 
   /* Classic CIC interpolation */
   atomic_add_d(&mesh[row_major_id_periodic(i + 0, j + 0, k + 0, N)],
@@ -137,8 +139,9 @@ __attribute__((always_inline)) INLINE static void CIC_set(
  * @param fac The width of a mesh cell.
  * @param dim The dimensions of the simulation box.
  */
-INLINE static void gpart_to_mesh_CIC(const struct gpart* gp, double* rho, int N,
-                                     double fac, const double dim[3]) {
+INLINE static void gpart_to_mesh_CIC(const struct gpart* gp, double* rho,
+                                     const int N, const double fac,
+                                     const double dim[3]) {
 
   /* Box wrap the multipole's position */
   const double pos_x = box_wrap(gp->x[0], 0., dim[0]);
@@ -183,8 +186,9 @@ INLINE static void gpart_to_mesh_CIC(const struct gpart* gp, double* rho, int N,
  * @param fac The width of a mesh cell.
  * @param dim The dimensions of the simulation box.
  */
-void cell_gpart_to_mesh_CIC(const struct cell* c, double* rho, int N,
-                            double fac, const double dim[3]) {
+void cell_gpart_to_mesh_CIC(const struct cell* c, double* rho, const int N,
+                            const double fac, const double dim[3]) {
+
   const int gcount = c->grav.count;
   const struct gpart* gparts = c->grav.parts;
 
@@ -253,8 +257,8 @@ void cell_gpart_to_mesh_CIC_mapper(void* map_data, int num, void* extra) {
  * @param fac width of a mesh cell.
  * @param dim The dimensions of the simulation box.
  */
-void mesh_to_gparts_CIC(struct gpart* gp, const double* pot, int N, double fac,
-                        const double dim[3]) {
+void mesh_to_gparts_CIC(struct gpart* gp, const double* pot, const int N,
+                        const double fac, const double dim[3]) {
 
   /* Box wrap the gpart's position */
   const double pos_x = box_wrap(gp->x[0], 0., dim[0]);
@@ -342,6 +346,139 @@ void mesh_to_gparts_CIC(struct gpart* gp, const double* pot, int N, double fac,
 #endif
 }
 
+/**
+ * @brief Shared information about the Green function to be used by all the
+ * threads in the pool.
+ */
+struct Green_function_data {
+
+  int N;
+  fftw_complex* frho;
+  double green_fac;
+  double a_smooth2;
+  double k_fac;
+};
+
+/**
+ * @brief Mapper function for the application of the Green function.
+ *
+ * @param map_data The array of the density field Fourier transform.
+ * @param num The number of elements to iterate on (along the x-axis).
+ * @param extra The properties of the Green function.
+ */
+void mesh_apply_Green_function_mapper(void* map_data, const int num,
+                                      void* extra) {
+
+  struct Green_function_data* data = (struct Green_function_data*)extra;
+
+  /* Unpack the array */
+  fftw_complex* const frho = data->frho;
+  const int N = data->N;
+  const int N_half = N / 2;
+
+  /* Unpack the Green function properties */
+  const double green_fac = data->green_fac;
+  const double a_smooth2 = data->a_smooth2;
+  const double k_fac = data->k_fac;
+
+  /* Range handled by this call */
+  const int i_start = (fftw_complex*)map_data - frho;
+  const int i_end = i_start + num;
+
+  /* Loop over the x range corresponding to this thread */
+  for (int i = i_start; i < i_end; ++i) {
+
+    /* kx component of vector in Fourier space and 1/sinc(kx) */
+    const int kx = (i > N_half ? i - N : i);
+    const double kx_d = (double)kx;
+    const double fx = k_fac * kx_d;
+    const double sinc_kx_inv = (kx != 0) ? fx / sin(fx) : 1.;
+
+    for (int j = 0; j < N; ++j) {
+
+      /* ky component of vector in Fourier space and 1/sinc(ky) */
+      const int ky = (j > N_half ? j - N : j);
+      const double ky_d = (double)ky;
+      const double fy = k_fac * ky_d;
+      const double sinc_ky_inv = (ky != 0) ? fy / sin(fy) : 1.;
+
+      for (int k = 0; k < N_half + 1; ++k) {
+
+        /* kz component of vector in Fourier space and 1/sinc(kz) */
+        const int kz = (k > N_half ? k - N : k);
+        const double kz_d = (double)kz;
+        const double fz = k_fac * kz_d;
+        const double sinc_kz_inv = (kz != 0) ? fz / (sin(fz) + FLT_MIN) : 1.;
+
+        /* Norm of vector in Fourier space */
+        const double k2 = (kx_d * kx_d + ky_d * ky_d + kz_d * kz_d);
+
+        /* Avoid FPEs... */
+        if (k2 == 0.) continue;
+
+        /* Green function */
+        double W = 1.;
+        fourier_kernel_long_grav_eval(k2 * a_smooth2, &W);
+        const double green_cor = green_fac * W / (k2 + FLT_MIN);
+
+        /* Deconvolution of CIC */
+        const double CIC_cor = sinc_kx_inv * sinc_ky_inv * sinc_kz_inv;
+        const double CIC_cor2 = CIC_cor * CIC_cor;
+        const double CIC_cor4 = CIC_cor2 * CIC_cor2;
+
+        /* Combined correction */
+        const double total_cor = green_cor * CIC_cor4;
+
+        /* Apply to the mesh */
+        const int index = N * (N_half + 1) * i + (N_half + 1) * j + k;
+        frho[index][0] *= total_cor;
+        frho[index][1] *= total_cor;
+      }
+    }
+  }
+}
+
+/**
+ * @brief Apply the Green function in Fourier space to the density
+ * array to get the potential.
+ *
+ * Also deconvolves the CIC kernel.
+ *
+ * @param tp The threadpool.
+ * @param frho The NxNx(N/2) complex array of the Fourier transform of the
+ * density field.
+ * @param N The dimension of the array.
+ * @param r_s The Green function smoothing scale.
+ * @param box_size The physical size of the simulation box.
+ */
+void mesh_apply_Green_function(struct threadpool* tp, fftw_complex* frho,
+                               const int N, const double r_s,
+                               const double box_size) {
+
+  /* Some common factors */
+  struct Green_function_data data;
+  data.frho = frho;
+  data.N = N;
+  data.green_fac = -1. / (M_PI * box_size);
+  data.a_smooth2 = 4. * M_PI * M_PI * r_s * r_s / (box_size * box_size);
+  data.k_fac = M_PI / (double)N;
+
+  /* Parallelize the Green function application using the threadpool
+     to split the x-axis loop over the threads.
+     The array is N x N x (N/2). We use the thread to each deal with
+     a range [i_min, i_max[ x N x (N/2) */
+  if (N < 32) {
+    mesh_apply_Green_function_mapper(frho, N, &data);
+  } else {
+    threadpool_map(tp, mesh_apply_Green_function_mapper, frho, N,
+                   sizeof(fftw_complex), threadpool_auto_chunk_size, &data);
+  }
+
+  /* Correct singularity at (0,0,0) */
+  frho[0][0] = 0.;
+  frho[0][1] = 0.;
+}
+
 #endif
 
 /**
@@ -359,7 +496,7 @@ void mesh_to_gparts_CIC(struct gpart* gp, const double* pot, int N, double fac,
  * @param verbose Are we talkative?
  */
 void pm_mesh_compute_potential(struct pm_mesh* mesh, const struct space* s,
-                               struct threadpool* tp, int verbose) {
+                               struct threadpool* tp, const int verbose) {
 
 #ifdef HAVE_FFTW
 
@@ -416,7 +553,8 @@ void pm_mesh_compute_potential(struct pm_mesh* mesh, const struct space* s,
   /* Do a parallel CIC mesh assignment of the gparts but only using
      the local top-level cells */
   threadpool_map(tp, cell_gpart_to_mesh_CIC_mapper, (void*)local_cells,
-                 nr_local_cells, sizeof(int), 0, (void*)&data);
+                 nr_local_cells, sizeof(int), threadpool_auto_chunk_size,
+                 (void*)&data);
 
   if (verbose)
     message("Gpart assignment took %.3f %s.",
@@ -432,7 +570,7 @@ void pm_mesh_compute_potential(struct pm_mesh* mesh, const struct space* s,
                 MPI_COMM_WORLD);
 
   if (verbose)
-    message("Mesh comunication took %.3f %s.",
+    message("Mesh communication took %.3f %s.",
             clocks_from_ticks(getticks() - tic), clocks_getunit());
 #endif
 
@@ -453,66 +591,8 @@ void pm_mesh_compute_potential(struct pm_mesh* mesh, const struct space* s,
 
   tic = getticks();
 
-  /* Some common factors */
-  const double green_fac = -1. / (M_PI * box_size);
-  const double a_smooth2 = 4. * M_PI * M_PI * r_s * r_s / (box_size * box_size);
-  const double k_fac = M_PI / (double)N;
-
   /* Now de-convolve the CIC kernel and apply the Green function */
-  for (int i = 0; i < N; ++i) {
-
-    /* kx component of vector in Fourier space and 1/sinc(kx) */
-    const int kx = (i > N_half ? i - N : i);
-    const double kx_d = (double)kx;
-    const double fx = k_fac * kx_d;
-    const double sinc_kx_inv = (kx != 0) ? fx / sin(fx) : 1.;
-
-    for (int j = 0; j < N; ++j) {
-
-      /* ky component of vector in Fourier space and 1/sinc(ky) */
-      const int ky = (j > N_half ? j - N : j);
-      const double ky_d = (double)ky;
-      const double fy = k_fac * ky_d;
-      const double sinc_ky_inv = (ky != 0) ? fy / sin(fy) : 1.;
-
-      for (int k = 0; k < N_half + 1; ++k) {
-
-        /* kz component of vector in Fourier space and 1/sinc(kz) */
-        const int kz = (k > N_half ? k - N : k);
-        const double kz_d = (double)kz;
-        const double fz = k_fac * kz_d;
-        const double sinc_kz_inv = (kz != 0) ? fz / (sin(fz) + FLT_MIN) : 1.;
-
-        /* Norm of vector in Fourier space */
-        const double k2 = (kx_d * kx_d + ky_d * ky_d + kz_d * kz_d);
-
-        /* Avoid FPEs... */
-        if (k2 == 0.) continue;
-
-        /* Green function */
-        double W = 1.;
-        fourier_kernel_long_grav_eval(k2 * a_smooth2, &W);
-        const double green_cor = green_fac * W / (k2 + FLT_MIN);
-
-        /* Deconvolution of CIC */
-        const double CIC_cor = sinc_kx_inv * sinc_ky_inv * sinc_kz_inv;
-        const double CIC_cor2 = CIC_cor * CIC_cor;
-        const double CIC_cor4 = CIC_cor2 * CIC_cor2;
-
-        /* Combined correction */
-        const double total_cor = green_cor * CIC_cor4;
-
-        /* Apply to the mesh */
-        const int index = N * (N_half + 1) * i + (N_half + 1) * j + k;
-        frho[index][0] *= total_cor;
-        frho[index][1] *= total_cor;
-      }
-    }
-  }
-
-  /* Correct singularity at (0,0,0) */
-  frho[0][0] = 0.;
-  frho[0][1] = 0.;
+  mesh_apply_Green_function(tp, frho, N, r_s, box_size);
 
   if (verbose)
     message("Applying Green function took %.3f %s.",
@@ -534,7 +614,7 @@ void pm_mesh_compute_potential(struct pm_mesh* mesh, const struct space* s,
   mesh->potential = rho;
 
   /* message("\n\n\n POTENTIAL"); */
-  /* print_array(potential, N); */
+  /* print_array(mesh->potential, N); */
 
   /* Clean-up the mess */
   fftw_destroy_plan(forward_plan);
diff --git a/src/mpiuse.c b/src/mpiuse.c
index f32d9a069e680e0cffc110e288b816ca6475b276..0688affa1b17b0298b5cfe5dafe8ce28f5e16032 100644
--- a/src/mpiuse.c
+++ b/src/mpiuse.c
@@ -351,4 +351,16 @@ void mpiuse_log_dump(const char *filename, ticks stepticks) {
   //        clocks_getunit());
 }
 
+/**
+ * @brief dump the log for using the given rank to generate a standard
+ *        name for the output. Used when exiting in error.
+ *
+ * @param rank the rank exiting in error.
+ */
+void mpiuse_log_dump_error(int rank) {
+  char filename[60];
+  sprintf(filename, "mpiuse-error-report-rank%d.txt", rank);
+  mpiuse_log_dump(filename, clocks_start_ticks);
+}
+
 #endif /* defined(SWIFT_MPIUSE_REPORTS) && defined(WITH_MPI) */
diff --git a/src/mpiuse.h b/src/mpiuse.h
index 20c6d05e1cc2d6b3995fbfc1dc2f69809ed093c9..812af6a78ea98d138f45638e442f100fe23d8596 100644
--- a/src/mpiuse.h
+++ b/src/mpiuse.h
@@ -33,6 +33,7 @@
 void mpiuse_log_dump(const char *filename, ticks stepticks);
 void mpiuse_log_allocation(int type, int subtype, void *ptr, int activation,
                            size_t size, int otherrank, int tag);
+void mpiuse_log_dump_error(int rank);
 #else
 
 /* No-op when not reporting. */
diff --git a/src/part.c b/src/part.c
index cb73aba1afa958e7a5e65919af819687ba9df863..f172708c4b43ccfb3647f743811b3afeb6149be2 100644
--- a/src/part.c
+++ b/src/part.c
@@ -32,6 +32,7 @@
 #include "error.h"
 #include "hydro.h"
 #include "part.h"
+#include "threadpool.h"
 
 /**
  * @brief Re-link the #gpart%s associated with the list of #part%s.
@@ -40,8 +41,8 @@
  * @param N The number of particles to re-link;
  * @param offset The offset of #part%s relative to the global parts list.
  */
-void part_relink_gparts_to_parts(struct part *parts, size_t N,
-                                 ptrdiff_t offset) {
+void part_relink_gparts_to_parts(struct part *parts, const size_t N,
+                                 const ptrdiff_t offset) {
   for (size_t k = 0; k < N; k++) {
     if (parts[k].gpart) {
       parts[k].gpart->id_or_neg_offset = -(k + offset);
@@ -56,8 +57,8 @@ void part_relink_gparts_to_parts(struct part *parts, size_t N,
  * @param N The number of s-particles to re-link;
  * @param offset The offset of #spart%s relative to the global sparts list.
  */
-void part_relink_gparts_to_sparts(struct spart *sparts, size_t N,
-                                  ptrdiff_t offset) {
+void part_relink_gparts_to_sparts(struct spart *sparts, const size_t N,
+                                  const ptrdiff_t offset) {
   for (size_t k = 0; k < N; k++) {
     if (sparts[k].gpart) {
       sparts[k].gpart->id_or_neg_offset = -(k + offset);
@@ -72,8 +73,8 @@ void part_relink_gparts_to_sparts(struct spart *sparts, size_t N,
  * @param N The number of s-particles to re-link;
  * @param offset The offset of #bpart%s relative to the global bparts list.
  */
-void part_relink_gparts_to_bparts(struct bpart *bparts, size_t N,
-                                  ptrdiff_t offset) {
+void part_relink_gparts_to_bparts(struct bpart *bparts, const size_t N,
+                                  const ptrdiff_t offset) {
   for (size_t k = 0; k < N; k++) {
     if (bparts[k].gpart) {
       bparts[k].gpart->id_or_neg_offset = -(k + offset);
@@ -88,7 +89,7 @@ void part_relink_gparts_to_bparts(struct bpart *bparts, size_t N,
  * @param N The number of particles to re-link;
  * @param parts The global #part array in which to find the #gpart offsets.
  */
-void part_relink_parts_to_gparts(struct gpart *gparts, size_t N,
+void part_relink_parts_to_gparts(struct gpart *gparts, const size_t N,
                                  struct part *parts) {
   for (size_t k = 0; k < N; k++) {
     if (gparts[k].type == swift_type_gas) {
@@ -104,7 +105,7 @@ void part_relink_parts_to_gparts(struct gpart *gparts, size_t N,
  * @param N The number of particles to re-link;
  * @param sparts The global #spart array in which to find the #gpart offsets.
  */
-void part_relink_sparts_to_gparts(struct gpart *gparts, size_t N,
+void part_relink_sparts_to_gparts(struct gpart *gparts, const size_t N,
                                   struct spart *sparts) {
   for (size_t k = 0; k < N; k++) {
     if (gparts[k].type == swift_type_stars) {
@@ -120,7 +121,7 @@ void part_relink_sparts_to_gparts(struct gpart *gparts, size_t N,
  * @param N The number of particles to re-link;
  * @param bparts The global #bpart array in which to find the #gpart offsets.
  */
-void part_relink_bparts_to_gparts(struct gpart *gparts, size_t N,
+void part_relink_bparts_to_gparts(struct gpart *gparts, const size_t N,
                                   struct bpart *bparts) {
   for (size_t k = 0; k < N; k++) {
     if (gparts[k].type == swift_type_black_hole) {
@@ -130,19 +131,34 @@ void part_relink_bparts_to_gparts(struct gpart *gparts, size_t N,
 }
 
 /**
- * @brief Re-link both the #part%s and #spart%s associated with the list of
- * #gpart%s.
+ * @brief Helper structure to pass data to the liking mapper functions.
+ */
+struct relink_data {
+  struct part *const parts;
+  struct gpart *const garts;
+  struct spart *const sparts;
+  struct bpart *const bparts;
+};
+
+/**
+ * @brief #threadpool mapper function for the linking of all particle types
+ * to the #gpart array.
  *
- * @param gparts The list of #gpart.
- * @param N The number of particles to re-link;
- * @param parts The global #part array in which to find the #gpart offsets.
- * @param sparts The global #spart array in which to find the #gpart offsets.
- * @param bparts The global #bpart array in which to find the #gpart offsets.
+ * @brief map_data The array of #gpart.
+ * @brief count The number of #gpart.
+ * @brief extra_data the #relink_data containing pointer to the other arrays.
  */
-void part_relink_all_parts_to_gparts(struct gpart *gparts, size_t N,
-                                     struct part *parts, struct spart *sparts,
-                                     struct bpart *bparts) {
-  for (size_t k = 0; k < N; k++) {
+void part_relink_all_parts_to_gparts_mapper(void *restrict map_data, int count,
+                                            void *restrict extra_data) {
+
+  /* Un-pack the data */
+  struct relink_data *data = (struct relink_data *)extra_data;
+  struct part *const parts = data->parts;
+  struct spart *const sparts = data->sparts;
+  struct bpart *const bparts = data->bparts;
+  struct gpart *const gparts = (struct gpart *)map_data;
+
+  for (int k = 0; k < count; k++) {
     if (gparts[k].type == swift_type_gas) {
       parts[-gparts[k].id_or_neg_offset].gpart = &gparts[k];
     } else if (gparts[k].type == swift_type_stars) {
@@ -153,6 +169,31 @@ void part_relink_all_parts_to_gparts(struct gpart *gparts, size_t N,
   }
 }
 
+/**
+ * @brief Re-link both the #part%s, #spart%s and #bpart%s associated
+ * with the list of #gpart%s.
+ *
+ * This function uses thread parallelism and should not be called inside
+ * an already threaded section (unlike the functions linking individual arrays
+ * that are designed to be called in thread-parallel code).
+ *
+ * @param gparts The list of #gpart.
+ * @param N The number of particles to re-link;
+ * @param parts The global #part array in which to find the #gpart offsets.
+ * @param sparts The global #spart array in which to find the #gpart offsets.
+ * @param bparts The global #bpart array in which to find the #gpart offsets.
+ * @param tp The #threadpool object.
+ */
+void part_relink_all_parts_to_gparts(struct gpart *gparts, const size_t N,
+                                     struct part *parts, struct spart *sparts,
+                                     struct bpart *bparts,
+                                     struct threadpool *tp) {
+
+  struct relink_data data = {parts, /*gparts=*/NULL, sparts, bparts};
+  threadpool_map(tp, part_relink_all_parts_to_gparts_mapper, gparts, N,
+                 sizeof(struct gpart), 0, &data);
+}
+
 /**
  * @brief Verifies that the #gpart, #part and #spart are correctly linked
  * together
diff --git a/src/part.h b/src/part.h
index d0ff99089526c223f148d32058003edaf474126d..3f2d6371156e5e259f095f88ac744be50be845eb 100644
--- a/src/part.h
+++ b/src/part.h
@@ -32,9 +32,10 @@
 
 /* Local headers. */
 #include "align.h"
-#include "fof.h"
 #include "part_type.h"
-#include "timeline.h"
+
+/* Pre-declarations */
+struct threadpool;
 
 /* Some constants. */
 #define part_align 128
@@ -119,21 +120,22 @@
 #error "Invalid choice of black hole particle"
 #endif
 
-void part_relink_gparts_to_parts(struct part *parts, size_t N,
-                                 ptrdiff_t offset);
-void part_relink_gparts_to_sparts(struct spart *sparts, size_t N,
-                                  ptrdiff_t offset);
-void part_relink_gparts_to_bparts(struct bpart *bparts, size_t N,
-                                  ptrdiff_t offset);
-void part_relink_parts_to_gparts(struct gpart *gparts, size_t N,
+void part_relink_gparts_to_parts(struct part *parts, const size_t N,
+                                 const ptrdiff_t offset);
+void part_relink_gparts_to_sparts(struct spart *sparts, const size_t N,
+                                  const ptrdiff_t offset);
+void part_relink_gparts_to_bparts(struct bpart *bparts, const size_t N,
+                                  const ptrdiff_t offset);
+void part_relink_parts_to_gparts(struct gpart *gparts, const size_t N,
                                  struct part *parts);
-void part_relink_sparts_to_gparts(struct gpart *gparts, size_t N,
+void part_relink_sparts_to_gparts(struct gpart *gparts, const size_t N,
                                   struct spart *sparts);
-void part_relink_bparts_to_gparts(struct gpart *gparts, size_t N,
+void part_relink_bparts_to_gparts(struct gpart *gparts, const size_t N,
                                   struct bpart *bparts);
-void part_relink_all_parts_to_gparts(struct gpart *gparts, size_t N,
+void part_relink_all_parts_to_gparts(struct gpart *gparts, const size_t N,
                                      struct part *parts, struct spart *sparts,
-                                     struct bpart *bparts);
+                                     struct bpart *bparts,
+                                     struct threadpool *tp);
 void part_verify_links(struct part *parts, struct gpart *gparts,
                        struct spart *sparts, struct bpart *bparts,
                        size_t nr_parts, size_t nr_gparts, size_t nr_sparts,
diff --git a/src/partition.c b/src/partition.c
index 2ee321c872e6c34ff4ea82a9369c4637d56bb1d9..9004adb2adb54e980174608ffb30d546c69911ed 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -60,13 +60,15 @@
 #include "partition.h"
 #include "restart.h"
 #include "space.h"
+#include "threadpool.h"
 #include "tools.h"
 
 /* Simple descriptions of initial partition types for reports. */
 const char *initial_partition_name[] = {
     "axis aligned grids of cells", "vectorized point associated cells",
     "memory balanced, using particle weighted cells",
-    "similar sized regions, using unweighted cells"};
+    "similar sized regions, using unweighted cells",
+    "memory and edge balanced cells using particle weights"};
 
 /* Simple descriptions of repartition types for reports. */
 const char *repartition_name[] = {
@@ -399,55 +401,158 @@ static void ACCUMULATE_SIZES_MAPPER(gpart);
  */
 static void ACCUMULATE_SIZES_MAPPER(spart);
 
+/* qsort support. */
+static int ptrcmp(const void *p1, const void *p2) {
+  const double *v1 = *(const double **)p1;
+  const double *v2 = *(const double **)p2;
+  return (*v1) - (*v2);
+}
+
 /**
  * @brief Accumulate total memory size in particles per cell.
  *
  * @param s the space containing the cells.
+ * @param verbose whether to report any clipped cell counts.
  * @param counts the number of bytes in particles per cell. Should be
  *               allocated as size s->nr_cells.
  */
-static void accumulate_sizes(struct space *s, double *counts) {
+static void accumulate_sizes(struct space *s, int verbose, double *counts) {
 
   bzero(counts, sizeof(double) * s->nr_cells);
 
   struct counts_mapper_data mapper_data;
-  mapper_data.counts = counts;
   mapper_data.s = s;
+  double gsize = 0.0;
+  double *gcounts = NULL;
+  double hsize = 0.0;
+  double ssize = 0.0;
 
-  double hsize = (double)sizeof(struct part);
-  if (s->nr_parts > 0) {
-    mapper_data.size = hsize;
-    threadpool_map(&s->e->threadpool, accumulate_sizes_mapper_part, s->parts,
-                   s->nr_parts, sizeof(struct part), space_splitsize,
-                   &mapper_data);
-  }
-
-  double gsize = (double)sizeof(struct gpart);
   if (s->nr_gparts > 0) {
+    /* Self-gravity gets more efficient with density (see gitlab issue #640)
+     * so we end up with too much weight in cells with larger numbers of
+     * gparts, to suppress this we fix a upper weight limit based on a
+     * percentile clip to on the numbers of cells. Should be relatively
+     * harmless when not really needed. */
+    if ((gcounts = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL)
+      error("Failed to allocate gcounts buffer.");
+    bzero(gcounts, sizeof(double) * s->nr_cells);
+    gsize = (double)sizeof(struct gpart);
+
+    mapper_data.counts = gcounts;
     mapper_data.size = gsize;
     threadpool_map(&s->e->threadpool, accumulate_sizes_mapper_gpart, s->gparts,
                    s->nr_gparts, sizeof(struct gpart), space_splitsize,
                    &mapper_data);
+
+    /* Get all the counts from all the nodes. */
+    if (MPI_Allreduce(MPI_IN_PLACE, gcounts, s->nr_cells, MPI_DOUBLE, MPI_SUM,
+                      MPI_COMM_WORLD) != MPI_SUCCESS)
+      error("Failed to allreduce particle cell gpart weights.");
+
+    /* Now we need to sort... */
+    double **ptrs = NULL;
+    if ((ptrs = (double **)malloc(sizeof(double *) * s->nr_cells)) == NULL)
+      error("Failed to allocate pointers buffer.");
+    for (int k = 0; k < s->nr_cells; k++) {
+      ptrs[k] = &gcounts[k];
+    }
+
+    /* Sort pointers, not counts... */
+    qsort(ptrs, s->nr_cells, sizeof(double *), ptrcmp);
+
+    /* Percentile cut keeps 99.8% of cells and clips above. */
+    int cut = ceil(s->nr_cells * 0.998);
+
+    /* And clip. */
+    int nadj = 0;
+    double clip = *ptrs[cut];
+    for (int k = cut + 1; k < s->nr_cells; k++) {
+      *ptrs[k] = clip;
+      nadj++;
+    }
+    if (verbose) message("clipped gravity part counts of %d cells", nadj);
+    free(ptrs);
+  }
+
+  /* Other particle types are assumed to correlate with processing time. */
+  if (s->nr_parts > 0) {
+    mapper_data.counts = counts;
+    hsize = (double)sizeof(struct part);
+    mapper_data.size = hsize;
+    threadpool_map(&s->e->threadpool, accumulate_sizes_mapper_part, s->parts,
+                   s->nr_parts, sizeof(struct part), space_splitsize,
+                   &mapper_data);
   }
 
-  double ssize = (double)sizeof(struct spart);
   if (s->nr_sparts > 0) {
+    ssize = (double)sizeof(struct spart);
     mapper_data.size = ssize;
     threadpool_map(&s->e->threadpool, accumulate_sizes_mapper_spart, s->sparts,
                    s->nr_sparts, sizeof(struct spart), space_splitsize,
                    &mapper_data);
   }
 
+  /* Merge the counts arrays across all nodes, if needed. Doesn't include any
+   * gparts. */
+  if (s->nr_parts > 0 || s->nr_sparts > 0) {
+    if (MPI_Allreduce(MPI_IN_PLACE, counts, s->nr_cells, MPI_DOUBLE, MPI_SUM,
+                      MPI_COMM_WORLD) != MPI_SUCCESS)
+      error("Failed to allreduce particle cell weights.");
+  }
+
+  /* And merge with gravity counts. */
+  double sum = 0.0;
+  if (s->nr_gparts > 0) {
+    for (int k = 0; k < s->nr_cells; k++) {
+      counts[k] += gcounts[k];
+      sum += counts[k];
+    }
+    free(gcounts);
+  } else {
+    for (int k = 0; k < s->nr_cells; k++) {
+      sum += counts[k];
+    }
+  }
+
   /* Keep the sum of particles across all ranks in the range of IDX_MAX. */
-  if ((s->e->total_nr_parts * hsize + s->e->total_nr_gparts * gsize +
-       s->e->total_nr_sparts * ssize) > (double)IDX_MAX) {
-    double vscale =
-        (double)(IDX_MAX - 1000) /
-        (double)(s->e->total_nr_parts * hsize + s->e->total_nr_gparts * gsize +
-                 s->e->total_nr_sparts * ssize);
+  if (sum > (double)(IDX_MAX - 10000)) {
+    double vscale = (double)(IDX_MAX - 10000) / sum;
     for (int k = 0; k < s->nr_cells; k++) counts[k] *= vscale;
   }
 }
+
+/**
+ * @brief Make edge weights from the accumulated particle sizes per cell.
+ *
+ * @param s the space containing the cells.
+ * @param counts the number of bytes in particles per cell.
+ * @param edges weights for the edges of these regions. Should be 26 * counts.
+ */
+static void sizes_to_edges(struct space *s, double *counts, double *edges) {
+
+  bzero(edges, sizeof(double) * s->nr_cells * 26);
+
+  for (int l = 0; l < s->nr_cells; l++) {
+    int p = 0;
+    for (int i = -1; i <= 1; i++) {
+      int isid = ((i < 0) ? 0 : ((i > 0) ? 2 : 1));
+      for (int j = -1; j <= 1; j++) {
+        int jsid = isid * 3 + ((j < 0) ? 0 : ((j > 0) ? 2 : 1));
+        for (int k = -1; k <= 1; k++) {
+          int ksid = jsid * 3 + ((k < 0) ? 0 : ((k > 0) ? 2 : 1));
+
+          /* If not self, we work out the sort indices to get the expected
+           * fractional weight and add that. Scale to keep sum less than
+           * counts and a bit of tuning... */
+          if (i || j || k) {
+            edges[l * 26 + p] = counts[l] * sid_scale[sortlistID[ksid]] / 26.0;
+            p++;
+          }
+        }
+      }
+    }
+  }
+}
 #endif
 
 #if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS))
@@ -464,7 +569,7 @@ static void split_metis(struct space *s, int nregions, int *celllist) {
 
   /* To check or visualise the partition dump all the cells. */
   /*if (engine_rank == 0) dumpCellRanks("metis_partition", s->cells_top,
-   * s->nr_cells);*/
+                                      s->nr_cells);*/
 }
 #endif
 
@@ -780,7 +885,7 @@ static void pick_parmetis(int nodeID, struct space *s, int nregions,
 
     /* Dump graphs to disk files for testing. */
     /*dumpMETISGraph("parmetis_graph", ncells, 1, std_xadj, full_adjncy,
-      full_weights_v, NULL, full_weights_e);*/
+                   full_weights_v, NULL, full_weights_e); */
 
     /* xadj is set for each rank, different to serial version in that each
      * rank starts with 0, so we need to re-offset. */
@@ -1213,7 +1318,7 @@ static void pick_metis(int nodeID, struct space *s, int nregions,
 
     /* Dump graph in METIS format */
     /*dumpMETISGraph("metis_graph", idx_ncells, one, xadj, adjncy, weights_v,
-      NULL, weights_e);*/
+                   NULL, weights_e);*/
 
     if (METIS_PartGraphKway(&idx_ncells, &one, xadj, adjncy, weights_v, NULL,
                             weights_e, &idx_nregions, NULL, NULL, options,
@@ -1482,7 +1587,8 @@ static void repart_edge_metis(int vweights, int eweights, int timebins,
   ticks tic = getticks();
 
   threadpool_map(&s->e->threadpool, partition_gather_weights, tasks, nr_tasks,
-                 sizeof(struct task), 0, &weights_data);
+                 sizeof(struct task), threadpool_auto_chunk_size,
+                 &weights_data);
   if (s->e->verbose)
     message("weight mapper took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
@@ -1536,22 +1642,22 @@ static void repart_edge_metis(int vweights, int eweights, int timebins,
   if (vweights && eweights) {
     if (vsum > esum) {
       if (vsum > (double)IDX_MAX) {
-        vscale = (double)(IDX_MAX - 1000) / vsum;
+        vscale = (double)(IDX_MAX - 10000) / vsum;
         escale = vscale;
       }
     } else {
       if (esum > (double)IDX_MAX) {
-        escale = (double)(IDX_MAX - 1000) / esum;
+        escale = (double)(IDX_MAX - 10000) / esum;
         vscale = escale;
       }
     }
   } else if (vweights) {
     if (vsum > (double)IDX_MAX) {
-      vscale = (double)(IDX_MAX - 1000) / vsum;
+      vscale = (double)(IDX_MAX - 10000) / vsum;
     }
   } else if (eweights) {
     if (esum > (double)IDX_MAX) {
-      escale = (double)(IDX_MAX - 1000) / esum;
+      escale = (double)(IDX_MAX - 10000) / esum;
     }
   }
 
@@ -1655,17 +1761,11 @@ static void repart_memory_metis(struct repartition *repartition, int nodeID,
   double *weights = NULL;
   if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL)
     error("Failed to allocate cell weights buffer.");
-  bzero(weights, sizeof(double) * s->nr_cells);
 
   /* Check each particle and accumulate the sizes per cell. */
-  accumulate_sizes(s, weights);
+  accumulate_sizes(s, s->e->verbose, weights);
 
-  /* Get all the counts from all the nodes. */
-  if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE, MPI_SUM,
-                    MPI_COMM_WORLD) != MPI_SUCCESS)
-    error("Failed to allreduce particle cell weights.");
-
-    /* Allocate cell list for the partition. If not already done. */
+  /* Allocate cell list for the partition. If not already done. */
 #ifdef HAVE_PARMETIS
   int refine = 1;
 #endif
@@ -1839,27 +1939,38 @@ void partition_initial_partition(struct partition *initial_partition,
     }
 
   } else if (initial_partition->type == INITPART_METIS_WEIGHT ||
+             initial_partition->type == INITPART_METIS_WEIGHT_EDGE ||
              initial_partition->type == INITPART_METIS_NOWEIGHT) {
 #if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS))
     /* Simple k-way partition selected by METIS using cell particle
      * counts as weights or not. Should be best when starting with a
      * inhomogeneous dist.
      */
-
-    /* Space for particles sizes per cell, which will be used as weights. */
-    double *weights = NULL;
+    double *weights_v = NULL;
+    double *weights_e = NULL;
     if (initial_partition->type == INITPART_METIS_WEIGHT) {
-      if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL)
-        error("Failed to allocate weights buffer.");
-      bzero(weights, sizeof(double) * s->nr_cells);
+      /* Particles sizes per cell, which will be used as weights. */
+      if ((weights_v = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL)
+        error("Failed to allocate weights_v buffer.");
+
+      /* Check each particle and accumulate the sizes per cell. */
+      accumulate_sizes(s, s->e->verbose, weights_v);
+
+    } else if (initial_partition->type == INITPART_METIS_WEIGHT_EDGE) {
 
-      /* Check each particle and accumilate the sizes per cell. */
-      accumulate_sizes(s, weights);
+      /* Particle sizes also counted towards the edges. */
 
-      /* Get all the counts from all the nodes. */
-      if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE, MPI_SUM,
-                        MPI_COMM_WORLD) != MPI_SUCCESS)
-        error("Failed to allreduce particle cell weights.");
+      if ((weights_v = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL)
+        error("Failed to allocate weights_v buffer.");
+      if ((weights_e = (double *)malloc(sizeof(double) * s->nr_cells * 26)) ==
+          NULL)
+        error("Failed to allocate weights_e buffer.");
+
+      /* Check each particle and accumulate the sizes per cell. */
+      accumulate_sizes(s, s->e->verbose, weights_v);
+
+      /* Spread these into edge weights. */
+      sizes_to_edges(s, weights_v, weights_e);
     }
 
     /* Do the calculation. */
@@ -1868,12 +1979,13 @@ void partition_initial_partition(struct partition *initial_partition,
       error("Failed to allocate celllist");
 #ifdef HAVE_PARMETIS
     if (initial_partition->usemetis) {
-      pick_metis(nodeID, s, nr_nodes, weights, NULL, celllist);
+      pick_metis(nodeID, s, nr_nodes, weights_v, weights_e, celllist);
     } else {
-      pick_parmetis(nodeID, s, nr_nodes, weights, NULL, 0, 0, 0.0f, celllist);
+      pick_parmetis(nodeID, s, nr_nodes, weights_v, weights_e, 0, 0, 0.0f,
+                    celllist);
     }
 #else
-    pick_metis(nodeID, s, nr_nodes, weights, NULL, celllist);
+    pick_metis(nodeID, s, nr_nodes, weights_v, weights_e, celllist);
 #endif
 
     /* And apply to our cells */
@@ -1888,7 +2000,8 @@ void partition_initial_partition(struct partition *initial_partition,
       partition_initial_partition(initial_partition, nodeID, nr_nodes, s);
     }
 
-    if (weights != NULL) free(weights);
+    if (weights_v != NULL) free(weights_v);
+    if (weights_e != NULL) free(weights_e);
     free(celllist);
 #else
     error("SWIFT was not compiled with METIS or ParMETIS support");
@@ -1943,7 +2056,7 @@ void partition_init(struct partition *partition,
 /* Defaults make use of METIS if available */
 #if defined(HAVE_METIS) || defined(HAVE_PARMETIS)
   const char *default_repart = "fullcosts";
-  const char *default_part = "memory";
+  const char *default_part = "edgememory";
 #else
   const char *default_repart = "none";
   const char *default_part = "grid";
@@ -1974,10 +2087,13 @@ void partition_init(struct partition *partition,
     case 'm':
       partition->type = INITPART_METIS_WEIGHT;
       break;
+    case 'e':
+      partition->type = INITPART_METIS_WEIGHT_EDGE;
+      break;
     default:
       message("Invalid choice of initial partition type '%s'.", part_type);
       error(
-          "Permitted values are: 'grid', 'region', 'memory' or "
+          "Permitted values are: 'grid', 'region', 'memory', 'edgememory' or "
           "'vectorized'");
 #else
     default:
diff --git a/src/partition.h b/src/partition.h
index de0d95a5e343f1aa85a03c2cda49019f2fd08037..8f6dbbd148d510650578ff0a857f9fd29c3e3c25 100644
--- a/src/partition.h
+++ b/src/partition.h
@@ -28,7 +28,8 @@ enum partition_type {
   INITPART_GRID = 0,
   INITPART_VECTORIZE,
   INITPART_METIS_WEIGHT,
-  INITPART_METIS_NOWEIGHT
+  INITPART_METIS_NOWEIGHT,
+  INITPART_METIS_WEIGHT_EDGE
 };
 
 /* Simple descriptions of types for reports. */
diff --git a/src/proxy.c b/src/proxy.c
index 338f1beaa8734168d8139902a2746cd00250bffd..4b8fdfcbd8f6e96a0d32cad55d563cc809471d20 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -43,6 +43,7 @@
 #include "error.h"
 #include "memuse.h"
 #include "space.h"
+#include "threadpool.h"
 
 #ifdef WITH_MPI
 /* MPI data type for the communications */
@@ -394,7 +395,7 @@ void proxy_cells_exchange(struct proxy *proxies, int num_proxies,
   /* Run through the cells and get the size of the ones that will be sent off.
    */
   threadpool_map(&s->e->threadpool, proxy_cells_count_mapper, s->cells_top,
-                 s->nr_cells, sizeof(struct cell), /*chunk=*/0,
+                 s->nr_cells, sizeof(struct cell), threadpool_auto_chunk_size,
                  /*extra_data=*/NULL);
   int count_out = 0;
   int *offset =
@@ -421,7 +422,8 @@ void proxy_cells_exchange(struct proxy *proxies, int num_proxies,
   /* Pack the cells. */
   struct pack_mapper_data data = {s, offset, pcells, with_gravity};
   threadpool_map(&s->e->threadpool, proxy_cells_pack_mapper, s->cells_top,
-                 s->nr_cells, sizeof(struct cell), /*chunk=*/0, &data);
+                 s->nr_cells, sizeof(struct cell), threadpool_auto_chunk_size,
+                 &data);
 
   if (s->e->verbose)
     message("Packing cells took %.3f %s.", clocks_from_ticks(getticks() - tic2),
@@ -429,7 +431,7 @@ void proxy_cells_exchange(struct proxy *proxies, int num_proxies,
 
   /* Launch the first part of the exchange. */
   threadpool_map(&s->e->threadpool, proxy_cells_exchange_first_mapper, proxies,
-                 num_proxies, sizeof(struct proxy), /*chunk=*/0,
+                 num_proxies, sizeof(struct proxy), threadpool_auto_chunk_size,
                  /*extra_data=*/NULL);
   for (int k = 0; k < num_proxies; k++) {
     reqs_in[k] = proxies[k].req_cells_count_in;
diff --git a/src/queue.c b/src/queue.c
index dcb8d450500110dab7d7d5b11bae7e402cd683c2..8f532a6f24d833dcad7fa1136a1d5400b01de430 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -308,3 +308,33 @@ void queue_clean(struct queue *q) {
   free(q->entries);
   free(q->tid_incoming);
 }
+
+/**
+ * @brief Dump a formatted list of tasks in the queue to the given file stream.
+ *
+ * @param nodeID the node id of this rank.
+ * @param index a number for this queue, added to the output.
+ * @param file the FILE stream, should opened for write.
+ * @param q The task #queue.
+ */
+void queue_dump(int nodeID, int index, FILE *file, struct queue *q) {
+
+  swift_lock_type *qlock = &q->lock;
+
+  /* Grab the queue lock. */
+  if (lock_lock(qlock) != 0) error("Locking the qlock failed.\n");
+
+  /* Fill any tasks from the incoming DEQ. */
+  queue_get_incoming(q);
+
+  /* Loop over the queue entries. */
+  for (int k = 0; k < q->count; k++) {
+    struct task *t = &q->tasks[q->tid[k]];
+
+    fprintf(file, "%d %d %d %s %s %.2f\n", nodeID, index, k,
+            taskID_names[t->type], subtaskID_names[t->subtype], t->weight);
+  }
+
+  /* Release the task lock. */
+  if (lock_unlock(qlock) != 0) error("Unlocking the qlock failed.\n");
+}
diff --git a/src/queue.h b/src/queue.h
index bcc3134610590f43f27163eb52e7e171ea5287fc..0576403bef8ed66dd408d40748435530155a7901 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -84,4 +84,6 @@ void queue_init(struct queue *q, struct task *tasks);
 void queue_insert(struct queue *q, struct task *t);
 void queue_clean(struct queue *q);
 
+void queue_dump(int nodeID, int index, FILE *file, struct queue *q);
+
 #endif /* SWIFT_QUEUE_H */
diff --git a/src/runner_ghost.c b/src/runner_ghost.c
index 2aaf172ea46aa285603270ca5f8fdb1d2b10c1c7..a1f38c9936578e63a31097cdd1b577c8fc6f762b 100644
--- a/src/runner_ghost.c
+++ b/src/runner_ghost.c
@@ -472,7 +472,7 @@ void runner_do_stars_ghost(struct runner *r, struct cell *c, int timer) {
    * Therefore we need to update h_max between the super- and top-levels */
   if (c->stars.ghost) {
     for (struct cell *tmp = c->parent; tmp != NULL; tmp = tmp->parent) {
-      atomic_max_d(&tmp->stars.h_max, h_max);
+      atomic_max_f(&tmp->stars.h_max, h_max);
     }
   }
 
@@ -783,7 +783,7 @@ void runner_do_black_holes_density_ghost(struct runner *r, struct cell *c,
    * Therefore we need to update h_max between the super- and top-levels */
   if (c->black_holes.density_ghost) {
     for (struct cell *tmp = c->parent; tmp != NULL; tmp = tmp->parent) {
-      atomic_max_d(&tmp->black_holes.h_max, h_max);
+      atomic_max_f(&tmp->black_holes.h_max, h_max);
     }
   }
 
@@ -809,7 +809,8 @@ void runner_do_black_holes_swallow_ghost(struct runner *r, struct cell *c,
   TIMER_TIC;
 
   /* Anything to do here? */
-  if (!cell_is_active_hydro(c, e)) return;
+  if (c->black_holes.count == 0) return;
+  if (!cell_is_active_black_holes(c, e)) return;
 
   /* Recurse? */
   if (c->split) {
@@ -1378,7 +1379,7 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) {
    * Therefore we need to update h_max between the super- and top-levels */
   if (c->hydro.ghost) {
     for (struct cell *tmp = c->parent; tmp != NULL; tmp = tmp->parent) {
-      atomic_max_d(&tmp->hydro.h_max, h_max);
+      atomic_max_f(&tmp->hydro.h_max, h_max);
     }
   }
 
diff --git a/src/runner_others.c b/src/runner_others.c
index 2f7a82e2ff0158e99987b25c3bc9c6e7f3c15dec..6dcc92d8a492621346f92e455bbb02fae88e1fb0 100644
--- a/src/runner_others.c
+++ b/src/runner_others.c
@@ -45,6 +45,7 @@
 #include "engine.h"
 #include "error.h"
 #include "feedback.h"
+#include "fof.h"
 #include "gravity.h"
 #include "hydro.h"
 #include "logger.h"
@@ -145,6 +146,7 @@ void runner_do_grav_mesh(struct runner *r, struct cell *c, int timer) {
  * @param timer 1 if the time is to be recorded.
  */
 void runner_do_cooling(struct runner *r, struct cell *c, int timer) {
+
   const struct engine *e = r->e;
   const struct cosmology *cosmo = e->cosmology;
   const int with_cosmology = (e->policy & engine_policy_cosmology);
@@ -202,8 +204,8 @@ void runner_do_cooling(struct runner *r, struct cell *c, int timer) {
 
         /* Let's cool ! */
         cooling_cool_part(constants, us, cosmo, hydro_props,
-                          entropy_floor_props, cooling_func, p, xp, time,
-                          dt_cool, dt_therm);
+                          entropy_floor_props, cooling_func, p, xp, dt_cool,
+                          dt_therm, time);
 
         /* Apply the effects of feedback on this particle
          * (Note: Only used in schemes that have a delayed feedback mechanism
diff --git a/src/scheduler.c b/src/scheduler.c
index 28482ae281cfd0ca4c6d1da1b5134ab77441d466..24efe915a8a5e1f7a9cf066fd356d54e1dfc7b0d 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -53,6 +53,7 @@
 #include "space.h"
 #include "space_getsid.h"
 #include "task.h"
+#include "threadpool.h"
 #include "timers.h"
 #include "version.h"
 
@@ -1059,12 +1060,14 @@ void scheduler_splittasks(struct scheduler *s, const int fof_tasks,
   if (fof_tasks) {
     /* Call the mapper on each current task. */
     threadpool_map(s->threadpool, scheduler_splittasks_fof_mapper, s->tasks,
-                   s->nr_tasks, sizeof(struct task), 0, s);
+                   s->nr_tasks, sizeof(struct task), threadpool_auto_chunk_size,
+                   s);
 
   } else {
     /* Call the mapper on each current task. */
     threadpool_map(s->threadpool, scheduler_splittasks_mapper, s->tasks,
-                   s->nr_tasks, sizeof(struct task), 0, s);
+                   s->nr_tasks, sizeof(struct task), threadpool_auto_chunk_size,
+                   s);
   }
 }
 
@@ -1363,21 +1366,35 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
                (sizeof(int) * 8 - intrinsics_clz(t->ci->stars.count));
         break;
 
+      case task_type_stars_resort:
+        cost = wscale * intrinsics_popcount(t->flags) * scount_i *
+               (sizeof(int) * 8 - intrinsics_clz(t->ci->stars.count));
+        break;
+
       case task_type_self:
         if (t->subtype == task_subtype_grav) {
           cost = 1.f * (wscale * gcount_i) * gcount_i;
         } else if (t->subtype == task_subtype_external_grav)
           cost = 1.f * wscale * gcount_i;
-        else if (t->subtype == task_subtype_stars_density)
-          cost = 1.f * wscale * scount_i * count_i;
-        else if (t->subtype == task_subtype_stars_feedback)
+        else if (t->subtype == task_subtype_stars_density ||
+                 t->subtype == task_subtype_stars_feedback)
           cost = 1.f * wscale * scount_i * count_i;
-        else if (t->subtype == task_subtype_bh_density)
-          cost = 1.f * wscale * bcount_i * count_i;
-        else if (t->subtype == task_subtype_bh_feedback)
+        else if (t->subtype == task_subtype_bh_density ||
+                 t->subtype == task_subtype_bh_swallow ||
+                 t->subtype == task_subtype_bh_feedback)
           cost = 1.f * wscale * bcount_i * count_i;
-        else  // hydro loops
+        else if (t->subtype == task_subtype_do_gas_swallow)
+          cost = 1.f * wscale * count_i;
+        else if (t->subtype == task_subtype_do_bh_swallow)
+          cost = 1.f * wscale * bcount_i;
+        else if (t->subtype == task_subtype_density ||
+                 t->subtype == task_subtype_gradient ||
+                 t->subtype == task_subtype_force ||
+                 t->subtype == task_subtype_limiter)
           cost = 1.f * (wscale * count_i) * count_i;
+        else
+          error("Untreated sub-type for selfs: %s",
+                subtaskID_names[t->subtype]);
         break;
 
       case task_type_pair:
@@ -1398,6 +1415,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
                    sid_scale[t->flags];
 
         } else if (t->subtype == task_subtype_bh_density ||
+                   t->subtype == task_subtype_bh_swallow ||
                    t->subtype == task_subtype_bh_feedback) {
           if (t->ci->nodeID != nodeID)
             cost = 3.f * wscale * count_i * bcount_j * sid_scale[t->flags];
@@ -1407,11 +1425,24 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
             cost = 2.f * wscale * (bcount_i * count_j + bcount_j * count_i) *
                    sid_scale[t->flags];
 
-        } else {  // hydro loops
+        } else if (t->subtype == task_subtype_do_gas_swallow) {
+          cost = 1.f * wscale * (count_i + count_j);
+
+        } else if (t->subtype == task_subtype_do_bh_swallow) {
+          cost = 1.f * wscale * (bcount_i + bcount_j);
+
+        } else if (t->subtype == task_subtype_density ||
+                   t->subtype == task_subtype_gradient ||
+                   t->subtype == task_subtype_force ||
+                   t->subtype == task_subtype_limiter) {
           if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID)
             cost = 3.f * (wscale * count_i) * count_j * sid_scale[t->flags];
           else
             cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags];
+
+        } else {
+          error("Untreated sub-type for pairs: %s",
+                subtaskID_names[t->subtype]);
         }
         break;
 
@@ -1431,6 +1462,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
           }
 
         } else if (t->subtype == task_subtype_bh_density ||
+                   t->subtype == task_subtype_bh_swallow ||
                    t->subtype == task_subtype_bh_feedback) {
           if (t->ci->nodeID != nodeID) {
             cost = 3.f * (wscale * count_i) * bcount_j * sid_scale[t->flags];
@@ -1441,26 +1473,48 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
                    sid_scale[t->flags];
           }
 
-        } else {  // hydro loops
+        } else if (t->subtype == task_subtype_do_gas_swallow) {
+          cost = 1.f * wscale * (count_i + count_j);
+
+        } else if (t->subtype == task_subtype_do_bh_swallow) {
+          cost = 1.f * wscale * (bcount_i + bcount_j);
+
+        } else if (t->subtype == task_subtype_density ||
+                   t->subtype == task_subtype_gradient ||
+                   t->subtype == task_subtype_force ||
+                   t->subtype == task_subtype_limiter) {
           if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) {
             cost = 3.f * (wscale * count_i) * count_j * sid_scale[t->flags];
           } else {
             cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags];
           }
+
+        } else {
+          error("Untreated sub-type for sub-pairs: %s",
+                subtaskID_names[t->subtype]);
         }
         break;
 
       case task_type_sub_self:
-        if (t->subtype == task_subtype_stars_density) {
-          cost = 1.f * (wscale * scount_i) * count_i;
-        } else if (t->subtype == task_subtype_stars_feedback) {
+        if (t->subtype == task_subtype_stars_density ||
+            t->subtype == task_subtype_stars_feedback) {
           cost = 1.f * (wscale * scount_i) * count_i;
-        } else if (t->subtype == task_subtype_bh_density) {
-          cost = 1.f * (wscale * bcount_i) * count_i;
-        } else if (t->subtype == task_subtype_bh_feedback) {
+        } else if (t->subtype == task_subtype_bh_density ||
+                   t->subtype == task_subtype_bh_swallow ||
+                   t->subtype == task_subtype_bh_feedback) {
           cost = 1.f * (wscale * bcount_i) * count_i;
-        } else {
+        } else if (t->subtype == task_subtype_do_gas_swallow) {
+          cost = 1.f * wscale * count_i;
+        } else if (t->subtype == task_subtype_do_bh_swallow) {
+          cost = 1.f * wscale * bcount_i;
+        } else if (t->subtype == task_subtype_density ||
+                   t->subtype == task_subtype_gradient ||
+                   t->subtype == task_subtype_force ||
+                   t->subtype == task_subtype_limiter) {
           cost = 1.f * (wscale * count_i) * count_i;
+        } else {
+          error("Untreated sub-type for sub-selfs: %s",
+                subtaskID_names[t->subtype]);
         }
         break;
       case task_type_ghost:
@@ -1475,6 +1529,9 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
       case task_type_bh_density_ghost:
         if (t->ci == t->ci->hydro.super) cost = wscale * bcount_i;
         break;
+      case task_type_bh_swallow_ghost2:
+        if (t->ci == t->ci->hydro.super) cost = wscale * bcount_i;
+        break;
       case task_type_drift_part:
         cost = wscale * count_i;
         break;
@@ -1496,6 +1553,9 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
       case task_type_grav_long_range:
         cost = wscale * gcount_i;
         break;
+      case task_type_grav_mesh:
+        cost = wscale * gcount_i;
+        break;
       case task_type_grav_mm:
         cost = wscale * (gcount_i + gcount_j);
         break;
@@ -1520,6 +1580,12 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
       case task_type_timestep:
         cost = wscale * (count_i + gcount_i + scount_i + bcount_i);
         break;
+      case task_type_timestep_limiter:
+        cost = wscale * count_i;
+        break;
+      case task_type_timestep_sync:
+        cost = wscale * count_i;
+        break;
       case task_type_send:
         if (count_i < 1e5)
           cost = 10.f * (wscale * count_i) * count_i;
@@ -1610,7 +1676,7 @@ void scheduler_start(struct scheduler *s) {
   /* Re-wait the tasks. */
   if (s->active_count > 1000) {
     threadpool_map(s->threadpool, scheduler_rewait_mapper, s->tid_active,
-                   s->active_count, sizeof(int), 0, s);
+                   s->active_count, sizeof(int), threadpool_auto_chunk_size, s);
   } else {
     scheduler_rewait_mapper(s->tid_active, s->active_count, s);
   }
@@ -1618,7 +1684,7 @@ void scheduler_start(struct scheduler *s) {
   /* Loop over the tasks and enqueue whoever is ready. */
   if (s->active_count > 1000) {
     threadpool_map(s->threadpool, scheduler_enqueue_mapper, s->tid_active,
-                   s->active_count, sizeof(int), 0, s);
+                   s->active_count, sizeof(int), threadpool_auto_chunk_size, s);
   } else {
     scheduler_enqueue_mapper(s->tid_active, s->active_count, s);
   }
@@ -2274,3 +2340,29 @@ void scheduler_write_task_level(const struct scheduler *s) {
   fclose(f);
   free(count);
 }
+/**
+ * @brief dump all the active queues of all the known schedulers into files.
+ *
+ * @param e the #scheduler
+ */
+void scheduler_dump_queues(struct engine *e) {
+
+  struct scheduler *s = &e->sched;
+  char dumpfile[35];
+
+#ifdef WITH_MPI
+  /* Open a file per rank and write the header. Use per rank to avoid MPI
+   * calls that can interact with other blocking ones.  */
+  snprintf(dumpfile, sizeof(dumpfile), "queue_dump_MPI-step%d.dat_%d", e->step,
+           e->nodeID);
+#else
+  snprintf(dumpfile, sizeof(dumpfile), "queue_dump-step%d.dat", e->step);
+#endif
+
+  FILE *file_thread = fopen(dumpfile, "w");
+  fprintf(file_thread, "# rank queue index type subtype weight\n");
+  for (int l = 0; l < s->nr_queues; l++) {
+    queue_dump(engine_rank, l, file_thread, &s->queues[l]);
+  }
+  fclose(file_thread);
+}
diff --git a/src/scheduler.h b/src/scheduler.h
index 694a88017b863d29ad9d15a4ae7d5acac5cf3223..23cb39afbd7ac890d2d3c7ecacef3bffe01b0128 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -202,5 +202,6 @@ void scheduler_clean(struct scheduler *s);
 void scheduler_free_tasks(struct scheduler *s);
 void scheduler_write_dependencies(struct scheduler *s, int verbose);
 void scheduler_write_task_level(const struct scheduler *s);
+void scheduler_dump_queues(struct engine *e);
 
 #endif /* SWIFT_SCHEDULER_H */
diff --git a/src/space.c b/src/space.c
index 3fffafc4b7cd27827d6a24e116676581c8ae9413..9d652042fd4361ea1fd76e09eeef4556c772c887 100644
--- a/src/space.c
+++ b/src/space.c
@@ -57,6 +57,7 @@
 #include "minmax.h"
 #include "multipole.h"
 #include "pressure_floor.h"
+#include "proxy.h"
 #include "restart.h"
 #include "sort_part.h"
 #include "star_formation.h"
@@ -301,7 +302,8 @@ void space_free_cells(struct space *s) {
   ticks tic = getticks();
 
   threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper, s->cells_top,
-                 s->nr_cells, sizeof(struct cell), 0, s);
+                 s->nr_cells, sizeof(struct cell), threadpool_auto_chunk_size,
+                 s);
   s->maxdepth = 0;
 
   if (s->e->verbose)
@@ -313,8 +315,10 @@ void space_free_cells(struct space *s) {
  * @brief Free any memory in use for foreign particles.
  *
  * @param s The #space.
+ * @param clear_cell_pointers Are we also setting all the foreign cell particle
+ * pointers to NULL?
  */
-void space_free_foreign_parts(struct space *s) {
+void space_free_foreign_parts(struct space *s, const int clear_cell_pointers) {
 
 #ifdef WITH_MPI
   if (s->parts_foreign != NULL) {
@@ -337,6 +341,13 @@ void space_free_foreign_parts(struct space *s) {
     s->size_bparts_foreign = 0;
     s->bparts_foreign = NULL;
   }
+  if (clear_cell_pointers) {
+    for (int k = 0; k < s->e->nr_proxies; k++) {
+      for (int j = 0; j < s->e->proxies[k].nr_cells_in; j++) {
+        cell_unlink_foreign_particles(s->e->proxies[k].cells_in[j]);
+      }
+    }
+  }
 #endif
 }
 
@@ -1996,7 +2007,8 @@ void space_split(struct space *s, int verbose) {
 
   threadpool_map(&s->e->threadpool, space_split_mapper,
                  s->local_cells_with_particles_top,
-                 s->nr_local_cells_with_particles, sizeof(int), 0, s);
+                 s->nr_local_cells_with_particles, sizeof(int),
+                 threadpool_auto_chunk_size, s);
 
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -2056,17 +2068,20 @@ void space_reorder_extras(struct space *s, int verbose) {
   /* Re-order the gas particles */
   if (space_extra_parts)
     threadpool_map(&s->e->threadpool, space_reorder_extra_parts_mapper,
-                   s->local_cells_top, s->nr_local_cells, sizeof(int), 0, s);
+                   s->local_cells_top, s->nr_local_cells, sizeof(int),
+                   threadpool_auto_chunk_size, s);
 
   /* Re-order the gravity particles */
   if (space_extra_gparts)
     threadpool_map(&s->e->threadpool, space_reorder_extra_gparts_mapper,
-                   s->local_cells_top, s->nr_local_cells, sizeof(int), 0, s);
+                   s->local_cells_top, s->nr_local_cells, sizeof(int),
+                   threadpool_auto_chunk_size, s);
 
   /* Re-order the star particles */
   if (space_extra_sparts)
     threadpool_map(&s->e->threadpool, space_reorder_extra_sparts_mapper,
-                   s->local_cells_top, s->nr_local_cells, sizeof(int), 0, s);
+                   s->local_cells_top, s->nr_local_cells, sizeof(int),
+                   threadpool_auto_chunk_size, s);
 
   /* Re-order the black hole particles */
   if (space_extra_bparts)
@@ -2100,7 +2115,8 @@ void space_sanitize(struct space *s) {
   if (s->e->nodeID == 0) message("Cleaning up unreasonable values of h");
 
   threadpool_map(&s->e->threadpool, space_sanitize_mapper, s->cells_top,
-                 s->nr_cells, sizeof(struct cell), 0, NULL);
+                 s->nr_cells, sizeof(struct cell), threadpool_auto_chunk_size,
+                 /*extra_data=*/NULL);
 }
 
 /**
@@ -2684,7 +2700,8 @@ void space_parts_get_cell_index(struct space *s, int *ind, int *cell_counts,
   data.count_extra_bpart = 0;
 
   threadpool_map(&s->e->threadpool, space_parts_get_cell_index_mapper, s->parts,
-                 s->nr_parts, sizeof(struct part), 0, &data);
+                 s->nr_parts, sizeof(struct part), threadpool_auto_chunk_size,
+                 &data);
 
   *count_inhibited_parts = data.count_inhibited_part;
   *count_extra_parts = data.count_extra_part;
@@ -2732,7 +2749,8 @@ void space_gparts_get_cell_index(struct space *s, int *gind, int *cell_counts,
   data.count_extra_bpart = 0;
 
   threadpool_map(&s->e->threadpool, space_gparts_get_cell_index_mapper,
-                 s->gparts, s->nr_gparts, sizeof(struct gpart), 0, &data);
+                 s->gparts, s->nr_gparts, sizeof(struct gpart),
+                 threadpool_auto_chunk_size, &data);
 
   *count_inhibited_gparts = data.count_inhibited_gpart;
   *count_extra_gparts = data.count_extra_gpart;
@@ -2780,7 +2798,8 @@ void space_sparts_get_cell_index(struct space *s, int *sind, int *cell_counts,
   data.count_extra_bpart = 0;
 
   threadpool_map(&s->e->threadpool, space_sparts_get_cell_index_mapper,
-                 s->sparts, s->nr_sparts, sizeof(struct spart), 0, &data);
+                 s->sparts, s->nr_sparts, sizeof(struct spart),
+                 threadpool_auto_chunk_size, &data);
 
   *count_inhibited_sparts = data.count_inhibited_spart;
   *count_extra_sparts = data.count_extra_spart;
@@ -2828,7 +2847,8 @@ void space_bparts_get_cell_index(struct space *s, int *bind, int *cell_counts,
   data.count_extra_bpart = 0;
 
   threadpool_map(&s->e->threadpool, space_bparts_get_cell_index_mapper,
-                 s->bparts, s->nr_bparts, sizeof(struct bpart), 0, &data);
+                 s->bparts, s->nr_bparts, sizeof(struct bpart),
+                 threadpool_auto_chunk_size, &data);
 
   *count_inhibited_bparts = data.count_inhibited_bpart;
   *count_extra_bparts = data.count_extra_bpart;
@@ -4184,15 +4204,18 @@ void space_synchronize_particle_positions(struct space *s) {
 
   if (s->nr_gparts > 0 && s->nr_parts > 0)
     threadpool_map(&s->e->threadpool, space_synchronize_part_positions_mapper,
-                   s->parts, s->nr_parts, sizeof(struct part), 0, (void *)s);
+                   s->parts, s->nr_parts, sizeof(struct part),
+                   threadpool_auto_chunk_size, (void *)s);
 
   if (s->nr_gparts > 0 && s->nr_sparts > 0)
     threadpool_map(&s->e->threadpool, space_synchronize_spart_positions_mapper,
-                   s->sparts, s->nr_sparts, sizeof(struct spart), 0, NULL);
+                   s->sparts, s->nr_sparts, sizeof(struct spart),
+                   threadpool_auto_chunk_size, /*extra_data=*/NULL);
 
   if (s->nr_gparts > 0 && s->nr_bparts > 0)
     threadpool_map(&s->e->threadpool, space_synchronize_bpart_positions_mapper,
-                   s->bparts, s->nr_bparts, sizeof(struct bpart), 0, NULL);
+                   s->bparts, s->nr_bparts, sizeof(struct bpart),
+                   threadpool_auto_chunk_size, /*extra_data=*/NULL);
 
   if (s->e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -4320,7 +4343,8 @@ void space_first_init_parts(struct space *s, int verbose) {
   const ticks tic = getticks();
   if (s->nr_parts > 0)
     threadpool_map(&s->e->threadpool, space_first_init_parts_mapper, s->parts,
-                   s->nr_parts, sizeof(struct part), 0, s);
+                   s->nr_parts, sizeof(struct part), threadpool_auto_chunk_size,
+                   s);
 
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -4381,7 +4405,8 @@ void space_first_init_gparts(struct space *s, int verbose) {
   const ticks tic = getticks();
   if (s->nr_gparts > 0)
     threadpool_map(&s->e->threadpool, space_first_init_gparts_mapper, s->gparts,
-                   s->nr_gparts, sizeof(struct gpart), 0, s);
+                   s->nr_gparts, sizeof(struct gpart),
+                   threadpool_auto_chunk_size, s);
 
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -4473,7 +4498,8 @@ void space_first_init_sparts(struct space *s, int verbose) {
   const ticks tic = getticks();
   if (s->nr_sparts > 0)
     threadpool_map(&s->e->threadpool, space_first_init_sparts_mapper, s->sparts,
-                   s->nr_sparts, sizeof(struct spart), 0, s);
+                   s->nr_sparts, sizeof(struct spart),
+                   threadpool_auto_chunk_size, s);
 
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -4555,7 +4581,8 @@ void space_first_init_bparts(struct space *s, int verbose) {
   const ticks tic = getticks();
   if (s->nr_bparts > 0)
     threadpool_map(&s->e->threadpool, space_first_init_bparts_mapper, s->bparts,
-                   s->nr_bparts, sizeof(struct bpart), 0, s);
+                   s->nr_bparts, sizeof(struct bpart),
+                   threadpool_auto_chunk_size, s);
 
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -4597,7 +4624,8 @@ void space_init_parts(struct space *s, int verbose) {
 
   if (s->nr_parts > 0)
     threadpool_map(&s->e->threadpool, space_init_parts_mapper, s->parts,
-                   s->nr_parts, sizeof(struct part), 0, s->e);
+                   s->nr_parts, sizeof(struct part), threadpool_auto_chunk_size,
+                   s->e);
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
@@ -4623,7 +4651,8 @@ void space_init_gparts(struct space *s, int verbose) {
 
   if (s->nr_gparts > 0)
     threadpool_map(&s->e->threadpool, space_init_gparts_mapper, s->gparts,
-                   s->nr_gparts, sizeof(struct gpart), 0, NULL);
+                   s->nr_gparts, sizeof(struct gpart),
+                   threadpool_auto_chunk_size, /*extra_data=*/NULL);
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
@@ -4649,7 +4678,8 @@ void space_init_sparts(struct space *s, int verbose) {
 
   if (s->nr_sparts > 0)
     threadpool_map(&s->e->threadpool, space_init_sparts_mapper, s->sparts,
-                   s->nr_sparts, sizeof(struct spart), 0, NULL);
+                   s->nr_sparts, sizeof(struct spart),
+                   threadpool_auto_chunk_size, /*extra_data=*/NULL);
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
@@ -4675,7 +4705,8 @@ void space_init_bparts(struct space *s, int verbose) {
 
   if (s->nr_bparts > 0)
     threadpool_map(&s->e->threadpool, space_init_bparts_mapper, s->bparts,
-                   s->nr_bparts, sizeof(struct bpart), 0, NULL);
+                   s->nr_bparts, sizeof(struct bpart),
+                   threadpool_auto_chunk_size, /*extra_data=*/NULL);
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
@@ -4710,7 +4741,8 @@ void space_convert_quantities(struct space *s, int verbose) {
 
   if (s->nr_parts > 0)
     threadpool_map(&s->e->threadpool, space_convert_quantities_mapper, s->parts,
-                   s->nr_parts, sizeof(struct part), 0, s);
+                   s->nr_parts, sizeof(struct part), threadpool_auto_chunk_size,
+                   s);
 
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -5599,10 +5631,12 @@ void space_check_swallow(struct space *s) {
 #ifdef SWIFT_DEBUG_CHECKS
 
   threadpool_map(&s->e->threadpool, space_check_part_swallow_mapper, s->parts,
-                 s->nr_parts, sizeof(struct part), 0, NULL);
+                 s->nr_parts, sizeof(struct part), threadpool_auto_chunk_size,
+                 /*extra_data=*/NULL);
 
   threadpool_map(&s->e->threadpool, space_check_bpart_swallow_mapper, s->bparts,
-                 s->nr_bparts, sizeof(struct bpart), 0, NULL);
+                 s->nr_bparts, sizeof(struct bpart), threadpool_auto_chunk_size,
+                 /*extra_data=*/NULL);
 #else
   error("Calling debugging code without debugging flag activated.");
 #endif
@@ -5874,7 +5908,10 @@ void space_struct_restore(struct space *s, FILE *stream) {
                         NULL, "bparts");
   }
 
-  /* Need to reconnect the gravity parts to their hydro and stars particles. */
+  /* Need to reconnect the gravity parts to their hydro, star and BH particles.
+   * Note that we can't use the threadpool here as we have not restored it yet.
+   */
+
   /* Re-link the parts. */
   if (s->nr_parts > 0 && s->nr_gparts > 0)
     part_relink_parts_to_gparts(s->gparts, s->nr_gparts, s->parts);
diff --git a/src/space.h b/src/space.h
index 5ce9a3429939edec57def1e551d7db09bedcbcf5..885f9ba85e2249976236a108c4357ec9815f43bf 100644
--- a/src/space.h
+++ b/src/space.h
@@ -379,7 +379,7 @@ void space_reset_task_counters(struct space *s);
 void space_clean(struct space *s);
 void space_free_cells(struct space *s);
 
-void space_free_foreign_parts(struct space *s);
+void space_free_foreign_parts(struct space *s, const int clear_cell_pointers);
 
 void space_struct_dump(struct space *s, FILE *stream);
 void space_struct_restore(struct space *s, FILE *stream);
diff --git a/src/stars/Default/stars_io.h b/src/stars/Default/stars_io.h
index ebabc3e0f555a8055f67b402674945e28dae6747..88444726329bad9baf797aaa8437c48d4fbc1516 100644
--- a/src/stars/Default/stars_io.h
+++ b/src/stars/Default/stars_io.h
@@ -125,8 +125,9 @@ INLINE static void stars_write_particles(const struct spart *sparts,
   list[2] = io_make_output_field("Masses", FLOAT, 1, UNIT_CONV_MASS, 0.f,
                                  sparts, mass, "Masses of the particles");
 
-  list[3] = io_make_output_field("ParticleIDs", LONGLONG, 1, UNIT_CONV_NO_UNITS,
-                                 0.f, sparts, id, "Unique ID of the particles");
+  list[3] =
+      io_make_output_field("ParticleIDs", ULONGLONG, 1, UNIT_CONV_NO_UNITS, 0.f,
+                           sparts, id, "Unique ID of the particles");
 
   list[4] = io_make_output_field(
       "SmoothingLengths", FLOAT, 1, UNIT_CONV_LENGTH, 1.f, sparts, h,
diff --git a/src/stars/EAGLE/stars_io.h b/src/stars/EAGLE/stars_io.h
index a7fd91f51355245ec9164df9fd1aa9366ff6e917..1669a8b45b366a6682b8d73fc0056d1f0ddc53ec 100644
--- a/src/stars/EAGLE/stars_io.h
+++ b/src/stars/EAGLE/stars_io.h
@@ -132,8 +132,9 @@ INLINE static void stars_write_particles(const struct spart *sparts,
                                  "Masses of the particles at the current point "
                                  "in time (i.e. after stellar losses");
 
-  list[3] = io_make_output_field("ParticleIDs", LONGLONG, 1, UNIT_CONV_NO_UNITS,
-                                 0.f, sparts, id, "Unique ID of the particles");
+  list[3] =
+      io_make_output_field("ParticleIDs", ULONGLONG, 1, UNIT_CONV_NO_UNITS, 0.f,
+                           sparts, id, "Unique ID of the particles");
 
   list[4] = io_make_output_field(
       "SmoothingLengths", FLOAT, 1, UNIT_CONV_LENGTH, 1.f, sparts, h,
diff --git a/src/statistics.c b/src/statistics.c
index 11b64acecb79d9466176ef91e4b9cd70e2ebb743..3a2e2831a1e2b436ae072a285259128740dbb5e5 100644
--- a/src/statistics.c
+++ b/src/statistics.c
@@ -325,12 +325,14 @@ void stats_collect(const struct space *s, struct statistics *stats) {
   /* Run parallel collection of statistics for parts */
   if (s->nr_parts > 0)
     threadpool_map(&s->e->threadpool, stats_collect_part_mapper, s->parts,
-                   s->nr_parts, sizeof(struct part), 0, &extra_data);
+                   s->nr_parts, sizeof(struct part), threadpool_auto_chunk_size,
+                   &extra_data);
 
   /* Run parallel collection of statistics for gparts */
   if (s->nr_gparts > 0)
     threadpool_map(&s->e->threadpool, stats_collect_gpart_mapper, s->gparts,
-                   s->nr_gparts, sizeof(struct gpart), 0, &extra_data);
+                   s->nr_gparts, sizeof(struct gpart),
+                   threadpool_auto_chunk_size, &extra_data);
 }
 
 /**
diff --git a/src/swift.h b/src/swift.h
index d8221080b1179b8bddc5441cdd4ae19a3fca5b74..c7a5f8d5acbcc17dd467cbcfb914eac025894dae 100644
--- a/src/swift.h
+++ b/src/swift.h
@@ -43,6 +43,7 @@
 #include "error.h"
 #include "feedback.h"
 #include "feedback_properties.h"
+#include "fof.h"
 #include "gravity.h"
 #include "gravity_derivatives.h"
 #include "gravity_properties.h"
diff --git a/src/swift_velociraptor_part.h b/src/swift_velociraptor_part.h
index 700842ac5a13e5bee4af15cc0d8726fc668ce421..d499cedebf98b06df14e259882b286b859bd7d04 100644
--- a/src/swift_velociraptor_part.h
+++ b/src/swift_velociraptor_part.h
@@ -39,8 +39,10 @@ struct swift_vel_part {
   /*! Particle velocity. */
   float v[3];
 
+#ifndef HAVE_VELOCIRAPTOR_WITH_NOMASS
   /*! Particle mass. */
   float mass;
+#endif
 
   /*! Gravitational potential */
   float potential;
diff --git a/src/task.c b/src/task.c
index 1ba295588769c70ad48a4f713a8ed96c3defbe01..30f6a9621916615d9aa6e9e315e30d222acd8f84 100644
--- a/src/task.c
+++ b/src/task.c
@@ -905,7 +905,7 @@ void task_free_mpi_comms(void) {
  *
  * Dumps the information to a file "thread_info-stepn.dat" where n is the
  * given step value, or "thread_info_MPI-stepn.dat", if we are running
- * under MPI. Note if running under MPIU all the ranks are dumped into this
+ * under MPI. Note if running under MPI all the ranks are dumped into this
  * one file, which has an additional field to identify the rank.
  *
  * @param e the #engine
@@ -1172,3 +1172,66 @@ void task_dump_stats(const char *dumpfile, struct engine *e, int header,
   }
 #endif
 }
+
+/**
+ * @brief dump all the active tasks of all the known engines into files.
+ *
+ * Dumps the information into file "task_dump-stepn.dat" where n is the given
+ * step value, or files "task_dump_MPI-stepn.dat_rank", if we are running
+ * under MPI. Note if running under MPI all the ranks are dumped into separate
+ * files to avoid interaction with other MPI calls that may be blocking at the
+ * time. Very similar to task_dump_all() except for the additional fields used
+ * in task debugging and we record tasks that have not ran (i.e !skip, but toc
+ * == 0) and how many waits are still active.
+ *
+ * @param e the #engine
+ */
+void task_dump_active(struct engine *e) {
+
+  /* Need this to convert ticks to seconds. */
+  unsigned long long cpufreq = clocks_get_cpufreq();
+  char dumpfile[35];
+
+#ifdef WITH_MPI
+  snprintf(dumpfile, sizeof(dumpfile), "task_dump_MPI-step%d.dat_%d", e->step,
+           e->nodeID);
+#else
+  snprintf(dumpfile, sizeof(dumpfile), "task_dump-step%d.dat", e->step);
+#endif
+
+  FILE *file_thread = fopen(dumpfile, "w");
+  fprintf(file_thread,
+          "# rank otherrank type subtype waits pair tic toc"
+          " ci.hydro.count cj.hydro.count ci.grav.count cj.grav.count"
+          " flags\n");
+
+  /* Add some information to help with the plots and conversion of ticks to
+   * seconds. */
+  fprintf(file_thread, "%i 0 none none -1 0 %lld %lld %lld %lld %lld 0 %lld\n",
+          engine_rank, (long long int)e->tic_step, (long long int)e->toc_step,
+          e->updates, e->g_updates, e->s_updates, cpufreq);
+  int count = 0;
+  for (int l = 0; l < e->sched.nr_tasks; l++) {
+    struct task *t = &e->sched.tasks[l];
+
+    /* Not implicit and not skipped. */
+    if (!t->implicit && !t->skip) {
+
+      /* Get destination rank of MPI requests. */
+      int paired = (t->cj != NULL);
+      int otherrank = t->ci->nodeID;
+      if (paired) otherrank = t->cj->nodeID;
+
+      fprintf(file_thread, "%i %i %s %s %i %i %lli %lli %i %i %i %i %lli\n",
+              engine_rank, otherrank, taskID_names[t->type],
+              subtaskID_names[t->subtype], t->wait, paired,
+              (long long int)t->tic, (long long int)t->toc,
+              (t->ci != NULL) ? t->ci->hydro.count : 0,
+              (t->cj != NULL) ? t->cj->hydro.count : 0,
+              (t->ci != NULL) ? t->ci->grav.count : 0,
+              (t->cj != NULL) ? t->cj->grav.count : 0, t->flags);
+    }
+    count++;
+  }
+  fclose(file_thread);
+}
diff --git a/src/task.h b/src/task.h
index d3024d4d5dfc93e9dcdee2346127265b102ea9c7..02e1c5ca385749c3dff347b44d8e1f0e984fadd0 100644
--- a/src/task.h
+++ b/src/task.h
@@ -239,6 +239,7 @@ void task_print(const struct task *t);
 void task_dump_all(struct engine *e, int step);
 void task_dump_stats(const char *dumpfile, struct engine *e, int header,
                      int allranks);
+void task_dump_active(struct engine *e);
 void task_get_full_name(int type, int subtype, char *name);
 void task_get_group_name(int type, int subtype, char *cluster);
 
diff --git a/src/threadpool.c b/src/threadpool.c
index 0ef69f9895d3d127a330b105ec385d97975fc82f..c4b6698e9eeb9a657ad66e7a1a7566626b9dc875 100644
--- a/src/threadpool.c
+++ b/src/threadpool.c
@@ -252,7 +252,7 @@ void threadpool_init(struct threadpool *tp, int num_threads) {
  * @param N Number of elements in @c map_data.
  * @param stride Size, in bytes, of each element of @c map_data.
  * @param chunk Number of map data elements to pass to the function at a time,
- *        or zero to choose the number automatically.
+ *        or #threadpool_auto_chunk_size to choose the number automatically.
  * @param extra_data Addtitional pointer that will be passed to the mapping
  *        function, may contain additional data.
  */
@@ -279,9 +279,10 @@ void threadpool_map(struct threadpool *tp, threadpool_map_function map_function,
   tp->map_data_size = N;
   tp->map_data_count = 0;
   tp->map_data_chunk =
-      chunk ? chunk
-            : max((int)(N / (tp->num_threads * threadpool_default_chunk_ratio)),
-                  1);
+      (chunk == threadpool_auto_chunk_size)
+          ? max((int)(N / (tp->num_threads * threadpool_default_chunk_ratio)),
+                1)
+          : chunk;
   tp->map_function = map_function;
   tp->map_data = map_data;
   tp->map_extra_data = extra_data;
diff --git a/src/threadpool.h b/src/threadpool.h
index 9d19b56836330f59093933bbed5900727142f4a2..a4964ffcbb387a1e3930c3ba172cd783dd6e8123 100644
--- a/src/threadpool.h
+++ b/src/threadpool.h
@@ -32,6 +32,7 @@
 /* Local defines. */
 #define threadpool_log_initial_size 1000
 #define threadpool_default_chunk_ratio 7
+#define threadpool_auto_chunk_size 0
 
 /* Function type for mappings. */
 typedef void (*threadpool_map_function)(void *map_data, int num_elements,
diff --git a/src/velociraptor_interface.c b/src/velociraptor_interface.c
index 6f6f76bc12ea9a78fd661fe8664c054e500c2576..adfe49366d60d3c5796865beb7d51467b831e1ae 100644
--- a/src/velociraptor_interface.c
+++ b/src/velociraptor_interface.c
@@ -35,12 +35,16 @@
 #include "engine.h"
 #include "hydro.h"
 #include "swift_velociraptor_part.h"
+#include "threadpool.h"
 #include "velociraptor_struct.h"
 
 #ifdef HAVE_VELOCIRAPTOR
 
 /**
  * @brief Structure for passing cosmological information to VELOCIraptor.
+ *
+ * This should match the structure cosmoinfo in the file src/swiftinterface.h
+ * in the VELOCIraptor code.
  */
 struct cosmoinfo {
 
@@ -77,6 +81,9 @@ struct cosmoinfo {
 
 /**
  * @brief Structure for passing unit information to VELOCIraptor.
+ *
+ * This should match the structure unitinfo in the file src/swiftinterface.h
+ * in the VELOCIraptor code.
  */
 struct unitinfo {
 
@@ -111,6 +118,9 @@ struct cell_loc {
 /**
  * @brief Structure for passing simulation information to VELOCIraptor for a
  * given call.
+ *
+ * This should match the structure siminfo in the file src/swiftinterface.h
+ * in the VELOCIraptor code.
  */
 struct siminfo {
 
@@ -129,6 +139,9 @@ struct siminfo {
   /*! Number of top-level cells. */
   int numcells;
 
+  /*! Number of top-level cells. */
+  int numcellsperdim;
+
   /*! Locations of top-level cells. */
   struct cell_loc *cell_loc;
 
@@ -161,6 +174,11 @@ struct siminfo {
 
   /*! Do we have other particles? */
   int iother;
+
+#ifdef HAVE_VELOCIRAPTOR_WITH_NOMASS
+  /*! Mass of the DM particles */
+  double mass_uniform_box;
+#endif
 };
 
 /**
@@ -228,10 +246,14 @@ void velociraptor_convert_particles_mapper(void *map_data, int nr_gparts,
   const struct cooling_function_data *cool_func = e->cooling_func;
 
   const float a_inv = e->cosmology->a_inv;
+  const int periodic = s->periodic;
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
+  const double pos_dithering[3] = {s->pos_dithering[0], s->pos_dithering[1],
+                                   s->pos_dithering[2]};
 
   /* Convert particle properties into VELOCIraptor units.
    * VELOCIraptor wants:
-   * - Co-moving positions,
+   * - Un-dithered co-moving positions,
    * - Peculiar velocities,
    * - Co-moving potential,
    * - Physical internal energy (for the gas),
@@ -239,15 +261,27 @@ void velociraptor_convert_particles_mapper(void *map_data, int nr_gparts,
    */
   for (int i = 0; i < nr_gparts; i++) {
 
-    swift_parts[i].x[0] = gparts[i].x[0];
-    swift_parts[i].x[1] = gparts[i].x[1];
-    swift_parts[i].x[2] = gparts[i].x[2];
+    if (periodic) {
+      swift_parts[i].x[0] =
+          box_wrap(gparts[i].x[0] - pos_dithering[0], 0.0, dim[0]);
+      swift_parts[i].x[1] =
+          box_wrap(gparts[i].x[1] - pos_dithering[1], 0.0, dim[1]);
+      swift_parts[i].x[2] =
+          box_wrap(gparts[i].x[2] - pos_dithering[2], 0.0, dim[2]);
+    } else {
+      swift_parts[i].x[0] = gparts[i].x[0];
+      swift_parts[i].x[1] = gparts[i].x[1];
+      swift_parts[i].x[2] = gparts[i].x[2];
+    }
 
     swift_parts[i].v[0] = gparts[i].v_full[0] * a_inv;
     swift_parts[i].v[1] = gparts[i].v_full[1] * a_inv;
     swift_parts[i].v[2] = gparts[i].v_full[2] * a_inv;
 
+#ifndef HAVE_VELOCIRAPTOR_WITH_NOMASS
     swift_parts[i].mass = gravity_get_mass(&gparts[i]);
+#endif
+
     swift_parts[i].potential = gravity_get_comoving_potential(&gparts[i]);
 
     swift_parts[i].type = gparts[i].type;
@@ -408,6 +442,12 @@ void velociraptor_invoke(struct engine *e, const int linked_with_snap) {
   const int nr_cells = s->nr_cells;
   const struct cell *cells_top = s->cells_top;
 
+  /* Start by freeing some of the unnecessary memory to give VR some breathing
+     space */
+#ifdef WITH_MPI
+  space_free_foreign_parts(e->s, /*clear_cell_pointers=*/1);
+#endif
+
   /* Allow thread to run on any core for the duration of the call to
    * VELOCIraptor so that  when OpenMP threads are spawned
    * they can run on any core on the processor. */
@@ -509,6 +549,20 @@ void velociraptor_invoke(struct engine *e, const int linked_with_snap) {
     sim_info.interparticlespacing = -1.;
   }
 
+#ifdef HAVE_VELOCIRAPTOR_WITH_NOMASS
+  /* Assume all particles have the same mass */
+  double DM_mass = 0.;
+  for (size_t i = 0; i < e->s->nr_gparts; ++i) {
+    const struct gpart *gp = &e->s->gparts[i];
+    if (gp->time_bin != time_bin_inhibited &&
+        gp->time_bin != time_bin_not_created) {
+      DM_mass = gp->mass;
+      break;
+    }
+  }
+  sim_info.mass_uniform_box = DM_mass;
+#endif
+
   /* Set the spatial extent of the simulation volume */
   sim_info.spacedimension[0] = s->dim[0];
   sim_info.spacedimension[1] = s->dim[1];
@@ -516,6 +570,9 @@ void velociraptor_invoke(struct engine *e, const int linked_with_snap) {
 
   /* Store number of top-level cells */
   sim_info.numcells = s->nr_cells;
+  sim_info.numcellsperdim = s->cdim[0]; /* We assume a cubic box! */
+  if (s->cdim[0] != s->cdim[1] || s->cdim[0] != s->cdim[2])
+    error("Trying to run VR on a non-cubic number of top-level cells");
 
   /* Size and inverse size of the top-level cells in VELOCIraptor units */
   sim_info.cellwidth[0] = s->cells_top[0].width[0];
@@ -539,9 +596,18 @@ void velociraptor_invoke(struct engine *e, const int linked_with_snap) {
   for (int i = 0; i < s->nr_cells; i++) {
     cell_node_ids[i] = cells_top[i].nodeID;
 
-    sim_info.cell_loc[i].loc[0] = cells_top[i].loc[0];
-    sim_info.cell_loc[i].loc[1] = cells_top[i].loc[1];
-    sim_info.cell_loc[i].loc[2] = cells_top[i].loc[2];
+    if (s->periodic) {
+      sim_info.cell_loc[i].loc[0] =
+          box_wrap(cells_top[i].loc[0] - s->pos_dithering[0], 0.0, s->dim[0]);
+      sim_info.cell_loc[i].loc[1] =
+          box_wrap(cells_top[i].loc[1] - s->pos_dithering[1], 0.0, s->dim[1]);
+      sim_info.cell_loc[i].loc[2] =
+          box_wrap(cells_top[i].loc[2] - s->pos_dithering[2], 0.0, s->dim[2]);
+    } else {
+      sim_info.cell_loc[i].loc[0] = cells_top[i].loc[0];
+      sim_info.cell_loc[i].loc[1] = cells_top[i].loc[1];
+      sim_info.cell_loc[i].loc[2] = cells_top[i].loc[2];
+    }
   }
 
   if (e->verbose) {
@@ -628,7 +694,8 @@ void velociraptor_invoke(struct engine *e, const int linked_with_snap) {
 
   struct velociraptor_copy_data copy_data = {e, swift_parts};
   threadpool_map(&e->threadpool, velociraptor_convert_particles_mapper,
-                 s->gparts, nr_gparts, sizeof(struct gpart), 0, &copy_data);
+                 s->gparts, nr_gparts, sizeof(struct gpart),
+                 threadpool_auto_chunk_size, &copy_data);
 
   /* Report timing */
   if (e->verbose)
@@ -697,6 +764,12 @@ void velociraptor_invoke(struct engine *e, const int linked_with_snap) {
   /* Record we have ran stf this timestep */
   e->stf_this_timestep = 1;
 
+  /* Reallocate the memory that was freed earlier */
+#ifdef WITH_MPI
+
+  engine_allocate_foreign_particles(e);
+#endif
+
 #else
   error("SWIFT not configure to run with VELOCIraptor.");
 #endif /* HAVE_VELOCIRAPTOR */
diff --git a/tools/analyse_runtime.py b/tools/analyse_runtime.py
index 6d6c61be463bbc398c612cc9838c0453587e6193..52caaef25d0184a4d46f105d619d31790f2d74c0 100755
--- a/tools/analyse_runtime.py
+++ b/tools/analyse_runtime.py
@@ -56,7 +56,7 @@ num_files = len(sys.argv) - 1
 labels = [
     ["engine_split_gas_particles:", 1],
     ["Gpart assignment", 1],
-    ["Mesh comunication", 1],
+    ["Mesh communication", 1],
     ["Forward Fourier transform", 1],
     ["Green function", 1],
     ["Backwards Fourier transform", 1],
diff --git a/tools/check_mpireports.py b/tools/check_mpireports.py
new file mode 100755
index 0000000000000000000000000000000000000000..ce555751431c087840b1a4c96b58d6f5a1c213c9
--- /dev/null
+++ b/tools/check_mpireports.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+"""
+Usage:
+    check_mpireportx.py [options] mpi-report...
+
+Check any mpi reports from a step for any requests which are not fulfilled,
+that any sends or receives that are not matched to a successful completion.
+Note that a report with a final sum of memory equal to zero will not have any
+of these!
+
+This file is part of SWIFT.
+
+Copyright (C) 2020 Peter W. Draper (p.w.draper@durham.ac.uk)
+All Rights Reserved.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published
+by the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+import argparse
+
+#  Handle the command line.
+parser = argparse.ArgumentParser(description="Check MPI report")
+
+parser.add_argument("input",
+                    nargs="+",
+                    metavar="mpi-report",
+                    help="MPI report")
+parser.add_argument(
+    "-v",
+    "--verbose",
+    dest="verbose",
+    help="Verbose output",
+    default=False,
+    action="store_true",
+)
+args = parser.parse_args()
+infiles = args.input
+
+#  Indices for words in a line.
+sticcol=0
+eticcol=1
+dticcol=2
+stepcol=3
+rankcol=4
+otherrankcol=5
+typecol=6
+itypecol=7
+subtypecol=8
+isubtypecol=9
+activationcol=10
+tagcol=11
+sizecol=12
+sum=13
+
+#  Keyed lines.
+isends = {}
+irecvs = {}
+esends = {}
+erecvs = {}
+
+#  Gather keys from input file. We created dicts with matchable keys
+#  for the sends and recvs, when they start and complete.
+for f in infiles:
+    if args.verbose:
+        print "Processing: " + f
+    with open(f, "r") as fp:
+        for line in fp:
+            if line[0] == '#':
+                continue
+            words = line.split()
+            if words[activationcol] == "1":
+                key = words[otherrankcol] + "/" + \
+                      words[rankcol] + "/" + \
+                      words[subtypecol] + "/" + \
+                      words[tagcol] + "/" + \
+                      words[sizecol]
+                if words[typecol] == "send":
+                    isends[key] = [line[:-1]]
+                else:
+                    irecvs[key] = [line[:-1]]
+                    
+            else:
+                # Size will be negative.
+                key = words[otherrankcol] + "/" + \
+                      words[rankcol] + "/" + \
+                      words[subtypecol] + "/" + \
+                      words[tagcol] + "/" + \
+                      words[sizecol][1:]
+                if words[typecol] == "send":
+                    esends[key] = [line[:-1]]
+                else:
+                    erecvs[key] = [line[:-1]]
+
+#  Now report any uncompleted sends or receives.
+print "# stic etic dtic step rank otherrank type itype subtype isubtype activation tag size sum"
+for key in isends:
+    if not key in esends:
+        print isends[key][0]
+for key in irecvs:
+    if not key in erecvs:
+        print irecvs[key][0]
+
+sys.exit(0)
diff --git a/tools/task_plots/analyse_tasks.py b/tools/task_plots/analyse_tasks.py
index c093d18c2dba6bed0a22e9f7279285b45817445d..4a87ebad3cf13bdedd39e329509f003e858a0a1b 100755
--- a/tools/task_plots/analyse_tasks.py
+++ b/tools/task_plots/analyse_tasks.py
@@ -93,6 +93,7 @@ TASKTYPES = [
     "kick2",
     "timestep",
     "timestep_limiter",
+    "timestep_sync",
     "send",
     "recv",
     "grav_long_range",
@@ -116,6 +117,9 @@ TASKTYPES = [
     "bh_in",
     "bh_out",
     "bh_ghost",
+    "bh_swallow_ghost1",
+    "bh_swallow_ghost2",
+    "bh_swallow_ghost3",
     "fof_self",
     "fof_pair",
     "count",
@@ -135,16 +139,21 @@ SUBTYPES = [
     "tend_bpart",
     "xv",
     "rho",
+    "part_swallow",
+    "bpart_merger",
     "gpart",
     "multipole",
     "spart",
     "stars_density",
     "stars_feedback",
-    "sf_counts"
-    "bpart",
+    "sf_counts",
+    "bpart_rho",
+    "bpart_swallow",
+    "bpart_feedback",
     "bh_density",
     "bh_swallow",
-    "do_swallow",
+    "do_gas_swallow",
+    "do_bh_swallow",
     "bh_feedback",
     "count",
 ]
diff --git a/tools/task_plots/plot_tasks.py b/tools/task_plots/plot_tasks.py
index 64d5529cc19499d3e4ee7d521e5afcbccb38c019..b0b9f0e511dbdeff42a6c1885740593f8d7dbc42 100755
--- a/tools/task_plots/plot_tasks.py
+++ b/tools/task_plots/plot_tasks.py
@@ -170,6 +170,7 @@ TASKTYPES = [
     "kick2",
     "timestep",
     "timestep_limiter",
+    "timestep_sync",
     "send",
     "recv",
     "grav_long_range",
@@ -193,6 +194,9 @@ TASKTYPES = [
     "bh_in",
     "bh_out",
     "bh_ghost",
+    "bh_swallow_ghost1",
+    "bh_swallow_ghost2",
+    "bh_swallow_ghost3",
     "fof_self",
     "fof_pair",
     "count",
@@ -212,16 +216,21 @@ SUBTYPES = [
     "tend_bpart",
     "xv",
     "rho",
+    "part_swallow",
+    "bpart_merger",
     "gpart",
     "multipole",
     "spart",
     "stars_density",
     "stars_feedback",
     "sf_counts",
-    "bpart",
+    "bpart_rho",
+    "bpart_swallow",
+    "bpart_feedback",
     "bh_density",
     "bh_swallow",
-    "do_swallow",
+    "do_gas_swallow",
+    "do_bh_swallow",
     "bh_feedback",
     "count",
 ]