diff --git a/.gitignore b/.gitignore
index 812aa06dd54ee8bb81e87796deb8ee05160fec72..893f786506ed168565fb995812136bcb7f1305f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,11 +26,13 @@ examples/*/*.xmf
 examples/*/*.hdf5
 examples/*/*.png
 examples/*/*.txt
+examples/*/*.dot
 examples/*/used_parameters.yml
 examples/*/*/*.xmf
 examples/*/*/*.hdf5
 examples/*/*/*.png
 examples/*/*/*.txt
+examples/*/*/*.dot
 examples/*/*/used_parameters.yml
 examples/*/gravity_checks_*.dat
 
diff --git a/INSTALL.swift b/INSTALL.swift
index 90096bc1b88d34ab86c04f623f1d3f04ca5a2997..fd960200117972d72ec5144250c51fa5126f09f4 100644
--- a/INSTALL.swift
+++ b/INSTALL.swift
@@ -75,6 +75,15 @@ also be switched off for benchmarking purposes. To do so, you can use:
 
     ./configure --disable-vec
 
+Please note that to build SWIFT on MacOS, you will need to configure
+using
+
+    ./configure --disable-compiler-warnings
+
+due to the incorrect behaviour of the LLVM compiler on this platform
+that raises warnings when the pthread flags are passed to the linker.
+
+
 
                                  Dependencies
                                  ============
diff --git a/configure.ac b/configure.ac
index c0bb9e7a2b6fae10e998a76a5cf833575a3283ec..d95290309e285d9bb1fa212954c9a770071e886f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -379,6 +379,19 @@ AX_PTHREAD([LIBS="$PTHREAD_LIBS $LIBS" CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
     or use CPPFLAGS and LDFLAGS if the library is installed in a
     non-standard location.]))
 
+# Check whether POSIX thread barriers are implemented (e.g. OSX does not have them)
+have_pthread_barrier="no"
+AC_CHECK_LIB(pthread, pthread_barrier_init,
+	     have_pthread_barrier="yes",
+	     AC_MSG_WARN(POSIX implementation does not have barriers. SWIFT will use home-made ones.))
+if test "x$have_pthread_barrier" = "xyes"; then
+  AC_DEFINE([HAVE_PTHREAD_BARRIERS], [1], [The posix library implements barriers])
+fi
+
+# Check whether POSIX file allocation functions exist (e.g. OSX does not have them)
+AC_CHECK_LIB(pthread, posix_fallocate,
+	     AC_DEFINE([HAVE_POSIX_FALLOCATE], [1], [The posix library implements file allocation functions.]),
+	     AC_MSG_WARN(POSIX implementation does not have file allocation functions.))
 
 # Check for METIS. Note AX_LIB_METIS exists, but cannot be configured
 # to be default off (i.e. given no option it tries to locate METIS), so we
@@ -548,6 +561,10 @@ if test "$with_hdf5" = "yes"; then
 fi
 AM_CONDITIONAL([HAVEPARALLELHDF5],[test "$have_parallel_hdf5" = "yes"])
 
+# Check for floating-point exceptions
+AC_CHECK_FUNC(feenableexcept, AC_DEFINE([HAVE_FE_ENABLE_EXCEPT],[1],
+    [Defined if the floating-point exception can be enabled using non-standard GNU functions.]))
+
 # Check for setaffinity.
 AC_CHECK_FUNC(pthread_setaffinity_np, AC_DEFINE([HAVE_SETAFFINITY],[1],
     [Defined if pthread_setaffinity_np exists.]) )
@@ -914,19 +931,20 @@ AC_MSG_RESULT([
 
    $PACKAGE_NAME v.$PACKAGE_VERSION
 
-   Compiler        : $CC
-    - vendor       : $ax_cv_c_compiler_vendor
-    - version      : $ax_cv_c_compiler_version
-    - flags        : $CFLAGS
-   MPI enabled     : $enable_mpi
-   HDF5 enabled    : $with_hdf5
-    - parallel     : $have_parallel_hdf5
-   Metis enabled   : $have_metis
-   FFTW3 enabled   : $have_fftw3
-   libNUMA enabled : $have_numa
-   Using tcmalloc  : $have_tcmalloc
-   Using jemalloc  : $have_jemalloc
-   CPU profiler    : $have_profiler
+   Compiler         : $CC
+    - vendor        : $ax_cv_c_compiler_vendor
+    - version       : $ax_cv_c_compiler_version
+    - flags         : $CFLAGS
+   MPI enabled      : $enable_mpi
+   HDF5 enabled     : $with_hdf5
+    - parallel      : $have_parallel_hdf5
+   Metis enabled    : $have_metis
+   FFTW3 enabled    : $have_fftw3
+   libNUMA enabled  : $have_numa
+   Using tcmalloc   : $have_tcmalloc
+   Using jemalloc   : $have_jemalloc
+   CPU profiler     : $have_profiler
+   Pthread barriers : $have_pthread_barrier
 
    Hydro scheme       : $with_hydro
    Dimensionality     : $with_dimension
diff --git a/examples/DiscPatch/HydroStatic/disc-patch-icc.yml b/examples/DiscPatch/HydroStatic/disc-patch-icc.yml
index 6f17cfbb1e0125faf8e47fe4e9e55bfdf4df7b71..2859599c03c44ff6a15522164a8e2a407bdde41e 100644
--- a/examples/DiscPatch/HydroStatic/disc-patch-icc.yml
+++ b/examples/DiscPatch/HydroStatic/disc-patch-icc.yml
@@ -31,6 +31,9 @@ SPH:
   max_ghost_iterations:  30       # Maximal number of iterations allowed to converge towards the smoothing length.
   h_max:                 60.      # Maximal smoothing length allowed (in internal units).
 
+EoS:
+  isothermal_internal_energy: 20.267845
+
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  Disc-Patch.hdf5       # The file to read
diff --git a/examples/DiscPatch/HydroStatic/makeIC.py b/examples/DiscPatch/HydroStatic/makeIC.py
index 11b482059b494fc9a6b9447fdfe2e7ec543d52ff..83b07e50d59869c51fd3854a7f15cfd7d476d04d 100644
--- a/examples/DiscPatch/HydroStatic/makeIC.py
+++ b/examples/DiscPatch/HydroStatic/makeIC.py
@@ -202,7 +202,5 @@ print "--- Runtime parameters (YAML file): ---"
 print "DiscPatchPotential:surface_density:    ", surface_density
 print "DiscPatchPotential:scale_height:       ", scale_height
 print "DiscPatchPotential:x_disc:             ", 0.5 * boxSize_x
+print "EoS:isothermal_internal_energy: %ef"%u_therm
 print ""
-
-print "--- Constant parameters: ---"
-print "const_isothermal_internal_energy: %ef"%u_therm
diff --git a/examples/analyse_dump_cells.py b/examples/analyse_dump_cells.py
new file mode 100755
index 0000000000000000000000000000000000000000..3140e799566c75fe494a75895db0f4a8dcff4e57
--- /dev/null
+++ b/examples/analyse_dump_cells.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+"""
+Usage:
+    analyse_dump_cells.py nx ny nz cell<1>.dat cell<2>.dat ...
+
+Analyses a number of output files created by calls to the dumpCells() debug
+function (presumably called in engine_step()) to output a list of active
+top-level cells, identifying those that are on the edges of the volumes being
+processed on by various nodes. The point is that these should be minimised to
+reduce the MPI communications.
+
+The nx, ny and nz arguments are the number of cells in the complete space,
+we need these so that we can wrap around the edge of space correctly.
+
+This file is part of SWIFT.
+Copyright (c) 2017 Peter W. Draper (p.w.draper@durham.ac.uk)
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published
+by the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+import pylab as pl
+import numpy as np
+import sys
+import pandas
+
+xcol = 0
+ycol = 1
+zcol = 2
+xwcol = 3
+ywcol = 4
+zwcol = 5
+localcol = 18
+supercol = 15
+activecol = 16
+
+#  Command-line arguments.
+if len(sys.argv) < 5:
+    print "usage: ", sys.argv[0], " nx ny nz cell1.dat cell2.dat ..."
+    sys.exit(1)
+nx = int(sys.argv[1])
+ny = int(sys.argv[2])
+nz = int(sys.argv[3])
+
+print "# x y z onedge"
+allactives = []
+onedge = 0
+tcount = 0
+for i in range(4, len(sys.argv)):
+
+    #  Read the file.
+    data = pl.loadtxt(sys.argv[i])
+    #print data
+
+    #  Select cells that are on the current rank and are super cells.
+    rdata = data[data[:,localcol] == 1]
+    tdata = rdata[rdata[:,supercol] == 1]
+
+    #  Separation of the cells is in data.
+    xwidth = tdata[0,xwcol]
+    ywidth = tdata[0,ywcol]
+    zwidth = tdata[0,zwcol]
+
+    #  Fill space nx, ny, nz with all toplevel cells and flag their active
+    #  state.
+    space = np.zeros((nx,ny,nz))
+    actives = []
+    for line in tdata:
+        ix = int(np.rint(line[xcol] / xwidth))
+        iy = int(np.rint(line[ycol] / ywidth))
+        iz = int(np.rint(line[zcol] / zwidth))
+        active = int(line[activecol])
+        space[ix,iy,iz] = 1 + active
+        tcount = tcount + 1
+        if active == 1:
+            actives.append([ix, iy, iz, line])
+    
+    #  Report all active cells and flag any without 26 neighbours. These are
+    #  on the edge of the partition volume and will have foreign neighbour
+    #  cells.
+    for active in actives:
+        count = 0
+        for ii in [-1, 0, 1]:
+            i = active[0] + ii
+            if i < 0:
+                i = i + nx
+            elif i >= nx:
+                i = i - nx
+
+            for jj in [-1, 0, 1]:
+                j = active[1] + jj
+                if j < 0:
+                    j = j + ny
+                elif j >= ny:
+                    j = j - ny
+
+                for kk in [-1, 0, 1]:
+                    k = active[2] + kk
+                    if k < 0:
+                        k = k + nz
+                    elif k >= nz:
+                        k = k - nz
+                    if space[i, j, k] > 0:
+                        count = count + 1
+        if count < 27:
+            onedge = onedge + 1
+            print active[3][0], active[3][1], active[3][2], 1
+        else:
+            print active[3][0], active[3][1], active[3][2], 0
+
+    allactives.extend(actives)
+
+print "# top cells: ", tcount, " active: ", len(allactives), " on edge: ", onedge
+
+sys.exit(0)
+
diff --git a/examples/main.c b/examples/main.c
index 0d87ea350793e0d089b41d3bd05a68cd97753ef9..31e6a5db089a919107e0f768f2e23b60a3520dd0 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -203,7 +203,11 @@ int main(int argc, char *argv[]) {
         with_drift_all = 1;
         break;
       case 'e':
+#ifdef HAVE_FE_ENABLE_EXCEPT
         with_fp_exceptions = 1;
+#else
+        error("Need support for floating point exceptions on this platform");
+#endif
         break;
       case 'f':
         if (sscanf(optarg, "%llu", &cpufreq) != 1) {
@@ -379,11 +383,19 @@ int main(int argc, char *argv[]) {
 
   /* Do we choke on FP-exceptions ? */
   if (with_fp_exceptions) {
+#ifdef HAVE_FE_ENABLE_EXCEPT
     feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+#endif
     if (myrank == 0)
       message("WARNING: Floating point exceptions will be reported.");
   }
 
+/* Do we have slow barriers? */
+#ifndef HAVE_PTHREAD_BARRIERS
+  if (myrank == 0)
+    message("WARNING: Non-optimal thread barriers are being used.");
+#endif
+
   /* How large are the parts? */
   if (myrank == 0) {
     message("sizeof(part)        is %4zi bytes.", sizeof(struct part));
@@ -463,6 +475,7 @@ int main(int argc, char *argv[]) {
   /* Initialise the hydro properties */
   struct hydro_props hydro_properties;
   if (with_hydro) hydro_props_init(&hydro_properties, params);
+  if (with_hydro) eos_init(&eos, params);
 
   /* Initialise the gravity properties */
   struct gravity_props gravity_properties;
@@ -689,9 +702,9 @@ int main(int argc, char *argv[]) {
 
   /* Legend */
   if (myrank == 0)
-    printf("# %6s %14s %14s %10s %10s %10s %16s [%s]\n", "Step", "Time",
+    printf("# %6s %14s %14s %12s %12s %12s %16s [%s] %6s\n", "Step", "Time",
            "Time-step", "Updates", "g-Updates", "s-Updates", "Wall-clock time",
-           clocks_getunit());
+           clocks_getunit(), "Props");
 
   /* File for the timers */
   if (with_verbose_timers) timers_open_file(myrank);
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 6327540c27753ea61a79d7fb4c16d60c5f00635d..948731d7879ec3e34f6e53f58dc72f6d7a6145ec 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -15,6 +15,7 @@ Scheduler:
   cell_split_size:        400       # (Optional) Maximal number of particles per cell (this is the default value).
   max_top_level_cells:    12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
   tasks_per_cell:         0         # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...).
+  mpi_message_limit:      4096      # (Optional) Maximum MPI task message size to send non-buffered, KB.
 
 # Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.)
 TimeIntegration:
@@ -50,6 +51,9 @@ SPH:
   max_volume_change:     1.4      # (Optional) Maximal allowed change of kernel volume over one time-step.
   max_ghost_iterations:  30       # (Optional) Maximal number of iterations allowed to converge towards the smoothing length.
 
+EoS:
+  isothermal_internal_energy: 20.26784 # Thermal energy per unit mass for the case of isothermal equation of state (in internal units).
+
 # Parameters for the self-gravity scheme
 Gravity:
   eta:          0.025    # Constant dimensionless multiplier for time integration.
@@ -76,8 +80,13 @@ DomainDecomposition:
   initial_grid_x:   10      # (Optional) Grid size if the "grid" strategy is chosen.
   initial_grid_y:   10      # ""
   initial_grid_z:   10      # ""
-  repartition_type: task_weights # (Optional) The re-decomposition strategy: "none", "task_weights", "particle_weights",
-                                 #            "edge_task_weights" or "hybrid_weights".
+
+  repartition_type: costs/costs # (Optional) The re-decomposition strategy, one of:
+                            # "none/none", "costs/costs", "counts/none", "none/costs", "counts/costs",
+                            # "costs/time" or "none/time".
+                            # These are vertex/edge weights with "costs" as task timing, "counts" as
+                            # sum of particles and "time" as the expected time of the next updates
+
   trigger:          0.05    # (Optional) Fractional (<1) CPU time difference between MPI ranks required to trigger a
                             # new decomposition, or number of steps (>1) between decompositions
   minfrac:          0.9     # (Optional) Fractional of all particles that should be updated in previous step when
@@ -117,7 +126,7 @@ SineWavePotential:
   amplitude:        10.     # Amplitude of the sine wave (internal units)
   timestep_limit:   1.      # Time-step dimensionless pre-factor.
   growth_time:      0.      # (Optional) Time for the potential to grow to its final size.
- 
+
 # Parameters related to cooling function  ----------------------------------------------
 
 # Constant du/dt cooling function
diff --git a/examples/plot_task_dependencies.sh b/examples/plot_task_dependencies.sh
new file mode 100755
index 0000000000000000000000000000000000000000..77784d8a9cdd3720621c9ad35c4cfbdaf0167ff1
--- /dev/null
+++ b/examples/plot_task_dependencies.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+#  Creates a graphic from the task graph file dependency_graph.dot.
+#  Requires the graphviz command "dot".
+
+if [ ! -e dependency_graph.dot ]; then
+    echo "Missing task-graph output 'dependency_graph.dot'! Cannot generate figure."
+else 
+    dot -Tpng dependency_graph.dot -o task_graph.png
+    echo "Output written to task_graph.png"
+fi
+
+exit
diff --git a/examples/process_cells b/examples/process_cells
new file mode 100755
index 0000000000000000000000000000000000000000..eead4572387f2732cf0b86265f14d20039f73ae1
--- /dev/null
+++ b/examples/process_cells
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Usage:
+#  process_cells nx ny nz nprocess
+#
+# Description:
+#  Process all the cell dumps in the current directory.
+
+#  Outputs file per rank with the active cells identified and marked as to
+#  whether they are near an edge or not. Note requires the numbers of cells
+#  per dimension of the space.
+#
+#  Also outputs a graphic showing the fraction of active cells on edges
+#  for each step.
+
+#  Handle command-line
+if test "$4" = ""; then
+    echo "Usage: $0 nx ny nz nprocess"
+    exit 1
+fi
+NX=$1
+NY=$2
+NZ=$3
+NPROCS=$4
+
+#  Locate script.
+SCRIPTHOME=$(dirname "$0")
+
+#  Find all files. Use version sort to get into correct order.
+files=$(ls -v cells_*.dat)
+if test $? != 0; then
+    echo "Failed to find any cell dump files"
+    exit 1
+fi
+
+#  Construct list of names need the number of ranks.
+ranks=$(ls -v cells_*.dat | sed 's,cells_\(.*\)_.*.dat,\1,' | sort | uniq | wc -l)
+echo "Number of ranks = $ranks"
+
+#  Now construct a list of files ordered by rank, not step.
+files=$(ls cells_*.dat | sort -t "_" -k 3,3 -n | xargs -n 4)
+
+#  Need number of steps.
+nfiles=$(echo $files| wc -w)
+echo "Number of files = $nfiles"
+steps=$(( $nfiles / $ranks + 1 ))
+echo "Number of steps = $steps"
+
+#  And process them,
+echo "Processing cell dumps files..."
+echo $files | xargs -P $NPROCS -n 4 /bin/bash -c "${SCRIPTHOME}/process_cells_helper $NX $NY $NZ \$0 \$1 \$2 \$3"
+
+#  Create summary.
+grep "top cells" step*-active-cells.dat | sort -h > active_cells.log
+
+#  And plot of active cells to edge cells.
+stilts plot2plane ifmt=ascii in=active_cells.log xmin=-0.1 xmax=1.1 ymin=0 ymax=$steps grid=1 \
+       legend=false xpix=600 ypix=500 xlabel="Edge cells/Active cells" ylabel="Step" \
+       layer1=mark x1="col9/1.0/col6" y1="index" size1=3 shading1=aux auxmap=rainbow \
+       aux=col6 auxfunc=log auxlabel="Active cells" layer2=histogram x2="col9/1.0/col6" \
+       color2=grey binsize2=0.01 phase2=0.5 barform2=semi_steps thick2=1 \
+       out=active_cells.png
+
+exit
diff --git a/examples/process_cells_helper b/examples/process_cells_helper
new file mode 100755
index 0000000000000000000000000000000000000000..a96bdba0de56e04c28ed6d8675c705241ce241d9
--- /dev/null
+++ b/examples/process_cells_helper
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+#  Helper for process_cells.
+
+#  Locate script.
+SCRIPTHOME=$(dirname "$0")
+
+step=$(echo $4|sed 's,cells_\(.*\)_\(.*\).dat,\2,')
+echo "${SCRIPTHOME}/analyse_dump_cells.py $* > step${step}-active-cells.dat"
+${SCRIPTHOME}/analyse_dump_cells.py $* > step${step}-active-cells.dat
+
+exit
diff --git a/m4/ax_cc_maxopt.m4 b/m4/ax_cc_maxopt.m4
index 93d5d6dcd78ff77c934f77ad0e1e02ef37873a37..0fcef66c7542fbb840e68d259550343a1a5601a1 100644
--- a/m4/ax_cc_maxopt.m4
+++ b/m4/ax_cc_maxopt.m4
@@ -146,6 +146,22 @@ if test "$ac_test_CFLAGS" != "set"; then
         fi
 	;;
 
+    clang)
+     # default optimization flags for clang on all systems
+     CFLAGS="-O3 -fomit-frame-pointer"
+
+     # Always good optimisation to have
+     AX_CHECK_COMPILE_FLAG(-fstrict-aliasing, CFLAGS="$CFLAGS -fstrict-aliasing")
+
+     # note that we enable "unsafe" fp optimization with other compilers, too
+     AX_CHECK_COMPILE_FLAG(-ffast-math, CFLAGS="$CFLAGS -ffast-math")
+
+     # not all codes will benefit from this.
+     AX_CHECK_COMPILE_FLAG(-funroll-loops, CFLAGS="$CFLAGS -funroll-loops")
+
+     AX_GCC_ARCHFLAG($acx_maxopt_portable)
+     ;;
+
     gnu)
      # default optimization flags for gcc on all systems
      CFLAGS="-O3 -fomit-frame-pointer"
@@ -155,7 +171,7 @@ if test "$ac_test_CFLAGS" != "set"; then
 
      #  -fstrict-aliasing for gcc-2.95+
      AX_CHECK_COMPILE_FLAG(-fstrict-aliasing,
-	CFLAGS="$CFLAGS -fstrict-aliasing")
+        CFLAGS="$CFLAGS -fstrict-aliasing")
 
      # note that we enable "unsafe" fp optimization with other compilers, too
      AX_CHECK_COMPILE_FLAG(-ffast-math, CFLAGS="$CFLAGS -ffast-math")
diff --git a/m4/ax_gcc_archflag.m4 b/m4/ax_gcc_archflag.m4
index 298db809e82130b77758f5e66c29c276ceb8266a..6d4feb1e0aab816d0830dcdfdd6d84d61575c8b2 100644
--- a/m4/ax_gcc_archflag.m4
+++ b/m4/ax_gcc_archflag.m4
@@ -108,7 +108,8 @@ case $host_cpu in
 	    *3?6[[ae]]?:*:*:*) ax_gcc_arch="ivybridge core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
 	    *3?6[[cf]]?:*:*:*|*4?6[[56]]?:*:*:*) ax_gcc_arch="haswell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
 	    *3?6d?:*:*:*|*4?6f?:*:*:*) ax_gcc_arch="broadwell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
-	    *9?6[[de]]?:*:*:*) ax_gcc_arch="kabylake core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;	
+	    *4?6[[de]]?:*:*:*) ax_gcc_arch="skylake core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;	
+            *9?6[[de]]?:*:*:*) ax_gcc_arch="kabylake core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
 	    *1?6c?:*:*:*|*2?6[[67]]?:*:*:*|*3?6[[56]]?:*:*:*) ax_gcc_arch="bonnell atom core2 pentium-m pentium3 pentiumpro" ;;
 	    *3?67?:*:*:*|*[[45]]?6[[ad]]?:*:*:*) ax_gcc_arch="silvermont atom core2 pentium-m pentium3 pentiumpro" ;;
 	    *000?f[[012]]?:*:*:*|?f[[012]]?:*:*:*|f[[012]]?:*:*:*) ax_gcc_arch="pentium4 pentiumpro" ;;
diff --git a/m4/ax_pthread.m4 b/m4/ax_pthread.m4
index d383ad5c6d6a5061370800bb1dc89b7a334c0638..5fbf9fe0d68616042f87a8365190211cb8ccfbf1 100644
--- a/m4/ax_pthread.m4
+++ b/m4/ax_pthread.m4
@@ -1,5 +1,5 @@
 # ===========================================================================
-#        http://www.gnu.org/software/autoconf-archive/ax_pthread.html
+#        https://www.gnu.org/software/autoconf-archive/ax_pthread.html
 # ===========================================================================
 #
 # SYNOPSIS
@@ -19,10 +19,10 @@
 #   is necessary on AIX to use the special cc_r compiler alias.)
 #
 #   NOTE: You are assumed to not only compile your program with these flags,
-#   but also link it with them as well. e.g. you should link with
+#   but also to link with them as well. For example, you might link with
 #   $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
 #
-#   If you are only building threads programs, you may wish to use these
+#   If you are only building threaded programs, you may wish to use these
 #   variables in your default LIBS, CFLAGS, and CC:
 #
 #     LIBS="$PTHREAD_LIBS $LIBS"
@@ -30,8 +30,8 @@
 #     CC="$PTHREAD_CC"
 #
 #   In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant
-#   has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to that name
-#   (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
+#   has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to
+#   that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
 #
 #   Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the
 #   PTHREAD_PRIO_INHERIT symbol is defined when compiling with
@@ -67,7 +67,7 @@
 #   Public License for more details.
 #
 #   You should have received a copy of the GNU General Public License along
-#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#   with this program. If not, see <https://www.gnu.org/licenses/>.
 #
 #   As a special exception, the respective Autoconf Macro's copyright owner
 #   gives unlimited permission to copy, distribute and modify the configure
@@ -82,35 +82,40 @@
 #   modified version of the Autoconf Macro, you may extend this special
 #   exception to the GPL to apply to your modified version as well.
 
-#serial 21
+#serial 24
 
 AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD])
 AC_DEFUN([AX_PTHREAD], [
 AC_REQUIRE([AC_CANONICAL_HOST])
+AC_REQUIRE([AC_PROG_CC])
+AC_REQUIRE([AC_PROG_SED])
 AC_LANG_PUSH([C])
 ax_pthread_ok=no
 
 # We used to check for pthread.h first, but this fails if pthread.h
-# requires special compiler flags (e.g. on True64 or Sequent).
+# requires special compiler flags (e.g. on Tru64 or Sequent).
 # It gets checked for in the link test anyway.
 
 # First of all, check if the user has set any of the PTHREAD_LIBS,
 # etcetera environment variables, and if threads linking works using
 # them:
-if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
-        save_CFLAGS="$CFLAGS"
+if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then
+        ax_pthread_save_CC="$CC"
+        ax_pthread_save_CFLAGS="$CFLAGS"
+        ax_pthread_save_LIBS="$LIBS"
+        AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"])
         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
-        save_LIBS="$LIBS"
         LIBS="$PTHREAD_LIBS $LIBS"
-        AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
-        AC_TRY_LINK_FUNC([pthread_join], [ax_pthread_ok=yes])
+        AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS])
+        AC_LINK_IFELSE([AC_LANG_CALL([], [pthread_join])], [ax_pthread_ok=yes])
         AC_MSG_RESULT([$ax_pthread_ok])
-        if test x"$ax_pthread_ok" = xno; then
+        if test "x$ax_pthread_ok" = "xno"; then
                 PTHREAD_LIBS=""
                 PTHREAD_CFLAGS=""
         fi
-        LIBS="$save_LIBS"
-        CFLAGS="$save_CFLAGS"
+        CC="$ax_pthread_save_CC"
+        CFLAGS="$ax_pthread_save_CFLAGS"
+        LIBS="$ax_pthread_save_LIBS"
 fi
 
 # We must check for the threads library under a number of different
@@ -123,7 +128,7 @@ fi
 # which indicates that we try without any flags at all, and "pthread-config"
 # which is a program returning the flags for the Pth emulation library.
 
-ax_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
+ax_pthread_flags="pthreads none -Kthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
 
 # The ordering *is* (sometimes) important.  Some notes on the
 # individual items follow:
@@ -132,82 +137,225 @@ ax_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mt
 # none: in case threads are in libc; should be tried before -Kthread and
 #       other compiler flags to prevent continual compiler warnings
 # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
-# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
-# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
-# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
-# -pthreads: Solaris/gcc
-# -mthreads: Mingw32/gcc, Lynx/gcc
+# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads), Tru64
+#           (Note: HP C rejects this with "bad form for `-t' option")
+# -pthreads: Solaris/gcc (Note: HP C also rejects)
 # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
-#      doesn't hurt to check since this sometimes defines pthreads too;
-#      also defines -D_REENTRANT)
-#      ... -mt is also the pthreads flag for HP/aCC
+#      doesn't hurt to check since this sometimes defines pthreads and
+#      -D_REENTRANT too), HP C (must be checked before -lpthread, which
+#      is present but should not be used directly; and before -mthreads,
+#      because the compiler interprets this as "-mt" + "-hreads")
+# -mthreads: Mingw32/gcc, Lynx/gcc
 # pthread: Linux, etcetera
 # --thread-safe: KAI C++
 # pthread-config: use pthread-config program (for GNU Pth library)
 
-case ${host_os} in
+case $host_os in
+
+        freebsd*)
+
+        # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+        # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+
+        ax_pthread_flags="-kthread lthread $ax_pthread_flags"
+        ;;
+
+        hpux*)
+
+        # From the cc(1) man page: "[-mt] Sets various -D flags to enable
+        # multi-threading and also sets -lpthread."
+
+        ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags"
+        ;;
+
+        openedition*)
+
+        # IBM z/OS requires a feature-test macro to be defined in order to
+        # enable POSIX threads at all, so give the user a hint if this is
+        # not set. (We don't define these ourselves, as they can affect
+        # other portions of the system API in unpredictable ways.)
+
+        AC_EGREP_CPP([AX_PTHREAD_ZOS_MISSING],
+            [
+#            if !defined(_OPEN_THREADS) && !defined(_UNIX03_THREADS)
+             AX_PTHREAD_ZOS_MISSING
+#            endif
+            ],
+            [AC_MSG_WARN([IBM z/OS requires -D_OPEN_THREADS or -D_UNIX03_THREADS to enable pthreads support.])])
+        ;;
+
         solaris*)
 
         # On Solaris (at least, for some versions), libc contains stubbed
         # (non-functional) versions of the pthreads routines, so link-based
-        # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
-        # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
-        # a function called by this macro, so we could check for that, but
-        # who knows whether they'll stub that too in a future libc.)  So,
-        # we'll just look for -pthreads and -lpthread first:
+        # tests will erroneously succeed. (N.B.: The stubs are missing
+        # pthread_cleanup_push, or rather a function called by this macro,
+        # so we could check for that, but who knows whether they'll stub
+        # that too in a future libc.)  So we'll check first for the
+        # standard Solaris way of linking pthreads (-mt -lpthread).
+
+        ax_pthread_flags="-mt,pthread pthread $ax_pthread_flags"
+        ;;
+esac
+
+# GCC generally uses -pthread, or -pthreads on some platforms (e.g. SPARC)
 
-        ax_pthread_flags="-pthreads pthread -mt -pthread $ax_pthread_flags"
+AS_IF([test "x$GCC" = "xyes"],
+      [ax_pthread_flags="-pthread -pthreads $ax_pthread_flags"])
+
+# The presence of a feature test macro requesting re-entrant function
+# definitions is, on some systems, a strong hint that pthreads support is
+# correctly enabled
+
+case $host_os in
+        darwin* | hpux* | linux* | osf* | solaris*)
+        ax_pthread_check_macro="_REENTRANT"
         ;;
 
-        darwin*)
-        ax_pthread_flags="-pthread $ax_pthread_flags"
+        aix*)
+        ax_pthread_check_macro="_THREAD_SAFE"
+        ;;
+
+        *)
+        ax_pthread_check_macro="--"
         ;;
 esac
+AS_IF([test "x$ax_pthread_check_macro" = "x--"],
+      [ax_pthread_check_cond=0],
+      [ax_pthread_check_cond="!defined($ax_pthread_check_macro)"])
+
+# Are we compiling with Clang?
+
+AC_CACHE_CHECK([whether $CC is Clang],
+    [ax_cv_PTHREAD_CLANG],
+    [ax_cv_PTHREAD_CLANG=no
+     # Note that Autoconf sets GCC=yes for Clang as well as GCC
+     if test "x$GCC" = "xyes"; then
+        AC_EGREP_CPP([AX_PTHREAD_CC_IS_CLANG],
+            [/* Note: Clang 2.7 lacks __clang_[a-z]+__ */
+#            if defined(__clang__) && defined(__llvm__)
+             AX_PTHREAD_CC_IS_CLANG
+#            endif
+            ],
+            [ax_cv_PTHREAD_CLANG=yes])
+     fi
+    ])
+ax_pthread_clang="$ax_cv_PTHREAD_CLANG"
+
+ax_pthread_clang_warning=no
+
+# Clang needs special handling, because older versions handle the -pthread
+# option in a rather... idiosyncratic way
+
+if test "x$ax_pthread_clang" = "xyes"; then
+
+        # Clang takes -pthread; it has never supported any other flag
+
+        # (Note 1: This will need to be revisited if a system that Clang
+        # supports has POSIX threads in a separate library.  This tends not
+        # to be the way of modern systems, but it's conceivable.)
+
+        # (Note 2: On some systems, notably Darwin, -pthread is not needed
+        # to get POSIX threads support; the API is always present and
+        # active.  We could reasonably leave PTHREAD_CFLAGS empty.  But
+        # -pthread does define _REENTRANT, and while the Darwin headers
+        # ignore this macro, third-party headers might not.)
+
+        PTHREAD_CFLAGS="-pthread"
+        PTHREAD_LIBS=
+
+        ax_pthread_ok=yes
+
+        # However, older versions of Clang make a point of warning the user
+        # that, in an invocation where only linking and no compilation is
+        # taking place, the -pthread option has no effect ("argument unused
+        # during compilation").  They expect -pthread to be passed in only
+        # when source code is being compiled.
+        #
+        # Problem is, this is at odds with the way Automake and most other
+        # C build frameworks function, which is that the same flags used in
+        # compilation (CFLAGS) are also used in linking.  Many systems
+        # supported by AX_PTHREAD require exactly this for POSIX threads
+        # support, and in fact it is often not straightforward to specify a
+        # flag that is used only in the compilation phase and not in
+        # linking.  Such a scenario is extremely rare in practice.
+        #
+        # Even though use of the -pthread flag in linking would only print
+        # a warning, this can be a nuisance for well-run software projects
+        # that build with -Werror.  So if the active version of Clang has
+        # this misfeature, we search for an option to squash it.
+
+        AC_CACHE_CHECK([whether Clang needs flag to prevent "argument unused" warning when linking with -pthread],
+            [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG],
+            [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG=unknown
+             # Create an alternate version of $ac_link that compiles and
+             # links in two steps (.c -> .o, .o -> exe) instead of one
+             # (.c -> exe), because the warning occurs only in the second
+             # step
+             ax_pthread_save_ac_link="$ac_link"
+             ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g'
+             ax_pthread_link_step=`$as_echo "$ac_link" | sed "$ax_pthread_sed"`
+             ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)"
+             ax_pthread_save_CFLAGS="$CFLAGS"
+             for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do
+                AS_IF([test "x$ax_pthread_try" = "xunknown"], [break])
+                CFLAGS="-Werror -Wunknown-warning-option $ax_pthread_try -pthread $ax_pthread_save_CFLAGS"
+                ac_link="$ax_pthread_save_ac_link"
+                AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])],
+                    [ac_link="$ax_pthread_2step_ac_link"
+                     AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])],
+                         [break])
+                    ])
+             done
+             ac_link="$ax_pthread_save_ac_link"
+             CFLAGS="$ax_pthread_save_CFLAGS"
+             AS_IF([test "x$ax_pthread_try" = "x"], [ax_pthread_try=no])
+             ax_cv_PTHREAD_CLANG_NO_WARN_FLAG="$ax_pthread_try"
+            ])
 
-# Clang doesn't consider unrecognized options an error unless we specify
-# -Werror. We throw in some extra Clang-specific options to ensure that
-# this doesn't happen for GCC, which also accepts -Werror.
+        case "$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG" in
+                no | unknown) ;;
+                *) PTHREAD_CFLAGS="$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG $PTHREAD_CFLAGS" ;;
+        esac
 
-AC_MSG_CHECKING([if compiler needs -Werror to reject unknown flags])
-save_CFLAGS="$CFLAGS"
-ax_pthread_extra_flags="-Werror"
-CFLAGS="$CFLAGS $ax_pthread_extra_flags -Wunknown-warning-option -Wsizeof-array-argument"
-AC_COMPILE_IFELSE([AC_LANG_PROGRAM([int foo(void);],[foo()])],
-                  [AC_MSG_RESULT([yes])],
-                  [ax_pthread_extra_flags=
-                   AC_MSG_RESULT([no])])
-CFLAGS="$save_CFLAGS"
+fi # $ax_pthread_clang = yes
 
-if test x"$ax_pthread_ok" = xno; then
-for flag in $ax_pthread_flags; do
+if test "x$ax_pthread_ok" = "xno"; then
+for ax_pthread_try_flag in $ax_pthread_flags; do
 
-        case $flag in
+        case $ax_pthread_try_flag in
                 none)
                 AC_MSG_CHECKING([whether pthreads work without any flags])
                 ;;
 
+                -mt,pthread)
+                AC_MSG_CHECKING([whether pthreads work with -mt -lpthread])
+                PTHREAD_CFLAGS="-mt"
+                PTHREAD_LIBS="-lpthread"
+                ;;
+
                 -*)
-                AC_MSG_CHECKING([whether pthreads work with $flag])
-                PTHREAD_CFLAGS="$flag"
+                AC_MSG_CHECKING([whether pthreads work with $ax_pthread_try_flag])
+                PTHREAD_CFLAGS="$ax_pthread_try_flag"
                 ;;
 
                 pthread-config)
                 AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no])
-                if test x"$ax_pthread_config" = xno; then continue; fi
+                AS_IF([test "x$ax_pthread_config" = "xno"], [continue])
                 PTHREAD_CFLAGS="`pthread-config --cflags`"
                 PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
                 ;;
 
                 *)
-                AC_MSG_CHECKING([for the pthreads library -l$flag])
-                PTHREAD_LIBS="-l$flag"
+                AC_MSG_CHECKING([for the pthreads library -l$ax_pthread_try_flag])
+                PTHREAD_LIBS="-l$ax_pthread_try_flag"
                 ;;
         esac
 
-        save_LIBS="$LIBS"
-        save_CFLAGS="$CFLAGS"
+        ax_pthread_save_CFLAGS="$CFLAGS"
+        ax_pthread_save_LIBS="$LIBS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
         LIBS="$PTHREAD_LIBS $LIBS"
-        CFLAGS="$CFLAGS $PTHREAD_CFLAGS $ax_pthread_extra_flags"
 
         # Check for various functions.  We must include pthread.h,
         # since some functions may be macros.  (On the Sequent, we
@@ -218,7 +366,11 @@ for flag in $ax_pthread_flags; do
         # pthread_cleanup_push because it is one of the few pthread
         # functions on Solaris that doesn't have a non-functional libc stub.
         # We try pthread_create on general principles.
+
         AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>
+#                       if $ax_pthread_check_cond
+#                        error "$ax_pthread_check_macro must be defined"
+#                       endif
                         static void routine(void *a) { a = 0; }
                         static void *start_routine(void *a) { return a; }],
                        [pthread_t th; pthread_attr_t attr;
@@ -227,16 +379,14 @@ for flag in $ax_pthread_flags; do
                         pthread_attr_init(&attr);
                         pthread_cleanup_push(routine, 0);
                         pthread_cleanup_pop(0) /* ; */])],
-                [ax_pthread_ok=yes],
-                [])
+            [ax_pthread_ok=yes],
+            [])
 
-        LIBS="$save_LIBS"
-        CFLAGS="$save_CFLAGS"
+        CFLAGS="$ax_pthread_save_CFLAGS"
+        LIBS="$ax_pthread_save_LIBS"
 
         AC_MSG_RESULT([$ax_pthread_ok])
-        if test "x$ax_pthread_ok" = xyes; then
-                break;
-        fi
+        AS_IF([test "x$ax_pthread_ok" = "xyes"], [break])
 
         PTHREAD_LIBS=""
         PTHREAD_CFLAGS=""
@@ -244,71 +394,74 @@ done
 fi
 
 # Various other checks:
-if test "x$ax_pthread_ok" = xyes; then
-        save_LIBS="$LIBS"
-        LIBS="$PTHREAD_LIBS $LIBS"
-        save_CFLAGS="$CFLAGS"
+if test "x$ax_pthread_ok" = "xyes"; then
+        ax_pthread_save_CFLAGS="$CFLAGS"
+        ax_pthread_save_LIBS="$LIBS"
         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
 
         # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
-        AC_MSG_CHECKING([for joinable pthread attribute])
-        attr_name=unknown
-        for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
-            AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>],
-                           [int attr = $attr; return attr /* ; */])],
-                [attr_name=$attr; break],
-                [])
-        done
-        AC_MSG_RESULT([$attr_name])
-        if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
-            AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE], [$attr_name],
-                               [Define to necessary symbol if this constant
-                                uses a non-standard name on your system.])
-        fi
-
-        AC_MSG_CHECKING([if more special flags are required for pthreads])
-        flag=no
-        case ${host_os} in
-            aix* | freebsd* | darwin*) flag="-D_THREAD_SAFE";;
-            osf* | hpux*) flag="-D_REENTRANT";;
-            solaris*)
-            if test "$GCC" = "yes"; then
-                flag="-D_REENTRANT"
-            else
-                # TODO: What about Clang on Solaris?
-                flag="-mt -D_REENTRANT"
-            fi
-            ;;
-        esac
-        AC_MSG_RESULT([$flag])
-        if test "x$flag" != xno; then
-            PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
-        fi
+        AC_CACHE_CHECK([for joinable pthread attribute],
+            [ax_cv_PTHREAD_JOINABLE_ATTR],
+            [ax_cv_PTHREAD_JOINABLE_ATTR=unknown
+             for ax_pthread_attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
+                 AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>],
+                                                 [int attr = $ax_pthread_attr; return attr /* ; */])],
+                                [ax_cv_PTHREAD_JOINABLE_ATTR=$ax_pthread_attr; break],
+                                [])
+             done
+            ])
+        AS_IF([test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xunknown" && \
+               test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xPTHREAD_CREATE_JOINABLE" && \
+               test "x$ax_pthread_joinable_attr_defined" != "xyes"],
+              [AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE],
+                                  [$ax_cv_PTHREAD_JOINABLE_ATTR],
+                                  [Define to necessary symbol if this constant
+                                   uses a non-standard name on your system.])
+               ax_pthread_joinable_attr_defined=yes
+              ])
+
+        AC_CACHE_CHECK([whether more special flags are required for pthreads],
+            [ax_cv_PTHREAD_SPECIAL_FLAGS],
+            [ax_cv_PTHREAD_SPECIAL_FLAGS=no
+             case $host_os in
+             solaris*)
+             ax_cv_PTHREAD_SPECIAL_FLAGS="-D_POSIX_PTHREAD_SEMANTICS"
+             ;;
+             esac
+            ])
+        AS_IF([test "x$ax_cv_PTHREAD_SPECIAL_FLAGS" != "xno" && \
+               test "x$ax_pthread_special_flags_added" != "xyes"],
+              [PTHREAD_CFLAGS="$ax_cv_PTHREAD_SPECIAL_FLAGS $PTHREAD_CFLAGS"
+               ax_pthread_special_flags_added=yes])
 
         AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT],
-            [ax_cv_PTHREAD_PRIO_INHERIT], [
-                AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]],
-                                                [[int i = PTHREAD_PRIO_INHERIT;]])],
-                    [ax_cv_PTHREAD_PRIO_INHERIT=yes],
-                    [ax_cv_PTHREAD_PRIO_INHERIT=no])
+            [ax_cv_PTHREAD_PRIO_INHERIT],
+            [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]],
+                                             [[int i = PTHREAD_PRIO_INHERIT;]])],
+                            [ax_cv_PTHREAD_PRIO_INHERIT=yes],
+                            [ax_cv_PTHREAD_PRIO_INHERIT=no])
             ])
-        AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes"],
-            [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.])])
+        AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes" && \
+               test "x$ax_pthread_prio_inherit_defined" != "xyes"],
+              [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.])
+               ax_pthread_prio_inherit_defined=yes
+              ])
 
-        LIBS="$save_LIBS"
-        CFLAGS="$save_CFLAGS"
+        CFLAGS="$ax_pthread_save_CFLAGS"
+        LIBS="$ax_pthread_save_LIBS"
 
         # More AIX lossage: compile with *_r variant
-        if test "x$GCC" != xyes; then
+        if test "x$GCC" != "xyes"; then
             case $host_os in
                 aix*)
                 AS_CASE(["x/$CC"],
-                  [x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6],
-                  [#handle absolute path differently from PATH based program lookup
-                   AS_CASE(["x$CC"],
-                     [x/*],
-                     [AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])],
-                     [AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])])])
+                    [x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6],
+                    [#handle absolute path differently from PATH based program lookup
+                     AS_CASE(["x$CC"],
+                         [x/*],
+                         [AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])],
+                         [AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])])])
                 ;;
             esac
         fi
@@ -321,7 +474,7 @@ AC_SUBST([PTHREAD_CFLAGS])
 AC_SUBST([PTHREAD_CC])
 
 # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
-if test x"$ax_pthread_ok" = xyes; then
+if test "x$ax_pthread_ok" = "xyes"; then
         ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1])
         :
 else
diff --git a/src/Makefile.am b/src/Makefile.am
index ec01184928faf3d58b2d0890965a745d05718354..e229d7f11e76aae4b6898a2f86145ec2bd025592 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -57,10 +57,10 @@ AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
     runner_doiact_fft.c threadpool.c cooling.c sourceterms.c \
     statistics.c runner_doiact_vec.c profiler.c dump.c logger.c \
     part_type.c xmf.c gravity_properties.c gravity.c \
-    collectgroup.c hydro_space.c
+    collectgroup.c hydro_space.c equation_of_state.c
 
 # Include files for distribution, not installation.
-nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \
+nobase_noinst_HEADERS = align.h approx_math.h atomic.h barrier.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \
 		 kernel_long_gravity.h vector.h cache.h runner_doiact.h runner_doiact_vec.h runner_doiact_grav.h runner_doiact_fft.h \
                  runner_doiact_nosort.h units.h intrinsics.h minmax.h kick.h timestep.h drift.h adiabatic_index.h io_properties.h \
 		 dimension.h equation_of_state.h part_type.h periodic.h \
diff --git a/src/barrier.h b/src/barrier.h
new file mode 100644
index 0000000000000000000000000000000000000000..dbe856b402580f859042023cd423bd955887f790
--- /dev/null
+++ b/src/barrier.h
@@ -0,0 +1,170 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2017 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_BARRIER_H
+#define SWIFT_BARRIER_H
+
+/**
+ * @file barrier.h
+ * @brief Define the thread barriers if the POSIX implementation on this system
+ * does not.
+ *
+ * The pthread barriers are only an optional part of the POSIX standard and are not
+ * necessarily implemented. One example is OSX where all the rest of POSIX
+ * exists but not the barriers.
+ * We implement them here in a simple way to allow SWIFT to run on such
+ * systems but this may lead to poorer performance.
+ *
+ * Note that we only define the three functions we need. This is a textbook
+ * implementation of a barrier that uses the common POSIX features (mutex,
+ * conditions and broadcasts).
+ *
+ * If the pthread barriers exist (Linux systems), we default to them.
+ */
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Standard headers */
+#include <pthread.h>
+
+/* Does this POSIX implementation provide barriers? */
+#ifdef HAVE_PTHREAD_BARRIERS
+
+#define swift_barrier_t pthread_barrier_t
+#define swift_barrier_wait pthread_barrier_wait
+#define swift_barrier_init pthread_barrier_init
+#define swift_barrier_destroy pthread_barrier_destroy
+
+#else
+
+/* Local headers */
+#include "error.h"
+#include "inline.h"
+
+/**
+ * @brief An ersatz of POSIX barriers to be used on systems that don't provide
+ * the good ones.
+ */
+typedef struct {
+
+  /*! Barrier mutex */
+  pthread_mutex_t mutex;
+
+  /*! Condition to open the barrier */
+  pthread_cond_t condition;
+
+  /*! Total number of threads */
+  int limit;
+
+  /*! Number of threads that reached the barrier */
+  int count;
+
+} swift_barrier_t;
+
+/**
+ * @brief Initialise a barrier object.
+ *
+ * @param barrier The #swift_barrier_t to initialise
+ * @param unused Unused parameter (NULL) as we don't support barrier attributes.
+ * @param count The number of threads that will wait at the barrier.
+ */
+static INLINE int swift_barrier_init(swift_barrier_t *barrier, void *unused,
+                                     unsigned int count) {
+  /* Initialise the mutex */
+  if (pthread_mutex_init(&barrier->mutex, 0) != 0)
+    error("Error initializing the barrier mutex");
+
+  /* Initialise the condition */
+  if (pthread_cond_init(&barrier->condition, 0) != 0)
+    error("Error initializing the barrier condition");
+
+  barrier->limit = count;
+  barrier->count = 0;
+
+  /* All is good */
+  return 0;
+}
+
+/**
+ * @brief Make a set of threads wait at the barrier
+ *
+ * Note that once all threads have reached the barrier, we also
+ * reset the barrier to state where it is ready to be re-used
+ * without calling swift_barrier_init.
+ *
+ * @param barrier The (initialised) #swift_barrier_t to wait at.
+ */
+static INLINE int swift_barrier_wait(swift_barrier_t *barrier) {
+
+  /* Start by locking the barrier */
+  pthread_mutex_lock(&barrier->mutex);
+
+  /* One more thread has reached the barrier */
+  barrier->count++;
+
+  /* Are threads still running? */
+  if (barrier->count < barrier->limit) {
+
+    /* Wait until everyone is back. NOTE(review): POSIX allows pthread_cond_wait
       to wake spuriously; a predicate loop may be needed here -- confirm. */
+    pthread_cond_wait(&barrier->condition, &(barrier->mutex));
+
+    /* Release the mutex */
+    pthread_mutex_unlock(&barrier->mutex);
+
+    /* Say that this was not the last thread */
+    return 0;
+
+  } else { /* Everybody is home */
+
+    /* Open the barrier (i.e. release the threads blocked on the condition) */
+    pthread_cond_broadcast(&barrier->condition);
+
+    /* Re-initialize the barrier */
+    barrier->count = 0;
+
+    /* Release the mutex */
+    pthread_mutex_unlock(&barrier->mutex);
+
+    /* Say that we are all done */
+    return 1;
+  }
+}
+
+/**
+ * @brief Destroy a barrier object
+ *
+ * Note that we do not wait for the barrier to open before destroying it;
+ * per POSIX, destroying a condition variable or mutex that threads are
+ * still blocked on leads to undefined behaviour.
+ *
+ * @param barrier The #swift_barrier_t object to destroy.
+ */
+static INLINE int swift_barrier_destroy(swift_barrier_t *barrier) {
+
+  /* Destroy the pthread things */
+  pthread_cond_destroy(&barrier->condition);
+  pthread_mutex_destroy(&barrier->mutex);
+
+  /* All is good */
+  return 0;
+}
+
+#endif /* HAVE_PTHREAD_BARRIERS */
+
+#endif /* SWIFT_BARRIER_H */
diff --git a/src/cache.h b/src/cache.h
index 3eb1e194dd4232319ac1d4a4323ca8099f044063..c939da28589c1421e0e4241dca124a8320d5c87b 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -349,10 +349,12 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
     y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
     z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
     h[i] = parts_i[idx].h;
-    m[i] = parts_i[idx].mass;
     vx[i] = parts_i[idx].v[0];
     vy[i] = parts_i[idx].v[1];
     vz[i] = parts_i[idx].v[2];
+#ifdef GADGET2_SPH
+    m[i] = parts_i[idx].mass;
+#endif
   }
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -431,10 +433,12 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
     yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
     zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
     hj[i] = parts_j[idx].h;
-    mj[i] = parts_j[idx].mass;
     vxj[i] = parts_j[idx].v[0];
     vyj[i] = parts_j[idx].v[1];
     vzj[i] = parts_j[idx].v[2];
+#ifdef GADGET2_SPH
+    mj[i] = parts_j[idx].mass;
+#endif
   }
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -572,15 +576,17 @@ cache_read_two_partial_cells_sorted_force(
     y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
     z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
     h[i] = parts_i[idx].h;
-    m[i] = parts_i[idx].mass;
     vx[i] = parts_i[idx].v[0];
     vy[i] = parts_i[idx].v[1];
     vz[i] = parts_i[idx].v[2];
+#ifdef GADGET2_SPH
+    m[i] = parts_i[idx].mass;
     rho[i] = parts_i[idx].rho;
     grad_h[i] = parts_i[idx].force.f;
     pOrho2[i] = parts_i[idx].force.P_over_rho2;
     balsara[i] = parts_i[idx].force.balsara;
     soundspeed[i] = parts_i[idx].force.soundspeed;
+#endif
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
@@ -635,15 +641,17 @@ cache_read_two_partial_cells_sorted_force(
     yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
     zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
     hj[i] = parts_j[idx].h;
-    mj[i] = parts_j[idx].mass;
     vxj[i] = parts_j[idx].v[0];
     vyj[i] = parts_j[idx].v[1];
     vzj[i] = parts_j[idx].v[2];
+#ifdef GADGET2_SPH
+    mj[i] = parts_j[idx].mass;
     rhoj[i] = parts_j[idx].rho;
     grad_hj[i] = parts_j[idx].force.f;
     pOrho2j[i] = parts_j[idx].force.P_over_rho2;
     balsaraj[i] = parts_j[idx].force.balsara;
     soundspeedj[i] = parts_j[idx].force.soundspeed;
+#endif
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
diff --git a/src/cell.c b/src/cell.c
index 8d6c6b35c68b4af27ed7c836b1618e9d7857d9dd..9c5d1bd3aae8a0982db79c8fd370354c5c840dac 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -50,6 +50,7 @@
 #include "active.h"
 #include "atomic.h"
 #include "drift.h"
+#include "engine.h"
 #include "error.h"
 #include "gravity.h"
 #include "hydro.h"
@@ -1862,6 +1863,7 @@ void cell_activate_subcell_external_grav_tasks(struct cell *ci,
 int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
 
   struct engine *e = s->space->e;
+  const int nodeID = e->nodeID;
   int rebuild = 0;
 
   /* Un-skip the density tasks involved with this cell. */
@@ -1873,13 +1875,13 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
     const int cj_active = (cj != NULL) ? cell_is_active(cj, e) : 0;
 
     /* Only activate tasks that involve a local active cell. */
-    if ((ci_active && ci->nodeID == engine_rank) ||
-        (cj_active && cj->nodeID == engine_rank)) {
+    if ((ci_active && ci->nodeID == nodeID) ||
+        (cj_active && cj->nodeID == nodeID)) {
       scheduler_activate(s, t);
 
       /* Activate hydro drift */
       if (t->type == task_type_self) {
-        if (ci->nodeID == engine_rank) cell_activate_drift_part(ci, s);
+        if (ci->nodeID == nodeID) cell_activate_drift_part(ci, s);
       }
 
       /* Set the correct sorting flags and activate hydro drifts */
@@ -1891,8 +1893,8 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
         cj->dx_max_sort_old = cj->dx_max_sort;
 
         /* Activate the drift tasks. */
-        if (ci->nodeID == engine_rank) cell_activate_drift_part(ci, s);
-        if (cj->nodeID == engine_rank) cell_activate_drift_part(cj, s);
+        if (ci->nodeID == nodeID) cell_activate_drift_part(ci, s);
+        if (cj->nodeID == nodeID) cell_activate_drift_part(cj, s);
 
         /* Check the sorts and activate them if needed. */
         cell_activate_sorts(ci, t->flags, s);
@@ -1913,7 +1915,7 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
 
 #ifdef WITH_MPI
       /* Activate the send/recv tasks. */
-      if (ci->nodeID != engine_rank) {
+      if (ci->nodeID != nodeID) {
 
         /* If the local cell is active, receive data from the foreign cell. */
         if (cj_active) {
@@ -1951,7 +1953,7 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
         /* If the local cell is active, send its ti_end values. */
         if (cj_active) scheduler_activate_send(s, cj->send_ti, ci->nodeID);
 
-      } else if (cj->nodeID != engine_rank) {
+      } else if (cj->nodeID != nodeID) {
 
         /* If the local cell is active, receive data from the foreign cell. */
         if (ci_active) {
@@ -2001,8 +2003,8 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
     struct cell *cj = t->cj;
 
     /* Only activate tasks that involve a local active cell. */
-    if ((cell_is_active(ci, e) && ci->nodeID == engine_rank) ||
-        (cj != NULL && cell_is_active(cj, e) && cj->nodeID == engine_rank)) {
+    if ((cell_is_active(ci, e) && ci->nodeID == nodeID) ||
+        (cj != NULL && cell_is_active(cj, e) && cj->nodeID == nodeID)) {
       scheduler_activate(s, t);
 
       /* Set the drifting flags */
@@ -2018,7 +2020,7 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
   }
 
   /* Unskip all the other task types. */
-  if (c->nodeID == engine_rank && cell_is_active(c, e)) {
+  if (c->nodeID == nodeID && cell_is_active(c, e)) {
 
     for (struct link *l = c->gradient; l != NULL; l = l->next)
       scheduler_activate(s, l->t);
diff --git a/src/common_io.c b/src/common_io.c
index 1b0770b1132c8c2a62bbe0bcf3426f8f6eef4a17..47e04b868b9d610983b977bd6455c1c78d70bb41 100644
--- a/src/common_io.c
+++ b/src/common_io.c
@@ -40,10 +40,13 @@
 #include "common_io.h"
 
 /* Local includes. */
+#include "engine.h"
 #include "error.h"
 #include "hydro.h"
+#include "io_properties.h"
 #include "kernel_hydro.h"
 #include "part.h"
+#include "threadpool.h"
 #include "units.h"
 #include "version.h"
 
@@ -266,7 +269,6 @@ void io_write_attribute_f(hid_t grp, const char* name, float data) {
  * @param name The name of the attribute
  * @param data The value to write
  */
-
 void io_write_attribute_i(hid_t grp, const char* name, int data) {
   io_write_attribute(grp, name, INT, &data, 1);
 }
@@ -295,16 +297,19 @@ void io_write_attribute_s(hid_t grp, const char* name, const char* str) {
  * @brief Reads the Unit System from an IC file.
  * @param h_file The (opened) HDF5 file from which to read.
  * @param us The unit_system to fill.
+ * @param mpi_rank The MPI rank we are on.
  *
  * If the 'Units' group does not exist in the ICs, cgs units will be assumed
  */
-void io_read_unit_system(hid_t h_file, struct unit_system* us) {
+void io_read_unit_system(hid_t h_file, struct unit_system* us, int mpi_rank) {
 
   /* First check if it exists as this is *not* required. */
   const htri_t exists = H5Lexists(h_file, "/Units", H5P_DEFAULT);
 
   if (exists == 0) {
-    message("'Units' group not found in ICs. Assuming CGS unit system.");
+
+    if (mpi_rank == 0)
+      message("'Units' group not found in ICs. Assuming CGS unit system.");
 
     /* Default to CGS */
     us->UnitMass_in_cgs = 1.;
@@ -319,7 +324,7 @@ void io_read_unit_system(hid_t h_file, struct unit_system* us) {
           exists);
   }
 
-  message("Reading IC units from ICs.");
+  if (mpi_rank == 0) message("Reading IC units from ICs.");
   hid_t h_grp = H5Gopen(h_file, "/Units", H5P_DEFAULT);
 
   /* Ok, Read the damn thing */
@@ -386,6 +391,7 @@ void io_write_code_description(hid_t h_file) {
                        configuration_options());
   io_write_attribute_s(h_grpcode, "CFLAGS", compilation_cflags());
   io_write_attribute_s(h_grpcode, "HDF5 library version", hdf5_version());
+  io_write_attribute_s(h_grpcode, "Thread barriers", thread_barrier_version());
 #ifdef HAVE_FFTW
   io_write_attribute_s(h_grpcode, "FFTW library version", fftw3_version());
 #endif
@@ -400,6 +406,208 @@ void io_write_code_description(hid_t h_file) {
   H5Gclose(h_grpcode);
 }
 
+/**
+ * @brief Mapper function to copy #part or #gpart fields into a buffer.
+ */
+void io_copy_mapper(void* restrict temp, int N, void* restrict extra_data) {
+
+  const struct io_props props = *((const struct io_props*)extra_data);
+  const size_t typeSize = io_sizeof_type(props.type);
+  const size_t copySize = typeSize * props.dimension;
+
+  /* How far are we with this chunk? */
+  char* restrict temp_c = temp;
+  const ptrdiff_t delta = (temp_c - props.start_temp_c) / copySize;
+
+  for (int k = 0; k < N; k++) {
+    memcpy(&temp_c[k * copySize], props.field + (delta + k) * props.partSize,
+           copySize);
+  }
+}
+
+/**
+ * @brief Mapper function to copy #part into a buffer of floats using a
+ * conversion function.
+ */
+void io_convert_part_f_mapper(void* restrict temp, int N,
+                              void* restrict extra_data) {
+
+  const struct io_props props = *((const struct io_props*)extra_data);
+  const struct part* restrict parts = props.parts;
+  const struct engine* e = props.e;
+  const size_t dim = props.dimension;
+
+  /* How far are we with this chunk? */
+  float* restrict temp_f = temp;
+  const ptrdiff_t delta = (temp_f - props.start_temp_f) / dim;
+
+  for (int i = 0; i < N; i++)
+    props.convert_part_f(e, parts + delta + i, &temp_f[i * dim]);
+}
+
+/**
+ * @brief Mapper function to copy #part into a buffer of doubles using a
+ * conversion function.
+ */
+void io_convert_part_d_mapper(void* restrict temp, int N,
+                              void* restrict extra_data) {
+
+  const struct io_props props = *((const struct io_props*)extra_data);
+  const struct part* restrict parts = props.parts;
+  const struct engine* e = props.e;
+  const size_t dim = props.dimension;
+
+  /* How far are we with this chunk? */
+  double* restrict temp_d = temp;
+  const ptrdiff_t delta = (temp_d - props.start_temp_d) / dim;
+
+  for (int i = 0; i < N; i++)
+    props.convert_part_d(e, parts + delta + i, &temp_d[i * dim]);
+}
+
+/**
+ * @brief Mapper function to copy #gpart into a buffer of floats using a
+ * conversion function.
+ */
+void io_convert_gpart_f_mapper(void* restrict temp, int N,
+                               void* restrict extra_data) {
+
+  const struct io_props props = *((const struct io_props*)extra_data);
+  const struct gpart* restrict gparts = props.gparts;
+  const struct engine* e = props.e;
+  const size_t dim = props.dimension;
+
+  /* How far are we with this chunk? */
+  float* restrict temp_f = temp;
+  const ptrdiff_t delta = (temp_f - props.start_temp_f) / dim;
+
+  for (int i = 0; i < N; i++)
+    props.convert_gpart_f(e, gparts + delta + i, &temp_f[i * dim]);
+}
+
+/**
+ * @brief Mapper function to copy #gpart into a buffer of doubles using a
+ * conversion function.
+ */
+void io_convert_gpart_d_mapper(void* restrict temp, int N,
+                               void* restrict extra_data) {
+
+  const struct io_props props = *((const struct io_props*)extra_data);
+  const struct gpart* restrict gparts = props.gparts;
+  const struct engine* e = props.e;
+  const size_t dim = props.dimension;
+
+  /* How far are we with this chunk? */
+  double* restrict temp_d = temp;
+  const ptrdiff_t delta = (temp_d - props.start_temp_d) / dim;
+
+  for (int i = 0; i < N; i++)
+    props.convert_gpart_d(e, gparts + delta + i, &temp_d[i * dim]);
+}
+
+/**
+ * @brief Copy the particle data into a temporary buffer ready for i/o.
+ *
+ * @param temp The buffer to be filled. Must be allocated and aligned properly.
+ * @param e The #engine.
+ * @param props The #io_props corresponding to the particle field we are
+ * copying.
+ * @param N The number of particles to copy
+ * @param internal_units The system of units used internally.
+ * @param snapshot_units The system of units used for the snapshots.
+ */
+void io_copy_temp_buffer(void* temp, const struct engine* e,
+                         struct io_props props, size_t N,
+                         const struct unit_system* internal_units,
+                         const struct unit_system* snapshot_units) {
+
+  const size_t typeSize = io_sizeof_type(props.type);
+  const size_t copySize = typeSize * props.dimension;
+  const size_t num_elements = N * props.dimension;
+
+  /* Copy particle data to temporary buffer */
+  if (props.conversion == 0) { /* No conversion */
+
+    /* Prepare some parameters */
+    char* temp_c = temp;
+    props.start_temp_c = temp_c;
+
+    /* Copy the whole thing into a buffer */
+    threadpool_map((struct threadpool*)&e->threadpool, io_copy_mapper, temp_c,
+                   N, copySize, 0, (void*)&props);
+
+  } else { /* Converting particle to data */
+
+    if (props.convert_part_f != NULL) {
+
+      /* Prepare some parameters */
+      float* temp_f = temp;
+      props.start_temp_f = temp;
+      props.e = e;
+
+      /* Copy the whole thing into a buffer */
+      threadpool_map((struct threadpool*)&e->threadpool,
+                     io_convert_part_f_mapper, temp_f, N, copySize, 0,
+                     (void*)&props);
+
+    } else if (props.convert_part_d != NULL) {
+
+      /* Prepare some parameters */
+      double* temp_d = temp;
+      props.start_temp_d = temp;
+      props.e = e;
+
+      /* Copy the whole thing into a buffer */
+      threadpool_map((struct threadpool*)&e->threadpool,
+                     io_convert_part_d_mapper, temp_d, N, copySize, 0,
+                     (void*)&props);
+
+    } else if (props.convert_gpart_f != NULL) {
+
+      /* Prepare some parameters */
+      float* temp_f = temp;
+      props.start_temp_f = temp;
+      props.e = e;
+
+      /* Copy the whole thing into a buffer */
+      threadpool_map((struct threadpool*)&e->threadpool,
+                     io_convert_gpart_f_mapper, temp_f, N, copySize, 0,
+                     (void*)&props);
+
+    } else if (props.convert_gpart_d != NULL) {
+
+      /* Prepare some parameters */
+      double* temp_d = temp;
+      props.start_temp_d = temp;
+      props.e = e;
+
+      /* Copy the whole thing into a buffer */
+      threadpool_map((struct threadpool*)&e->threadpool,
+                     io_convert_gpart_d_mapper, temp_d, N, copySize, 0,
+                     (void*)&props);
+
+    } else {
+      error("Missing conversion function");
+    }
+  }
+
+  /* Unit conversion if necessary */
+  const double factor =
+      units_conversion_factor(internal_units, snapshot_units, props.units);
+  if (factor != 1.) {
+
+    /* message("Converting ! factor=%e", factor); */
+
+    if (io_is_double_precision(props.type)) {
+      swift_declare_aligned_ptr(double, temp_d, temp, IO_BUFFER_ALIGNMENT);
+      for (size_t i = 0; i < num_elements; ++i) temp_d[i] *= factor;
+    } else {
+      swift_declare_aligned_ptr(float, temp_f, temp, IO_BUFFER_ALIGNMENT);
+      for (size_t i = 0; i < num_elements; ++i) temp_f[i] *= factor;
+    }
+  }
+}
+
 #endif /* HAVE_HDF5 */
 
 /* ------------------------------------------------------------------------------------------------
diff --git a/src/common_io.h b/src/common_io.h
index 577d1664db20196c8d0c48ef5b5e4e960b0e195e..2e6d4247a5e2d0f558f6bfa601fd6c57d80e05d9 100644
--- a/src/common_io.h
+++ b/src/common_io.h
@@ -30,6 +30,11 @@
 #define FIELD_BUFFER_SIZE 200
 #define PARTICLE_GROUP_BUFFER_SIZE 50
 #define FILENAME_BUFFER_SIZE 150
+#define IO_BUFFER_ALIGNMENT 1024
+
+/* Avoid cyclic inclusion problems */
+struct io_props;
+struct engine;
 
 #if defined(HAVE_HDF5)
 
@@ -68,10 +73,15 @@ void io_write_attribute_s(hid_t grp, const char* name, const char* str);
 
 void io_write_code_description(hid_t h_file);
 
-void io_read_unit_system(hid_t h_file, struct unit_system* us);
+void io_read_unit_system(hid_t h_file, struct unit_system* us, int mpi_rank);
 void io_write_unit_system(hid_t h_grp, const struct unit_system* us,
                           const char* groupName);
 
+void io_copy_temp_buffer(void* temp, const struct engine* e,
+                         const struct io_props props, size_t N,
+                         const struct unit_system* internal_units,
+                         const struct unit_system* snapshot_units);
+
 #endif /* defined HDF5 */
 
 void io_collect_dm_gparts(const struct gpart* const gparts, size_t Ntot,
diff --git a/src/const.h b/src/const.h
index c8060a2be51468a791e192a65a74f1a4d9bc8e30..dd7e1e267246c0f73e760191a071c6e24b96cfe8 100644
--- a/src/const.h
+++ b/src/const.h
@@ -36,9 +36,6 @@
 /* Time integration constants. */
 #define const_max_u_change 0.1f
 
-/* Thermal energy per unit mass used as a constant for the isothermal EoS */
-#define const_isothermal_internal_energy 20.2678457288f
-
 /* Type of gradients to use (GIZMO_SPH only) */
 /* If no option is chosen, no gradients are used (first order scheme) */
 //#define GRADIENTS_SPH
diff --git a/src/debug.c b/src/debug.c
index 17048a16d0f6415eb66e6427a11611a5bef6c649..402234a27ef50bbc38c06f61dea96e48ff02c59a 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -274,10 +274,13 @@ int checkCellhdxmax(const struct cell *c, int *depth) {
  * only.
  */
 static void dumpCells_map(struct cell *c, void *data) {
-  uintptr_t *ldata = (uintptr_t *)data;
+  size_t *ldata = (size_t *)data;
   FILE *file = (FILE *)ldata[0];
   struct engine *e = (struct engine *)ldata[1];
   float ntasks = c->nr_tasks;
+  int active = (int)ldata[2];
+  int mpiactive = (int)ldata[3];
+  int pactive = (int)ldata[4];
 
 #if SWIFT_DEBUG_CHECKS
   /* The c->nr_tasks field does not include all the tasks. So let's check this
@@ -298,46 +301,94 @@ static void dumpCells_map(struct cell *c, void *data) {
   }
 #endif
 
-  /* Only locally active cells are dumped. */
-  if (c->count > 0 || c->gcount > 0 || c->scount > 0)
-    fprintf(file,
-            "  %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d "
-            "%6.1f %20lld %6d %6d %6d %6d\n",
-            c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1],
-            c->width[2], c->count, c->gcount, c->scount, c->depth, ntasks,
-            c->ti_end_min, get_time_bin(c->ti_end_min), (c->super == c),
-            cell_is_active(c, e), c->nodeID);
+  /* Only cells with particles are dumped. */
+  if (c->count > 0 || c->gcount > 0 || c->scount > 0) {
+
+/* In MPI mode we may restrict output to cells with foreign partners.
+ * These define the edges of the partitions. */
+#if WITH_MPI
+    if (mpiactive)
+      mpiactive = (c->send_xv != NULL);
+    else
+      mpiactive = 1;
+#else
+    mpiactive = 1;
+#endif
+
+    /* Active cells, otherwise all. */
+    if (active)
+      active = cell_is_active(c, e);
+    else
+      active = 1;
+
+    /* So output local super cells that are active and have MPI tasks as
+     * requested. */
+    if (c->nodeID == e->nodeID && (c->super == c) && active && mpiactive) {
+
+      /* If requested we work out how many particles are active in this cell. */
+      int pactcount = 0;
+      if (pactive) {
+        const struct part *parts = c->parts;
+        for (int k = 0; k < c->count; k++)
+          if (part_is_active(&parts[k], e)) pactcount++;
+        struct gpart *gparts = c->gparts;
+        for (int k = 0; k < c->gcount; k++)
+          if (gpart_is_active(&gparts[k], e)) pactcount++;
+        struct spart *sparts = c->sparts;
+        for (int k = 0; k < c->scount; k++)
+          if (spart_is_active(&sparts[k], e)) pactcount++;
+      }
+
+      fprintf(file,
+              "  %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d %6d %6d "
+              "%6.1f %20lld %6d %6d %6d %6d %6d\n",
+              c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1],
+              c->width[2], e->step, c->count, c->gcount, c->scount, pactcount,
+              c->depth, ntasks, c->ti_end_min, get_time_bin(c->ti_end_min),
+              (c->super == c), cell_is_active(c, e), c->nodeID,
+              c->nodeID == e->nodeID);
+    }
+  }
 }
 
 /**
  * @brief Dump the location, depth, task counts and timebins and active state,
- * for all cells to a simple text file.
+ * for all cells to a simple text file. A more costly count of the active
+ * particles in a cell can also be output.
  *
- * @param prefix base output filename
+ * @param prefix base output filename, result is written to
+ *               %prefix%_%rank%_%step%.dat
+ * @param active just output active cells.
+ * @param mpiactive just output MPI active cells, i.e. those with foreign cells.
+ * @param pactive also output a count of active particles.
  * @param s the space holding the cells to dump.
+ * @param rank node ID of MPI rank, or 0 if not relevant.
+ * @param step the current engine step, or some unique integer.
  */
-void dumpCells(const char *prefix, struct space *s) {
+void dumpCells(const char *prefix, int active, int mpiactive, int pactive,
+               struct space *s, int rank, int step) {
 
   FILE *file = NULL;
 
   /* Name of output file. */
-  static int nseq = 0;
   char fname[200];
-  int uniq = atomic_inc(&nseq);
-  sprintf(fname, "%s_%03d.dat", prefix, uniq);
-
+  sprintf(fname, "%s_%03d_%03d.dat", prefix, rank, step);
   file = fopen(fname, "w");
 
   /* Header. */
   fprintf(file,
-          "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s "
-          "%20s %6s %6s %6s\n",
-          "x", "y", "z", "xw", "yw", "zw", "count", "gcount", "scount", "depth",
-          "tasks", "ti_end_min", "timebin", "issuper", "active", "rank");
+          "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s "
+          "%20s %6s %6s %6s %6s %6s\n",
+          "x", "y", "z", "xw", "yw", "zw", "step", "count", "gcount", "scount",
+          "actcount", "depth", "tasks", "ti_end_min", "timebin", "issuper",
+          "active", "rank", "local");
 
-  uintptr_t data[2];
+  size_t data[5];
   data[0] = (size_t)file;
   data[1] = (size_t)s->e;
+  data[2] = (size_t)active;
+  data[3] = (size_t)mpiactive;
+  data[4] = (size_t)pactive;
   space_map_cells_pre(s, 1, dumpCells_map, &data);
   fclose(file);
 }
diff --git a/src/debug.h b/src/debug.h
index 20ae8fc2c20fa458b7be82f75b2fbb3be9acd32f..2cff37fa6f6f03185dc769bb770fe3a98a424ce1 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -36,7 +36,8 @@ void printgParticle_single(struct gpart *gp);
 
 int checkSpacehmax(struct space *s);
 int checkCellhdxmax(const struct cell *c, int *depth);
-void dumpCells(const char *prefix, struct space *s);
+void dumpCells(const char *prefix, int active, int mpiactive, int pactive,
+               struct space *s, int rank, int step);
 
 #ifdef HAVE_METIS
 #include "metis.h"
diff --git a/src/dump.c b/src/dump.c
index 2c0cf221ebd897bab0d047c196ce8a2aeddc6eae..de5c5afb18bd2c3cb80fe983b0f88ee60350fd5e 100644
--- a/src/dump.c
+++ b/src/dump.c
@@ -20,6 +20,15 @@
 /* Config parameters. */
 #include "../config.h"
 
+#ifdef HAVE_POSIX_FALLOCATE
+
+/* This object's header. */
+#include "dump.h"
+
+/* Local headers. */
+#include "atomic.h"
+#include "error.h"
+
 /* Some standard headers. */
 #include <errno.h>
 #include <fcntl.h>
@@ -29,13 +38,6 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-/* This object's header. */
-#include "dump.h"
-
-/* Local headers. */
-#include "atomic.h"
-#include "error.h"
-
 /**
  * @brief Obtain a chunk of memory from a dump.
  *
@@ -44,7 +46,6 @@
  * @param offset The offset of the returned memory address within the dump file.
  * @return A pointer to the memory-mapped chunk of data.
  */
-
 void *dump_get(struct dump *d, size_t count, size_t *offset) {
   size_t local_offset = atomic_add(&d->count, count);
   *offset = local_offset + d->file_offset;
@@ -54,7 +55,6 @@ void *dump_get(struct dump *d, size_t count, size_t *offset) {
 /**
  * @brief Ensure that at least size bytes are available in the #dump.
  */
-
 void dump_ensure(struct dump *d, size_t size) {
 
   /* If we have enough space already, just bail. */
@@ -88,7 +88,6 @@ void dump_ensure(struct dump *d, size_t size) {
 /**
  * @brief Flush the #dump to disk.
  */
-
 void dump_sync(struct dump *d) {
   if (msync(d->data, d->count, MS_SYNC) != 0)
     error("Failed to sync memory-mapped data.");
@@ -97,7 +96,6 @@ void dump_sync(struct dump *d) {
 /**
  * @brief Finalize the #dump.
  */
-
 void dump_close(struct dump *d) {
   /* Unmap the data in memory. */
   if (munmap(d->data, d->count) != 0) {
@@ -121,7 +119,6 @@ void dump_close(struct dump *d) {
  *                 note that it will be overwritten.
  * @param size The initial buffer size for this #dump.
  */
-
 void dump_init(struct dump *d, const char *filename, size_t size) {
 
   /* Create the output file. */
@@ -151,3 +148,5 @@ void dump_init(struct dump *d, const char *filename, size_t size) {
   d->file_offset = 0;
   d->page_mask = page_mask;
 }
+
+#endif
diff --git a/src/dump.h b/src/dump.h
index a7e934218c271d2f82b99d39f278e5af3047be6e..6857aa3a008a27e0e8ed23854d84f848ee0ca2be 100644
--- a/src/dump.h
+++ b/src/dump.h
@@ -19,8 +19,13 @@
 #ifndef SWIFT_DUMP_H
 #define SWIFT_DUMP_H
 
-/* Includes. */
-#include "lock.h"
+/* Config parameters. */
+#include "../config.h"
+
+#ifdef HAVE_POSIX_FALLOCATE /* Are we on a sensible platform? */
+
+/* Standard headers */
+#include <stdlib.h>
 
 /* Some constants. */
 #define dump_grow_ensure_factor 10
@@ -54,4 +59,6 @@ void dump_sync(struct dump *d);
 void dump_close(struct dump *d);
 void *dump_get(struct dump *d, size_t count, size_t *offset);
 
+#endif /* HAVE_POSIX_FALLOCATE */
+
 #endif /* SWIFT_DUMP_H */
diff --git a/src/engine.c b/src/engine.c
index 3f02b85f2b881522c8ff887a58b9813a7dd4d27b..88a86a74e9812f9e8eecbee286d7031b98707270 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -880,6 +880,9 @@ void engine_redistribute(struct engine *e) {
             nodeID, nr_parts, nr_sparts, nr_gparts, my_cells);
   }
 
+  /* Flag that a redistribute has taken place */
+  e->step_props |= engine_step_prop_redistribute;
+
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
@@ -945,6 +948,9 @@ void engine_repartition(struct engine *e) {
   /* Tell the engine it should re-build whenever possible */
   e->forcerebuild = 1;
 
+  /* Flag that a repartition has taken place */
+  e->step_props |= engine_step_prop_repartition;
+
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
@@ -2273,9 +2279,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
     /* Self-interaction? */
     else if (t->type == task_type_self && t->subtype == task_subtype_density) {
 
-      /* Make all density tasks depend on the drift and the sorts. */
+      /* Make the self-density tasks depend on the drift only. */
       scheduler_addunlock(sched, t->ci->super->drift_part, t);
-      scheduler_addunlock(sched, t->ci->super->sorts, t);
 
 #ifdef EXTRA_HYDRO_LOOP
       /* Start by constructing the task for the second  and third hydro loop. */
@@ -3098,6 +3103,9 @@ void engine_rebuild(struct engine *e, int clean_h_values) {
   /* Print the status of the system */
   // if (e->verbose) engine_print_task_counts(e);
 
+  /* Flag that a rebuild has taken place */
+  e->step_props |= engine_step_prop_rebuild;
+
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
@@ -3153,10 +3161,10 @@ void engine_prepare(struct engine *e) {
 void engine_barrier(struct engine *e) {
 
   /* Wait at the wait barrier. */
-  pthread_barrier_wait(&e->wait_barrier);
+  swift_barrier_wait(&e->wait_barrier);
 
   /* Wait at the run barrier. */
-  pthread_barrier_wait(&e->run_barrier);
+  swift_barrier_wait(&e->run_barrier);
 }
 
 /**
@@ -3472,7 +3480,7 @@ void engine_launch(struct engine *e) {
   atomic_inc(&e->sched.waiting);
 
   /* Cry havoc and let loose the dogs of war. */
-  pthread_barrier_wait(&e->run_barrier);
+  swift_barrier_wait(&e->run_barrier);
 
   /* Load the tasks. */
   scheduler_start(&e->sched);
@@ -3484,7 +3492,7 @@ void engine_launch(struct engine *e) {
   pthread_mutex_unlock(&e->sched.sleep_mutex);
 
   /* Sit back and wait for the runners to come home. */
-  pthread_barrier_wait(&e->wait_barrier);
+  swift_barrier_wait(&e->wait_barrier);
 
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
@@ -3527,10 +3535,9 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
   /* Print the number of active tasks ? */
   if (e->verbose) engine_print_task_counts(e);
 
-  /* Init the particle hydro data (by hand). */
-  for (size_t k = 0; k < s->nr_parts; k++)
-    hydro_init_part(&s->parts[k], &e->s->hs);
-  for (size_t k = 0; k < s->nr_gparts; k++) gravity_init_gpart(&s->gparts[k]);
+  /* Init the particle data (by hand). */
+  space_init_parts(s, e->verbose);
+  space_init_gparts(s, e->verbose);
 
   /* Now, launch the calculation */
   TIMER_TIC;
@@ -3542,9 +3549,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
 
     if (e->nodeID == 0) message("Converting internal energy variable.");
 
-    /* Apply the conversion */
-    for (size_t i = 0; i < s->nr_parts; ++i)
-      hydro_convert_quantities(&s->parts[i], &s->xparts[i]);
+    space_convert_quantities(e->s, e->verbose);
 
     /* Correct what we did (e.g. in PE-SPH, need to recompute rho_bar) */
     if (hydro_need_extra_init_loop) {
@@ -3577,10 +3582,9 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
   /* No drift this time */
   engine_skip_drift(e);
 
-  /* Init the particle hydro data (by hand). */
-  for (size_t k = 0; k < s->nr_parts; k++)
-    hydro_init_part(&s->parts[k], &e->s->hs);
-  for (size_t k = 0; k < s->nr_gparts; k++) gravity_init_gpart(&s->gparts[k]);
+  /* Init the particle data (by hand). */
+  space_init_parts(e->s, e->verbose);
+  space_init_gparts(e->s, e->verbose);
 
   /* Print the number of active tasks ? */
   if (e->verbose) engine_print_task_counts(e);
@@ -3591,6 +3595,8 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
     gravity_exact_force_compute(e->s, e);
 #endif
 
+  if (e->nodeID == 0) scheduler_write_dependencies(&e->sched, e->verbose);
+
   /* Run the 0th time-step */
   engine_launch(e);
 
@@ -3701,14 +3707,14 @@ void engine_step(struct engine *e) {
   if (e->nodeID == 0) {
 
     /* Print some information to the screen */
-    printf("  %6d %14e %14e %10zu %10zu %10zu %21.3f\n", e->step, e->time,
+    printf("  %6d %14e %14e %12zu %12zu %12zu %21.3f %6d\n", e->step, e->time,
            e->timeStep, e->updates, e->g_updates, e->s_updates,
-           e->wallclock_time);
+           e->wallclock_time, e->step_props);
     fflush(stdout);
 
-    fprintf(e->file_timesteps, "  %6d %14e %14e %10zu %10zu %10zu %21.3f\n",
+    fprintf(e->file_timesteps, "  %6d %14e %14e %12zu %12zu %12zu %21.3f %6d\n",
             e->step, e->time, e->timeStep, e->updates, e->g_updates,
-            e->s_updates, e->wallclock_time);
+            e->s_updates, e->wallclock_time, e->step_props);
     fflush(e->file_timesteps);
   }
 
@@ -3720,6 +3726,7 @@ void engine_step(struct engine *e) {
   e->time = e->ti_current * e->timeBase + e->timeBegin;
   e->timeOld = e->ti_old * e->timeBase + e->timeBegin;
   e->timeStep = (e->ti_current - e->ti_old) * e->timeBase;
+  e->step_props = engine_step_prop_none;
 
   /* Prepare the tasks to be launched, rebuild or repartition if needed. */
   engine_prepare(e);
@@ -3744,6 +3751,9 @@ void engine_step(struct engine *e) {
   /* Print the number of active tasks ? */
   if (e->verbose) engine_print_task_counts(e);
 
+/* Dump local cells and active particle counts. */
+/* dumpCells("cells", 0, 0, 1, e->s, e->nodeID, e->step); */
+
 #ifdef SWIFT_DEBUG_CHECKS
   /* Check that we have the correct total mass in the top-level multipoles */
   size_t num_gpart_mpole = 0;
@@ -3809,6 +3819,9 @@ void engine_step(struct engine *e) {
 
     /* ... and find the next output time */
     engine_compute_next_snapshot_time(e);
+
+    /* Flag that we dumped a snapshot */
+    e->step_props |= engine_step_prop_snapshot;
   }
 
   /* Save some  statistics */
@@ -3819,6 +3832,9 @@ void engine_step(struct engine *e) {
 
     /* and move on */
     e->timeLastStatistics += e->deltaTimeStatistics;
+
+    /* Flag that we dumped some statistics */
+    e->step_props |= engine_step_prop_statistics;
   }
 
   /* Now apply all the collected time step updates and particle counts. */
@@ -4357,6 +4373,7 @@ void engine_init(struct engine *e, struct space *s,
   e->reparttype = reparttype;
   e->dump_snapshot = 0;
   e->save_stats = 0;
+  e->step_props = engine_step_prop_none;
   e->links = NULL;
   e->nr_links = 0;
   e->timeBegin = parser_get_param_double(params, "TimeIntegration:time_begin");
@@ -4517,7 +4534,7 @@ void engine_init(struct engine *e, struct space *s,
   if (with_aff) engine_unpin();
 #endif
 
-  if (with_aff) {
+  if (with_aff && nodeID == 0) {
 #ifdef HAVE_SETAFFINITY
 #ifdef WITH_MPI
     printf("[%04i] %s engine_init: cpu map is [ ", nodeID,
@@ -4580,9 +4597,16 @@ void engine_init(struct engine *e, struct space *s,
             e->hydro_properties->delta_neighbours,
             e->hydro_properties->eta_neighbours);
 
-    fprintf(e->file_timesteps, "# %6s %14s %14s %10s %10s %10s %16s [%s]\n",
+    fprintf(e->file_timesteps,
+            "# Step Properties: Rebuild=%d, Redistribute=%d, Repartition=%d, "
+            "Statistics=%d, Snapshot=%d\n",
+            engine_step_prop_rebuild, engine_step_prop_redistribute,
+            engine_step_prop_repartition, engine_step_prop_statistics,
+            engine_step_prop_snapshot);
+
+    fprintf(e->file_timesteps, "# %6s %14s %14s %12s %12s %12s %16s [%s] %6s\n",
             "Step", "Time", "Time-step", "Updates", "g-Updates", "s-Updates",
-            "Wall-clock time", clocks_getunit());
+            "Wall-clock time", clocks_getunit(), "Props");
     fflush(e->file_timesteps);
   }
 
@@ -4667,8 +4691,8 @@ void engine_init(struct engine *e, struct space *s,
   threadpool_init(&e->threadpool, e->nr_threads);
 
   /* First of all, init the barrier and lock it. */
-  if (pthread_barrier_init(&e->wait_barrier, NULL, e->nr_threads + 1) != 0 ||
-      pthread_barrier_init(&e->run_barrier, NULL, e->nr_threads + 1) != 0)
+  if (swift_barrier_init(&e->wait_barrier, NULL, e->nr_threads + 1) != 0 ||
+      swift_barrier_init(&e->run_barrier, NULL, e->nr_threads + 1) != 0)
     error("Failed to initialize barrier.");
 
   /* Expected average for tasks per cell. If set to zero we use a heuristic
@@ -4681,6 +4705,12 @@ void engine_init(struct engine *e, struct space *s,
   scheduler_init(&e->sched, e->s, engine_estimate_nr_tasks(e), nr_queues,
                  (policy & scheduler_flag_steal), e->nodeID, &e->threadpool);
 
+  /* Maximum size of MPI task messages, in KB, that should not be buffered,
+   * i.e. that are sent using MPI_Issend rather than MPI_Isend. 4MB by default.
+   */
+  e->sched.mpi_message_limit =
+      parser_get_opt_param_int(params, "Scheduler:mpi_message_limit", 4) * 1024;
+
   /* Allocate and init the threads. */
   if ((e->runners = (struct runner *)malloc(sizeof(struct runner) *
                                             e->nr_threads)) == NULL)
@@ -4754,7 +4784,7 @@ void engine_init(struct engine *e, struct space *s,
 #endif
 
   /* Wait for the runner threads to be in place. */
-  pthread_barrier_wait(&e->wait_barrier);
+  swift_barrier_wait(&e->wait_barrier);
 }
 
 /**
diff --git a/src/engine.h b/src/engine.h
index 2b5a4eee7cde974fc555460c6483d02b21bd6c04..f80589ad3fad508f48a6fc3eda676a929b824c6e 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -32,11 +32,8 @@
 #include <mpi.h>
 #endif
 
-/* Some standard headers. */
-#include <pthread.h>
-#include <stdio.h>
-
 /* Includes. */
+#include "barrier.h"
 #include "clocks.h"
 #include "collectgroup.h"
 #include "cooling_struct.h"
@@ -51,7 +48,9 @@
 #include "task.h"
 #include "units.h"
 
-/* Some constants. */
+/**
+ * @brief The different policies the #engine can follow.
+ */
 enum engine_policy {
   engine_policy_none = 0,
   engine_policy_rand = (1 << 0),
@@ -74,7 +73,19 @@ enum engine_policy {
 #define engine_maxpolicy 15
 extern const char *engine_policy_names[];
 
-#define engine_queue_scale 1.2
+/**
+ * @brief The different unusual events that can take place in a time-step.
+ */
+enum engine_step_properties {
+  engine_step_prop_none = 0,
+  engine_step_prop_rebuild = (1 << 0),
+  engine_step_prop_redistribute = (1 << 1),
+  engine_step_prop_repartition = (1 << 2),
+  engine_step_prop_statistics = (1 << 3),
+  engine_step_prop_snapshot = (1 << 4)
+};
+
+/* Some constants */
 #define engine_maxproxies 64
 #define engine_tasksreweight 1
 #define engine_parts_size_grow 1.05
@@ -83,7 +94,9 @@ extern const char *engine_policy_names[];
 #define engine_default_timesteps_file_name "timesteps"
 #define engine_max_parts_per_ghost 1000
 
-/* The rank of the engine as a global variable (for messages). */
+/**
+ * @brief The rank of the engine as a global variable (for messages).
+ */
 extern int engine_rank;
 
 /* Data structure for the engine. */
@@ -143,9 +156,12 @@ struct engine {
   /* Maximal ti_beg for the next time-step */
   integertime_t ti_beg_max;
 
-  /* Number of particles updated */
+  /* Number of particles updated in the previous step */
   size_t updates, g_updates, s_updates;
 
+  /* Properties of the previous step */
+  int step_props;
+
   /* Total numbers of particles in the system. */
   size_t total_nr_parts, total_nr_gparts;
 
@@ -175,8 +191,8 @@ struct engine {
   int count_step;
 
   /* Data for the threads' barrier. */
-  pthread_barrier_t wait_barrier;
-  pthread_barrier_t run_barrier;
+  swift_barrier_t wait_barrier;
+  swift_barrier_t run_barrier;
 
   /* ID of the node this engine lives on. */
   int nr_nodes, nodeID;
diff --git a/src/equation_of_state.c b/src/equation_of_state.c
new file mode 100644
index 0000000000000000000000000000000000000000..a792bc58cf13fa5f710dc1cd80713ff90e700b18
--- /dev/null
+++ b/src/equation_of_state.c
@@ -0,0 +1,67 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2017 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* This object's header. */
+#include "equation_of_state.h"
+
+/* local headers */
+#include "common_io.h"
+
+/* Equation of state for the physics model
+ * (temporary ugly solution as a global variable) */
+struct eos_parameters eos;
+
+void eos_init(struct eos_parameters *e, const struct swift_params *params) {
+
+#if defined(EOS_IDEAL_GAS)
+/* nothing to do here */
+#elif defined(EOS_ISOTHERMAL_GAS)
+  e->isothermal_internal_energy =
+      parser_get_param_float(params, "EoS:isothermal_internal_energy");
+#endif
+}
+
+void eos_print(const struct eos_parameters *e) {
+
+#if defined(EOS_IDEAL_GAS)
+  message("Equation of state: Ideal gas.");
+#elif defined(EOS_ISOTHERMAL_GAS)
+  message(
+      "Equation of state: Isothermal with internal energy "
+      "per unit mass set to %f.",
+      e->isothermal_internal_energy);
+#endif
+
+  message("Adiabatic index gamma: %f.", hydro_gamma);
+}
+
+#if defined(HAVE_HDF5)
+void eos_print_snapshot(hid_t h_grpsph, const struct eos_parameters *e) {
+
+  io_write_attribute_f(h_grpsph, "Adiabatic index", hydro_gamma);
+
+#if defined(EOS_IDEAL_GAS)
+  io_write_attribute_s(h_grpsph, "Equation of state", "Ideal gas");
+#elif defined(EOS_ISOTHERMAL_GAS)
+  io_write_attribute_s(h_grpsph, "Equation of state", "Isothermal gas");
+  io_write_attribute_f(h_grpsph, "Thermal energy per unit mass",
+                       e->isothermal_internal_energy);
+#endif
+}
+#endif
diff --git a/src/equation_of_state.h b/src/equation_of_state.h
index e51ed99519dc9c418e34789fcce95b5f28d69a99..76b8bb922133c9c4f29453a14068fe3f9044d66f 100644
--- a/src/equation_of_state.h
+++ b/src/equation_of_state.h
@@ -37,9 +37,13 @@
 #include "debug.h"
 #include "inline.h"
 
+extern struct eos_parameters eos;
+
 /* ------------------------------------------------------------------------- */
 #if defined(EOS_IDEAL_GAS)
 
+struct eos_parameters {};
+
 /**
  * @brief Returns the internal energy given density and entropy
  *
@@ -172,10 +176,17 @@ __attribute__((always_inline)) INLINE static float gas_soundspeed_from_pressure(
 /* ------------------------------------------------------------------------- */
 #elif defined(EOS_ISOTHERMAL_GAS)
 
+struct eos_parameters {
+
+  /*! Thermal energy per unit mass */
+  float isothermal_internal_energy;
+};
+
 /**
  * @brief Returns the internal energy given density and entropy
  *
- * Since we are using an isothermal EoS, the entropy value is ignored
+ * Since we are using an isothermal EoS, the entropy and density values are
+ * ignored.
  * Computes \f$u = u_{cst}\f$.
  *
  * @param density The density \f$\rho\f$.
@@ -184,12 +195,13 @@ __attribute__((always_inline)) INLINE static float gas_soundspeed_from_pressure(
 __attribute__((always_inline)) INLINE static float
 gas_internal_energy_from_entropy(float density, float entropy) {
 
-  return const_isothermal_internal_energy;
+  return eos.isothermal_internal_energy;
 }
+
 /**
  * @brief Returns the pressure given density and entropy
  *
- * Since we are using an isothermal EoS, the entropy value is ignored
+ * Since we are using an isothermal EoS, the entropy value is ignored.
  * Computes \f$P = (\gamma - 1)u_{cst}\rho\f$.
  *
  * @param density The density \f$\rho\f$.
@@ -198,13 +210,14 @@ gas_internal_energy_from_entropy(float density, float entropy) {
 __attribute__((always_inline)) INLINE static float gas_pressure_from_entropy(
     float density, float entropy) {
 
-  return hydro_gamma_minus_one * const_isothermal_internal_energy * density;
+  return hydro_gamma_minus_one * eos.isothermal_internal_energy * density;
 }
 
 /**
  * @brief Returns the entropy given density and pressure.
  *
- * Computes \f$A = \frac{P}{\rho^\gamma}\f$.
+ * Since we are using an isothermal EoS, the pressure value is ignored.
+ * Computes \f$A = (\gamma - 1)u_{cst}\rho^{-(\gamma-1)}\f$.
  *
  * @param density The density \f$\rho\f$.
  * @param pressure The pressure \f$P\f$ (ignored).
@@ -213,15 +226,16 @@ __attribute__((always_inline)) INLINE static float gas_pressure_from_entropy(
 __attribute__((always_inline)) INLINE static float gas_entropy_from_pressure(
     float density, float pressure) {
 
-  return hydro_gamma_minus_one * const_isothermal_internal_energy *
+  return hydro_gamma_minus_one * eos.isothermal_internal_energy *
          pow_minus_gamma_minus_one(density);
 }
 
 /**
  * @brief Returns the sound speed given density and entropy
  *
- * Since we are using an isothermal EoS, the entropy value is ignored
- * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$.
+ * Since we are using an isothermal EoS, the entropy and density values are
+ * ignored.
+ * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$.
  *
  * @param density The density \f$\rho\f$.
  * @param entropy The entropy \f$S\f$.
@@ -229,14 +243,14 @@ __attribute__((always_inline)) INLINE static float gas_entropy_from_pressure(
 __attribute__((always_inline)) INLINE static float gas_soundspeed_from_entropy(
     float density, float entropy) {
 
-  return sqrtf(const_isothermal_internal_energy * hydro_gamma *
+  return sqrtf(eos.isothermal_internal_energy * hydro_gamma *
                hydro_gamma_minus_one);
 }
 
 /**
  * @brief Returns the entropy given density and internal energy
  *
- * Since we are using an isothermal EoS, the energy value is ignored
+ * Since we are using an isothermal EoS, the energy value is ignored.
  * Computes \f$S = \frac{(\gamma - 1)u_{cst}}{\rho^{\gamma-1}}\f$.
  *
  * @param density The density \f$\rho\f$
@@ -245,14 +259,14 @@ __attribute__((always_inline)) INLINE static float gas_soundspeed_from_entropy(
 __attribute__((always_inline)) INLINE static float
 gas_entropy_from_internal_energy(float density, float u) {
 
-  return hydro_gamma_minus_one * const_isothermal_internal_energy *
+  return hydro_gamma_minus_one * eos.isothermal_internal_energy *
          pow_minus_gamma_minus_one(density);
 }
 
 /**
  * @brief Returns the pressure given density and internal energy
  *
- * Since we are using an isothermal EoS, the energy value is ignored
+ * Since we are using an isothermal EoS, the energy value is ignored.
  * Computes \f$P = (\gamma - 1)u_{cst}\rho\f$.
  *
  * @param density The density \f$\rho\f$
@@ -261,7 +275,7 @@ gas_entropy_from_internal_energy(float density, float u) {
 __attribute__((always_inline)) INLINE static float
 gas_pressure_from_internal_energy(float density, float u) {
 
-  return hydro_gamma_minus_one * const_isothermal_internal_energy * density;
+  return hydro_gamma_minus_one * eos.isothermal_internal_energy * density;
 }
 
 /**
@@ -275,14 +289,15 @@ gas_pressure_from_internal_energy(float density, float u) {
  */
 __attribute__((always_inline)) INLINE static float
 gas_internal_energy_from_pressure(float density, float pressure) {
-  return const_isothermal_internal_energy;
+  return eos.isothermal_internal_energy;
 }
 
 /**
  * @brief Returns the sound speed given density and internal energy
  *
- * Since we are using an isothermal EoS, the energy value is ignored
- * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$.
+ * Since we are using an isothermal EoS, the energy and density values are
+ * ignored.
+ * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$.
  *
  * @param density The density \f$\rho\f$
  * @param u The internal energy \f$u\f$
@@ -290,15 +305,16 @@ gas_internal_energy_from_pressure(float density, float pressure) {
 __attribute__((always_inline)) INLINE static float
 gas_soundspeed_from_internal_energy(float density, float u) {
 
-  return sqrtf(const_isothermal_internal_energy * hydro_gamma *
+  return sqrtf(eos.isothermal_internal_energy * hydro_gamma *
                hydro_gamma_minus_one);
 }
 
 /**
  * @brief Returns the sound speed given density and pressure
  *
- * Since we are using an isothermal EoS, the pressure value is ignored
- * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$.
+ * Since we are using an isothermal EoS, the pressure and density values are
+ * ignored.
+ * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$.
  *
  * @param density The density \f$\rho\f$
  * @param P The pressure \f$P\f$
@@ -306,7 +322,7 @@ gas_soundspeed_from_internal_energy(float density, float u) {
 __attribute__((always_inline)) INLINE static float gas_soundspeed_from_pressure(
     float density, float P) {
 
-  return sqrtf(const_isothermal_internal_energy * hydro_gamma *
+  return sqrtf(eos.isothermal_internal_energy * hydro_gamma *
                hydro_gamma_minus_one);
 }
 
@@ -317,4 +333,11 @@ __attribute__((always_inline)) INLINE static float gas_soundspeed_from_pressure(
 
 #endif
 
+void eos_init(struct eos_parameters *e, const struct swift_params *params);
+void eos_print(const struct eos_parameters *e);
+
+#if defined(HAVE_HDF5)
+void eos_print_snapshot(hid_t h_grpsph, const struct eos_parameters *e);
+#endif
+
 #endif /* SWIFT_EQUATION_OF_STATE_H */
diff --git a/src/gravity/Default/gravity_io.h b/src/gravity/Default/gravity_io.h
index 26eed8a0a680e13130dd257d8b31e6cd00392f6d..671ccc24e95f322d71a734c0a39eface2a93bcec 100644
--- a/src/gravity/Default/gravity_io.h
+++ b/src/gravity/Default/gravity_io.h
@@ -21,6 +21,20 @@
 
 #include "io_properties.h"
 
+void convert_gpart_pos(const struct engine* e, const struct gpart* gp,
+                       double* ret) {
+
+  if (e->s->periodic) {
+    ret[0] = box_wrap(gp->x[0], 0.0, e->s->dim[0]);
+    ret[1] = box_wrap(gp->x[1], 0.0, e->s->dim[1]);
+    ret[2] = box_wrap(gp->x[2], 0.0, e->s->dim[2]);
+  } else {
+    ret[0] = gp->x[0];
+    ret[1] = gp->x[1];
+    ret[2] = gp->x[2];
+  }
+}
+
 /**
  * @brief Specifies which g-particle fields to read from a dataset
  *
@@ -52,15 +66,15 @@ void darkmatter_read_particles(struct gpart* gparts, struct io_props* list,
  * @param list The list of i/o properties to write.
  * @param num_fields The number of i/o fields to write.
  */
-void darkmatter_write_particles(struct gpart* gparts, struct io_props* list,
-                                int* num_fields) {
+void darkmatter_write_particles(const struct gpart* gparts,
+                                struct io_props* list, int* num_fields) {
 
   /* Say how much we want to read */
   *num_fields = 5;
 
   /* List what we want to read */
-  list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH,
-                                 gparts, x);
+  list[0] = io_make_output_field_convert_gpart(
+      "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, gparts, convert_gpart_pos);
   list[1] = io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED,
                                  gparts, v_full);
   list[2] =
diff --git a/src/hydro/Default/hydro_io.h b/src/hydro/Default/hydro_io.h
index 62e94b05ffea259aac99d4b3714e0eea7e7c955f..8c12f015bd69dc917a80c564422676a85ffcfb3b 100644
--- a/src/hydro/Default/hydro_io.h
+++ b/src/hydro/Default/hydro_io.h
@@ -53,6 +53,20 @@ void hydro_read_particles(struct part* parts, struct io_props* list,
                                 UNIT_CONV_DENSITY, parts, rho);
 }
 
+void convert_part_pos(const struct engine* e, const struct part* p,
+                      double* ret) {
+
+  if (e->s->periodic) {
+    ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]);
+    ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]);
+    ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]);
+  } else {
+    ret[0] = p->x[0];
+    ret[1] = p->x[1];
+    ret[2] = p->x[2];
+  }
+}
+
 /**
  * @brief Specifies which particle fields to write to a dataset
  *
@@ -66,8 +80,8 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
   *num_fields = 8;
 
   /* List what we want to write */
-  list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH,
-                                 parts, x);
+  list[0] = io_make_output_field_convert_part(
+      "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos);
   list[1] =
       io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts, v);
   list[2] =
diff --git a/src/hydro/Default/hydro_part.h b/src/hydro/Default/hydro_part.h
index 8b14c45614e4c09c48056c9398ed4bfe3ed90e38..5d2f76ab298cdd6199d95abe1e4435321302689e 100644
--- a/src/hydro/Default/hydro_part.h
+++ b/src/hydro/Default/hydro_part.h
@@ -125,6 +125,16 @@ struct part {
   /* Particle time-bin */
   timebin_t time_bin;
 
+#ifdef SWIFT_DEBUG_CHECKS
+
+  /* Time of the last drift */
+  integertime_t ti_drift;
+
+  /* Time of the last kick */
+  integertime_t ti_kick;
+
+#endif
+
 } SWIFT_STRUCT_ALIGN;
 
 #endif /* SWIFT_DEFAULT_HYDRO_PART_H */
diff --git a/src/hydro/Gadget2/hydro_io.h b/src/hydro/Gadget2/hydro_io.h
index ac4ce0db57b3297fc961548ffea6b00f32f4dd8c..e6d125105d4a9aa6443e35644990655fd4c7335c 100644
--- a/src/hydro/Gadget2/hydro_io.h
+++ b/src/hydro/Gadget2/hydro_io.h
@@ -55,14 +55,28 @@ void hydro_read_particles(struct part* parts, struct io_props* list,
                                 UNIT_CONV_DENSITY, parts, rho);
 }
 
-float convert_u(struct engine* e, struct part* p) {
+void convert_u(const struct engine* e, const struct part* p, float* ret) {
 
-  return hydro_get_internal_energy(p);
+  ret[0] = hydro_get_internal_energy(p);
 }
 
-float convert_P(struct engine* e, struct part* p) {
+void convert_P(const struct engine* e, const struct part* p, float* ret) {
 
-  return hydro_get_pressure(p);
+  ret[0] = hydro_get_pressure(p);
+}
+
+void convert_part_pos(const struct engine* e, const struct part* p,
+                      double* ret) {
+
+  if (e->s->periodic) {
+    ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]);
+    ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]);
+    ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]);
+  } else {
+    ret[0] = p->x[0];
+    ret[1] = p->x[1];
+    ret[2] = p->x[2];
+  }
 }
 
 /**
@@ -72,7 +86,7 @@ float convert_P(struct engine* e, struct part* p) {
  * @param list The list of i/o properties to write.
  * @param num_fields The number of i/o fields to write.
  */
-void hydro_write_particles(struct part* parts, struct io_props* list,
+void hydro_write_particles(const struct part* parts, struct io_props* list,
                            int* num_fields) {
 
   *num_fields = 10;
@@ -82,8 +96,8 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
 #endif
 
   /* List what we want to write */
-  list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH,
-                                 parts, x);
+  list[0] = io_make_output_field_convert_part(
+      "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos);
   list[1] =
       io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts, v);
   list[2] =
@@ -100,9 +114,9 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
       io_make_output_field("Density", FLOAT, 1, UNIT_CONV_DENSITY, parts, rho);
   list[8] = io_make_output_field_convert_part("InternalEnergy", FLOAT, 1,
                                               UNIT_CONV_ENERGY_PER_UNIT_MASS,
-                                              parts, rho, convert_u);
+                                              parts, convert_u);
   list[9] = io_make_output_field_convert_part(
-      "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, rho, convert_P);
+      "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, convert_P);
 #ifdef DEBUG_INTERACTIONS_SPH
   list[10] = io_make_output_field("Num_ngb_density", INT, 1, UNIT_CONV_NO_UNITS,
                                  parts, num_ngb_density);
diff --git a/src/hydro/Gizmo/hydro.h b/src/hydro/Gizmo/hydro.h
index 2c2f54699bb380a491edf61a83ad8a031572c86c..6273f5009a051e17d2a2954ac994c3f189110464 100644
--- a/src/hydro/Gizmo/hydro.h
+++ b/src/hydro/Gizmo/hydro.h
@@ -24,6 +24,7 @@
 #include "approx_math.h"
 #include "equation_of_state.h"
 #include "hydro_gradients.h"
+#include "hydro_properties.h"
 #include "hydro_space.h"
 #include "hydro_unphysical.h"
 #include "hydro_velocities.h"
@@ -130,8 +131,9 @@ __attribute__((always_inline)) INLINE static void hydro_first_init_part(
 /* remember that we store the total thermal energy, not the specific thermal
    energy (as in Gadget) */
 #if defined(EOS_ISOTHERMAL_GAS)
-  /* this overwrites the internal energy from the initial condition file */
-  p->conserved.energy = mass * const_isothermal_internal_energy;
+  /* this overwrites the internal energy from the initial condition file
+   * Note that we call the EoS function just to get the constant u here. */
+  p->conserved.energy = mass * gas_internal_energy_from_entropy(0.f, 0.f);
 #else
   p->conserved.energy *= mass;
 #endif
@@ -608,7 +610,9 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra(
   p->conserved.momentum[1] += p->conserved.flux.momentum[1];
   p->conserved.momentum[2] += p->conserved.flux.momentum[2];
 #if defined(EOS_ISOTHERMAL_GAS)
-  p->conserved.energy = p->conserved.mass * const_isothermal_internal_energy;
+  /* We use the EoS equation in a sneaky way here just to get the constant u */
+  p->conserved.energy =
+      p->conserved.mass * gas_internal_energy_from_entropy(0.f, 0.f);
 #else
   p->conserved.energy += p->conserved.flux.energy;
 #endif
@@ -707,15 +711,11 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra(
 __attribute__((always_inline)) INLINE static float hydro_get_internal_energy(
     const struct part* restrict p) {
 
-  if (p->primitives.rho > 0.) {
-#ifdef EOS_ISOTHERMAL_GAS
-    return p->primitives.P / hydro_gamma_minus_one / p->primitives.rho;
-#else
-    return const_isothermal_internal_energy;
-#endif
-  } else {
+  if (p->primitives.rho > 0.)
+    return gas_internal_energy_from_pressure(p->primitives.rho,
+                                             p->primitives.P);
+  else
     return 0.;
-  }
 }
 
 /**
@@ -727,7 +727,7 @@ __attribute__((always_inline)) INLINE static float hydro_get_entropy(
     const struct part* restrict p) {
 
   if (p->primitives.rho > 0.) {
-    return p->primitives.P / pow_gamma(p->primitives.rho);
+    return gas_entropy_from_pressure(p->primitives.rho, p->primitives.P);
   } else {
     return 0.;
   }
@@ -741,16 +741,10 @@ __attribute__((always_inline)) INLINE static float hydro_get_entropy(
 __attribute__((always_inline)) INLINE static float hydro_get_soundspeed(
     const struct part* restrict p) {
 
-  if (p->primitives.rho > 0.) {
-#ifdef EOS_ISOTHERMAL_GAS
-    return sqrtf(const_isothermal_internal_energy * hydro_gamma *
-                 hydro_gamma_minus_one);
-#else
-    return sqrtf(hydro_gamma * p->primitives.P / p->primitives.rho);
-#endif
-  } else {
+  if (p->primitives.rho > 0.)
+    return gas_soundspeed_from_pressure(p->primitives.rho, p->primitives.P);
+  else
     return 0.;
-  }
 }
 
 /**
diff --git a/src/hydro/Gizmo/hydro_io.h b/src/hydro/Gizmo/hydro_io.h
index d20f7e2eb1cf50be7690e15a9569d8e9c4605af5..458ea6151e4b00b9ccf85ad6eff4ccf58277ff7d 100644
--- a/src/hydro/Gizmo/hydro_io.h
+++ b/src/hydro/Gizmo/hydro_io.h
@@ -18,6 +18,7 @@
  ******************************************************************************/
 
 #include "adiabatic_index.h"
+#include "hydro.h"
 #include "hydro_flux_limiters.h"
 #include "hydro_gradients.h"
 #include "hydro_slope_limiters.h"
@@ -68,18 +69,11 @@ void hydro_read_particles(struct part* parts, struct io_props* list,
  *
  * @param e #engine.
  * @param p Particle.
- * @return Internal energy of the particle
+ * @param ret (return) Internal energy of the particle
  */
-float convert_u(struct engine* e, struct part* p) {
-  if (p->primitives.rho > 0.) {
-#ifdef EOS_ISOTHERMAL_GAS
-    return const_isothermal_internal_energy;
-#else
-    return p->primitives.P / hydro_gamma_minus_one / p->primitives.rho;
-#endif
-  } else {
-    return 0.;
-  }
+void convert_u(const struct engine* e, const struct part* p, float* ret) {
+
+  ret[0] = hydro_get_internal_energy(p);
 }
 
 /**
@@ -87,14 +81,10 @@ float convert_u(struct engine* e, struct part* p) {
  *
  * @param e #engine.
  * @param p Particle.
- * @return Entropic function of the particle
+ * @param ret (return) Entropic function of the particle
  */
-float convert_A(struct engine* e, struct part* p) {
-  if (p->primitives.rho > 0.) {
-    return p->primitives.P / pow_gamma(p->primitives.rho);
-  } else {
-    return 0.;
-  }
+void convert_A(const struct engine* e, const struct part* p, float* ret) {
+  ret[0] = hydro_get_entropy(p);
 }
 
 /**
@@ -104,9 +94,9 @@ float convert_A(struct engine* e, struct part* p) {
  * @param p Particle.
- * @return Total energy of the particle
+ * @param ret (return) Total energy of the particle
  */
-float convert_Etot(struct engine* e, struct part* p) {
+void convert_Etot(const struct engine* e, const struct part* p, float* ret) {
 #ifdef GIZMO_TOTAL_ENERGY
-  return p->conserved.energy;
+  ret[0] = p->conserved.energy;
 #else
   float momentum2;
 
@@ -114,10 +104,24 @@ float convert_Etot(struct engine* e, struct part* p) {
               p->conserved.momentum[1] * p->conserved.momentum[1] +
               p->conserved.momentum[2] * p->conserved.momentum[2];
 
-  return p->conserved.energy + 0.5f * momentum2 / p->conserved.mass;
+  ret[0] = p->conserved.energy + 0.5f * momentum2 / p->conserved.mass;
 #endif
 }
 
+void convert_part_pos(const struct engine* e, const struct part* p,
+                      double* ret) {
+
+  if (e->s->periodic) {
+    ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]);
+    ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]);
+    ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]);
+  } else {
+    ret[0] = p->x[0];
+    ret[1] = p->x[1];
+    ret[2] = p->x[2];
+  }
+}
+
 /**
  * @brief Specifies which particle fields to write to a dataset
  *
@@ -131,8 +135,8 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
   *num_fields = 10;
 
   /* List what we want to write */
-  list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH,
-                                 parts, x);
+  list[0] = io_make_output_field_convert_part(
+      "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos);
   list[1] = io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts,
                                  primitives.v);
   list[2] = io_make_output_field("Masses", FLOAT, 1, UNIT_CONV_MASS, parts,
@@ -141,18 +145,17 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
                                  parts, h);
   list[4] = io_make_output_field_convert_part("InternalEnergy", FLOAT, 1,
                                               UNIT_CONV_ENERGY_PER_UNIT_MASS,
-                                              parts, primitives.P, convert_u);
+                                              parts, convert_u);
   list[5] = io_make_output_field("ParticleIDs", ULONGLONG, 1,
                                  UNIT_CONV_NO_UNITS, parts, id);
   list[6] = io_make_output_field("Density", FLOAT, 1, UNIT_CONV_DENSITY, parts,
                                  primitives.rho);
   list[7] = io_make_output_field_convert_part(
-      "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY, parts, primitives.P, convert_A);
+      "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY, parts, convert_A);
   list[8] = io_make_output_field("Pressure", FLOAT, 1, UNIT_CONV_PRESSURE,
                                  parts, primitives.P);
-  list[9] =
-      io_make_output_field_convert_part("TotEnergy", FLOAT, 1, UNIT_CONV_ENERGY,
-                                        parts, conserved.energy, convert_Etot);
+  list[9] = io_make_output_field_convert_part(
+      "TotEnergy", FLOAT, 1, UNIT_CONV_ENERGY, parts, convert_Etot);
 }
 
 /**
diff --git a/src/hydro/Minimal/hydro.h b/src/hydro/Minimal/hydro.h
index 4d8ca5b05547467c973e17983774b64736060471..905412040cbb8ac9666e555d88cad9bc56de13ab 100644
--- a/src/hydro/Minimal/hydro.h
+++ b/src/hydro/Minimal/hydro.h
@@ -224,7 +224,7 @@ __attribute__((always_inline)) INLINE static void hydro_end_density(
   /* Finish the calculation by inserting the missing h-factors */
   p->rho *= h_inv_dim;
   p->density.rho_dh *= h_inv_dim_plus_one;
-  p->density.wcount *= kernel_norm;
+  p->density.wcount *= h_inv_dim;
   p->density.wcount_dh *= h_inv_dim_plus_one;
 }
 
diff --git a/src/hydro/Minimal/hydro_io.h b/src/hydro/Minimal/hydro_io.h
index 2ec0cb11d1e3ccaaa09d9822c75b396364912df8..771e4ac2d066ea098edb8b2f11d7afd550783347 100644
--- a/src/hydro/Minimal/hydro_io.h
+++ b/src/hydro/Minimal/hydro_io.h
@@ -69,14 +69,28 @@ void hydro_read_particles(struct part* parts, struct io_props* list,
                                 UNIT_CONV_DENSITY, parts, rho);
 }
 
-float convert_S(struct engine* e, struct part* p) {
+void convert_S(const struct engine* e, const struct part* p, float* ret) {
 
-  return hydro_get_entropy(p);
+  ret[0] = hydro_get_entropy(p);
 }
 
-float convert_P(struct engine* e, struct part* p) {
+void convert_P(const struct engine* e, const struct part* p, float* ret) {
 
-  return hydro_get_pressure(p);
+  ret[0] = hydro_get_pressure(p);
+}
+
+void convert_part_pos(const struct engine* e, const struct part* p,
+                      double* ret) {
+
+  if (e->s->periodic) {
+    ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]);
+    ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]);
+    ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]);
+  } else {
+    ret[0] = p->x[0];
+    ret[1] = p->x[1];
+    ret[2] = p->x[2];
+  }
 }
 
 /**
@@ -92,8 +106,8 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
   *num_fields = 10;
 
   /* List what we want to write */
-  list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH,
-                                 parts, x);
+  list[0] = io_make_output_field_convert_part(
+      "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos);
   list[1] =
       io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts, v);
   list[2] =
@@ -108,11 +122,10 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
                                  UNIT_CONV_ACCELERATION, parts, a_hydro);
   list[7] =
       io_make_output_field("Density", FLOAT, 1, UNIT_CONV_DENSITY, parts, rho);
-  list[8] = io_make_output_field_convert_part("Entropy", FLOAT, 1,
-                                              UNIT_CONV_ENTROPY_PER_UNIT_MASS,
-                                              parts, rho, convert_S);
+  list[8] = io_make_output_field_convert_part(
+      "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY_PER_UNIT_MASS, parts, convert_S);
   list[9] = io_make_output_field_convert_part(
-      "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, rho, convert_P);
+      "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, convert_P);
 }
 
 /**
diff --git a/src/hydro/PressureEntropy/hydro_io.h b/src/hydro/PressureEntropy/hydro_io.h
index 10243750c01d6bc64664f9834bc4cc245c786f49..d3780261ba587cfac92ec44a65b13f0a4344b3c3 100644
--- a/src/hydro/PressureEntropy/hydro_io.h
+++ b/src/hydro/PressureEntropy/hydro_io.h
@@ -67,14 +67,28 @@ void hydro_read_particles(struct part* parts, struct io_props* list,
                                 UNIT_CONV_DENSITY, parts, rho);
 }
 
-float convert_u(struct engine* e, struct part* p) {
+void convert_u(const struct engine* e, const struct part* p, float* ret) {
 
-  return hydro_get_internal_energy(p);
+  ret[0] = hydro_get_internal_energy(p);
 }
 
-float convert_P(struct engine* e, struct part* p) {
+void convert_P(const struct engine* e, const struct part* p, float* ret) {
 
-  return hydro_get_pressure(p);
+  ret[0] = hydro_get_pressure(p);
+}
+
+void convert_part_pos(const struct engine* e, const struct part* p,
+                      double* ret) {
+
+  if (e->s->periodic) {
+    ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]);
+    ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]);
+    ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]);
+  } else {
+    ret[0] = p->x[0];
+    ret[1] = p->x[1];
+    ret[2] = p->x[2];
+  }
 }
 
 /**
@@ -90,27 +104,27 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
   *num_fields = 11;
 
   /* List what we want to write */
-  list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH,
-                                 parts, x);
+  list[0] = io_make_output_field_convert_part(
+      "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos);
   list[1] =
       io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts, v);
   list[2] =
       io_make_output_field("Masses", FLOAT, 1, UNIT_CONV_MASS, parts, mass);
   list[3] = io_make_output_field("SmoothingLength", FLOAT, 1, UNIT_CONV_LENGTH,
                                  parts, h);
-  list[4] = io_make_output_field_convert_part("InternalEnergy", FLOAT, 1,
-                                              UNIT_CONV_ENERGY_PER_UNIT_MASS,
-                                              parts, entropy, convert_u);
+  list[4] = io_make_output_field(
+      "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY_PER_UNIT_MASS, parts, entropy);
   list[5] = io_make_output_field("ParticleIDs", ULONGLONG, 1,
                                  UNIT_CONV_NO_UNITS, parts, id);
   list[6] = io_make_output_field("Acceleration", FLOAT, 3,
                                  UNIT_CONV_ACCELERATION, parts, a_hydro);
   list[7] =
       io_make_output_field("Density", FLOAT, 1, UNIT_CONV_DENSITY, parts, rho);
-  list[8] = io_make_output_field(
-      "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY_PER_UNIT_MASS, parts, entropy);
+  list[8] = io_make_output_field_convert_part("InternalEnergy", FLOAT, 1,
+                                              UNIT_CONV_ENERGY_PER_UNIT_MASS,
+                                              parts, convert_u);
   list[9] = io_make_output_field_convert_part(
-      "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, rho, convert_P);
+      "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, convert_P);
   list[10] = io_make_output_field("WeightedDensity", FLOAT, 1,
                                   UNIT_CONV_DENSITY, parts, rho_bar);
 }
diff --git a/src/hydro/Shadowswift/hydro.h b/src/hydro/Shadowswift/hydro.h
index abbcdcd2f7879d8063a906e44ab2fe6a3e675828..eb9938a341f1e6273f8ea03264bdda6dc69832e6 100644
--- a/src/hydro/Shadowswift/hydro.h
+++ b/src/hydro/Shadowswift/hydro.h
@@ -22,6 +22,7 @@
 #include "approx_math.h"
 #include "equation_of_state.h"
 #include "hydro_gradients.h"
+#include "hydro_properties.h"
 #include "hydro_space.h"
 #include "voronoi_algorithm.h"
 
diff --git a/src/hydro/Shadowswift/hydro_io.h b/src/hydro/Shadowswift/hydro_io.h
index de45b5d68c78f96cee3030eadef4b4410e550c22..f6f6fcc6c070b0add8e2b97678e5ef1ada636bfd 100644
--- a/src/hydro/Shadowswift/hydro_io.h
+++ b/src/hydro/Shadowswift/hydro_io.h
@@ -19,6 +19,7 @@
 
 #include "adiabatic_index.h"
 #include "equation_of_state.h"
+#include "hydro.h"
 #include "hydro_gradients.h"
 #include "hydro_slope_limiters.h"
 #include "io_properties.h"
@@ -63,13 +64,8 @@ void hydro_read_particles(struct part* parts, struct io_props* list,
  * @param p Particle.
- * @return Internal energy of the particle
+ * @param ret (return) Internal energy of the particle
  */
-float convert_u(struct engine* e, struct part* p) {
-  if (p->primitives.rho > 0.) {
-    return gas_internal_energy_from_pressure(p->primitives.rho,
-                                             p->primitives.P);
-  } else {
-    return 0.;
-  }
+void convert_u(const struct engine* e, const struct part* p, float* ret) {
+  ret[0] = hydro_get_internal_energy(p);
 }
 
 /**
@@ -79,12 +75,8 @@ float convert_u(struct engine* e, struct part* p) {
  * @param p Particle.
- * @return Entropic function of the particle
+ * @param ret (return) Entropic function of the particle
  */
-float convert_A(struct engine* e, struct part* p) {
-  if (p->primitives.rho > 0.) {
-    return gas_entropy_from_pressure(p->primitives.rho, p->primitives.P);
-  } else {
-    return 0.;
-  }
+void convert_A(const struct engine* e, const struct part* p, float* ret) {
+  ret[0] = hydro_get_entropy(p);
 }
 
 /**
@@ -94,7 +86,7 @@ float convert_A(struct engine* e, struct part* p) {
  * @param p Particle.
- * @return Total energy of the particle
+ * @param ret (return) Total energy of the particle
  */
-float convert_Etot(struct engine* e, struct part* p) {
+void convert_Etot(const struct engine* e, const struct part* p, float* ret) {
 #ifdef SHADOWFAX_TOTAL_ENERGY
-  return p->conserved.energy;
+  ret[0] = p->conserved.energy;
 #else
@@ -105,13 +97,27 @@ float convert_Etot(struct engine* e, struct part* p) {
                 p->conserved.momentum[1] * p->conserved.momentum[1] +
                 p->conserved.momentum[2] * p->conserved.momentum[2];
 
-    return p->conserved.energy + 0.5f * momentum2 / p->conserved.mass;
+    ret[0] = p->conserved.energy + 0.5f * momentum2 / p->conserved.mass;
   } else {
-    return 0.;
+    ret[0] = 0.;
   }
 #endif
 }
 
+void convert_part_pos(const struct engine* e, const struct part* p,
+                      double* ret) {
+
+  if (e->s->periodic) {
+    ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]);
+    ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]);
+    ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]);
+  } else {
+    ret[0] = p->x[0];
+    ret[1] = p->x[1];
+    ret[2] = p->x[2];
+  }
+}
+
 /**
  * @brief Specifies which particle fields to write to a dataset
  *
@@ -125,8 +131,8 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
   *num_fields = 13;
 
   /* List what we want to write */
-  list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH,
-                                 parts, x);
+  list[0] = io_make_output_field_convert_part(
+      "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos);
   list[1] = io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts,
                                  primitives.v);
   list[2] = io_make_output_field("Masses", FLOAT, 1, UNIT_CONV_MASS, parts,
@@ -135,7 +141,7 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
                                  parts, h);
   list[4] = io_make_output_field_convert_part("InternalEnergy", FLOAT, 1,
                                               UNIT_CONV_ENERGY_PER_UNIT_MASS,
-                                              parts, primitives.P, convert_u);
+                                              parts, convert_u);
   list[5] = io_make_output_field("ParticleIDs", ULONGLONG, 1,
                                  UNIT_CONV_NO_UNITS, parts, id);
   list[6] = io_make_output_field("Acceleration", FLOAT, 3,
@@ -147,12 +153,11 @@ void hydro_write_particles(struct part* parts, struct io_props* list,
   list[9] = io_make_output_field("GradDensity", FLOAT, 3, UNIT_CONV_DENSITY,
                                  parts, primitives.gradients.rho);
   list[10] = io_make_output_field_convert_part(
-      "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY, parts, primitives.P, convert_A);
+      "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY, parts, convert_A);
   list[11] = io_make_output_field("Pressure", FLOAT, 1, UNIT_CONV_PRESSURE,
                                   parts, primitives.P);
-  list[12] =
-      io_make_output_field_convert_part("TotEnergy", FLOAT, 1, UNIT_CONV_ENERGY,
-                                        parts, conserved.energy, convert_Etot);
+  list[12] = io_make_output_field_convert_part(
+      "TotEnergy", FLOAT, 1, UNIT_CONV_ENERGY, parts, convert_Etot);
 }
 
 /**
diff --git a/src/hydro_properties.c b/src/hydro_properties.c
index 1e7554f7d84220b8c962d60cc4538c685b5bad52..995610acb2d46cb62a4fa5fc7ef76b0d9474f504 100644
--- a/src/hydro_properties.c
+++ b/src/hydro_properties.c
@@ -77,17 +77,10 @@ void hydro_props_init(struct hydro_props *p,
 
 void hydro_props_print(const struct hydro_props *p) {
 
-#if defined(EOS_IDEAL_GAS)
-  message("Equation of state: Ideal gas.");
-#elif defined(EOS_ISOTHERMAL_GAS)
-  message(
-      "Equation of state: Isothermal with internal energy "
-      "per unit mass set to %f.",
-      const_isothermal_internal_energy);
-#endif
-
-  message("Adiabatic index gamma: %f.", hydro_gamma);
+  /* Print equation of state first */
+  eos_print(&eos);
 
+  /* Now SPH */
   message("Hydrodynamic scheme: %s in %dD.", SPH_IMPLEMENTATION,
           (int)hydro_dimension);
 
@@ -115,7 +108,8 @@ void hydro_props_print(const struct hydro_props *p) {
 #if defined(HAVE_HDF5)
 void hydro_props_print_snapshot(hid_t h_grpsph, const struct hydro_props *p) {
 
-  io_write_attribute_f(h_grpsph, "Adiabatic index", hydro_gamma);
+  eos_print_snapshot(h_grpsph, &eos);
+
   io_write_attribute_i(h_grpsph, "Dimension", (int)hydro_dimension);
   io_write_attribute_s(h_grpsph, "Scheme", SPH_IMPLEMENTATION);
   io_write_attribute_s(h_grpsph, "Kernel function", kernel_name);
diff --git a/src/io_properties.h b/src/io_properties.h
index 9fcf1a1ac67cae6afab6870369e51d06c752fc11..2142d8d555fb52258d3443f20d14c72cf7568045 100644
--- a/src/io_properties.h
+++ b/src/io_properties.h
@@ -16,19 +16,31 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  *
  ******************************************************************************/
-
 #ifndef SWIFT_IO_PROPERTIES_H
 #define SWIFT_IO_PROPERTIES_H
 
 /* Config parameters. */
 #include "../config.h"
 
+/* Local includes. */
+#include "inline.h"
+
 /**
  * @brief The two sorts of data present in the GADGET IC files: compulsory to
  * start a run or optional.
  */
 enum DATA_IMPORTANCE { COMPULSORY = 1, OPTIONAL = 0, UNUSED = -1 };
 
+/* Helper typedefs */
+typedef void (*conversion_func_part_float)(const struct engine*,
+                                           const struct part*, float*);
+typedef void (*conversion_func_part_double)(const struct engine*,
+                                            const struct part*, double*);
+typedef void (*conversion_func_gpart_float)(const struct engine*,
+                                            const struct gpart*, float*);
+typedef void (*conversion_func_gpart_double)(const struct engine*,
+                                             const struct gpart*, double*);
+
 /**
  * @brief The properties of a given dataset for i/o
  */
@@ -52,18 +64,31 @@ struct io_props {
   /* Pointer to the field of the first particle in the array */
   char* field;
 
+  /* Pointer to the start of the temporary buffer used in i/o */
+  char* start_temp_c;
+  float* start_temp_f;
+  double* start_temp_d;
+
+  /* Pointer to the engine */
+  const struct engine* e;
+
   /* The size of the particles */
   size_t partSize;
 
   /* The particle arrays */
-  struct part* parts;
-  struct gpart* gparts;
+  const struct part* parts;
+  const struct gpart* gparts;
+
+  /* Are we converting? */
+  int conversion;
 
   /* Conversion function for part */
-  float (*convert_part)(struct engine*, struct part*);
+  conversion_func_part_float convert_part_f;
+  conversion_func_part_double convert_part_d;
 
   /* Conversion function for gpart */
-  float (*convert_gpart)(struct engine*, struct gpart*);
+  conversion_func_gpart_float convert_gpart_f;
+  conversion_func_gpart_double convert_gpart_d;
 };
 
 /**
@@ -86,11 +111,10 @@ struct io_props {
  *
  * Do not call this function directly. Use the macro defined above.
  */
-struct io_props io_make_input_field_(char name[FIELD_BUFFER_SIZE],
-                                     enum IO_DATA_TYPE type, int dimension,
-                                     enum DATA_IMPORTANCE importance,
-                                     enum unit_conversion_factor units,
-                                     char* field, size_t partSize) {
+INLINE static struct io_props io_make_input_field_(
+    char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension,
+    enum DATA_IMPORTANCE importance, enum unit_conversion_factor units,
+    char* field, size_t partSize) {
   struct io_props r;
   strcpy(r.name, name);
   r.type = type;
@@ -101,8 +125,11 @@ struct io_props io_make_input_field_(char name[FIELD_BUFFER_SIZE],
   r.partSize = partSize;
   r.parts = NULL;
   r.gparts = NULL;
-  r.convert_part = NULL;
-  r.convert_gpart = NULL;
+  r.conversion = 0;
+  r.convert_part_f = NULL;
+  r.convert_part_d = NULL;
+  r.convert_gpart_f = NULL;
+  r.convert_gpart_d = NULL;
 
   return r;
 }
@@ -126,10 +153,9 @@ struct io_props io_make_input_field_(char name[FIELD_BUFFER_SIZE],
  *
  * Do not call this function directly. Use the macro defined above.
  */
-struct io_props io_make_output_field_(char name[FIELD_BUFFER_SIZE],
-                                      enum IO_DATA_TYPE type, int dimension,
-                                      enum unit_conversion_factor units,
-                                      char* field, size_t partSize) {
+INLINE static struct io_props io_make_output_field_(
+    char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension,
+    enum unit_conversion_factor units, char* field, size_t partSize) {
   struct io_props r;
   strcpy(r.name, name);
   r.type = type;
@@ -140,8 +166,11 @@ struct io_props io_make_output_field_(char name[FIELD_BUFFER_SIZE],
   r.partSize = partSize;
   r.parts = NULL;
   r.gparts = NULL;
-  r.convert_part = NULL;
-  r.convert_gpart = NULL;
+  r.conversion = 0;
+  r.convert_part_f = NULL;
+  r.convert_part_d = NULL;
+  r.convert_gpart_f = NULL;
+  r.convert_gpart_d = NULL;
 
   return r;
 }
@@ -149,11 +178,10 @@ struct io_props io_make_output_field_(char name[FIELD_BUFFER_SIZE],
 /**
  * @brief Constructs an #io_props (with conversion) from its parameters
  */
-#define io_make_output_field_convert_part(name, type, dim, units, part, field, \
-                                          convert)                             \
-  io_make_output_field_convert_part_(name, type, dim, units,                   \
-                                     (char*)(&(part[0]).field),                \
-                                     sizeof(part[0]), part, convert)
+#define io_make_output_field_convert_part(name, type, dim, units, part, \
+                                          convert)                      \
+  io_make_output_field_convert_part_##type(name, type, dim, units,      \
+                                           sizeof(part[0]), part, convert)
 
 /**
  * @brief Construct an #io_props from its parameters
@@ -162,17 +190,16 @@ struct io_props io_make_output_field_(char name[FIELD_BUFFER_SIZE],
  * @param type The type of the data
  * @param dimension Dataset dimension (1D, 3D, ...)
  * @param units The units of the dataset
- * @param field Pointer to the field of the first particle
  * @param partSize The size in byte of the particle
  * @param parts The particle array
  * @param functionPtr The function used to convert a particle to a float
  *
  * Do not call this function directly. Use the macro defined above.
  */
-struct io_props io_make_output_field_convert_part_(
+INLINE static struct io_props io_make_output_field_convert_part_FLOAT(
     char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension,
-    enum unit_conversion_factor units, char* field, size_t partSize,
-    struct part* parts, float (*functionPtr)(struct engine*, struct part*)) {
+    enum unit_conversion_factor units, size_t partSize,
+    const struct part* parts, conversion_func_part_float functionPtr) {
 
   struct io_props r;
   strcpy(r.name, name);
@@ -180,12 +207,52 @@ struct io_props io_make_output_field_convert_part_(
   r.dimension = dimension;
   r.importance = UNUSED;
   r.units = units;
-  r.field = field;
+  r.field = NULL;
+  r.partSize = partSize;
+  r.parts = parts;
+  r.gparts = NULL;
+  r.conversion = 1;
+  r.convert_part_f = functionPtr;
+  r.convert_part_d = NULL;
+  r.convert_gpart_f = NULL;
+  r.convert_gpart_d = NULL;
+
+  return r;
+}
+
+/**
+ * @brief Construct an #io_props from its parameters
+ *
+ * @param name Name of the field to read
+ * @param type The type of the data
+ * @param dimension Dataset dimension (1D, 3D, ...)
+ * @param units The units of the dataset
+ * @param partSize The size in byte of the particle
+ * @param parts The particle array
+ * @param functionPtr The function used to convert a particle to a double
+ *
+ * Do not call this function directly. Use the macro defined above.
+ */
+INLINE static struct io_props io_make_output_field_convert_part_DOUBLE(
+    char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension,
+    enum unit_conversion_factor units, size_t partSize,
+    const struct part* parts, conversion_func_part_double functionPtr) {
+
+  struct io_props r;
+  strcpy(r.name, name);
+  r.type = type;
+  r.dimension = dimension;
+  r.importance = UNUSED;
+  r.units = units;
+  r.field = NULL;
   r.partSize = partSize;
   r.parts = parts;
   r.gparts = NULL;
-  r.convert_part = functionPtr;
-  r.convert_gpart = NULL;
+  r.conversion = 1;
+  r.convert_part_f = NULL;
+  r.convert_part_d = functionPtr;
+  r.convert_gpart_f = NULL;
+  r.convert_gpart_d = NULL;
 
   return r;
 }
@@ -193,11 +260,10 @@ struct io_props io_make_output_field_convert_part_(
 /**
  * @brief Constructs an #io_props (with conversion) from its parameters
  */
-#define io_make_output_field_convert_gpart(name, type, dim, units, part, \
-                                           field, convert)               \
-  io_make_output_field_convert_gpart_(name, type, dim, units,            \
-                                      (char*)(&(part[0]).field),         \
-                                      sizeof(part[0]), gpart, convert)
+#define io_make_output_field_convert_gpart(name, type, dim, units, gpart, \
+                                           convert)                       \
+  io_make_output_field_convert_gpart_##type(name, type, dim, units,       \
+                                            sizeof(gpart[0]), gpart, convert)
 
 /**
  * @brief Construct an #io_props from its parameters
@@ -206,17 +272,16 @@ struct io_props io_make_output_field_convert_part_(
  * @param type The type of the data
  * @param dimension Dataset dimension (1D, 3D, ...)
  * @param units The units of the dataset
- * @param field Pointer to the field of the first particle
- * @param partSize The size in byte of the particle
+ * @param gpartSize The size in byte of the particle
  * @param gparts The particle array
  * @param functionPtr The function used to convert a g-particle to a float
  *
  * Do not call this function directly. Use the macro defined above.
  */
-struct io_props io_make_output_field_convert_gpart_(
+INLINE static struct io_props io_make_output_field_convert_gpart_FLOAT(
     char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension,
-    enum unit_conversion_factor units, char* field, size_t partSize,
-    struct gpart* gparts, float (*functionPtr)(struct engine*, struct gpart*)) {
+    enum unit_conversion_factor units, size_t gpartSize,
+    const struct gpart* gparts, conversion_func_gpart_float functionPtr) {
 
   struct io_props r;
   strcpy(r.name, name);
@@ -224,12 +289,52 @@ struct io_props io_make_output_field_convert_gpart_(
   r.dimension = dimension;
   r.importance = UNUSED;
   r.units = units;
-  r.field = field;
-  r.partSize = partSize;
+  r.field = NULL;
+  r.partSize = gpartSize;
+  r.parts = NULL;
+  r.gparts = gparts;
+  r.conversion = 1;
+  r.convert_part_f = NULL;
+  r.convert_part_d = NULL;
+  r.convert_gpart_f = functionPtr;
+  r.convert_gpart_d = NULL;
+
+  return r;
+}
+
+/**
+ * @brief Construct an #io_props from its parameters
+ *
+ * @param name Name of the field to read
+ * @param type The type of the data
+ * @param dimension Dataset dimension (1D, 3D, ...)
+ * @param units The units of the dataset
+ * @param gpartSize The size in byte of the particle
+ * @param gparts The particle array
+ * @param functionPtr The function used to convert a g-particle to a double
+ *
+ * Do not call this function directly. Use the macro defined above.
+ */
+INLINE static struct io_props io_make_output_field_convert_gpart_DOUBLE(
+    char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension,
+    enum unit_conversion_factor units, size_t gpartSize,
+    const struct gpart* gparts, conversion_func_gpart_double functionPtr) {
+
+  struct io_props r;
+  strcpy(r.name, name);
+  r.type = type;
+  r.dimension = dimension;
+  r.importance = UNUSED;
+  r.units = units;
+  r.field = NULL;
+  r.partSize = gpartSize;
   r.parts = NULL;
   r.gparts = gparts;
-  r.convert_part = NULL;
-  r.convert_gpart = functionPtr;
+  r.conversion = 1;
+  r.convert_part_f = NULL;
+  r.convert_part_d = NULL;
+  r.convert_gpart_f = NULL;
+  r.convert_gpart_d = functionPtr;
 
   return r;
 }
diff --git a/src/logger.c b/src/logger.c
index b2acf47aa70cef55f53d296033f6f5c6162fd5bd..0e0bc930c0841f985da2c353357a69b69aba5d91 100644
--- a/src/logger.c
+++ b/src/logger.c
@@ -20,6 +20,8 @@
 /* Config parameters. */
 #include "../config.h"
 
+#ifdef HAVE_POSIX_FALLOCATE /* Are we on a sensible platform? */
+
 /* Some standard headers. */
 #include <stdint.h>
 #include <stdlib.h>
@@ -41,7 +43,6 @@
  *
  * @return The size of the logger message in bytes.
  */
-
 int logger_size(unsigned int mask) {
 
   /* Start with 8 bytes for the header. */
@@ -95,7 +96,6 @@ int logger_size(unsigned int mask) {
  * @param offset Pointer to the offset of the previous log of this particle.
  * @param dump The #dump in which to log the particle data.
  */
-
 void logger_log_part(struct part *p, unsigned int mask, size_t *offset,
                      struct dump *dump) {
 
@@ -176,7 +176,6 @@ void logger_log_part(struct part *p, unsigned int mask, size_t *offset,
  * @param offset Pointer to the offset of the previous log of this particle.
  * @param dump The #dump in which to log the particle data.
  */
-
 void logger_log_gpart(struct gpart *p, unsigned int mask, size_t *offset,
                       struct dump *dump) {
 
@@ -270,7 +269,6 @@ void logger_log_timestamp(unsigned long long int timestamp, size_t *offset,
  *
  * @return The mask containing the values read.
  */
-
 int logger_read_part(struct part *p, size_t *offset, const char *buff) {
 
   /* Jump to the offset. */
@@ -349,7 +347,6 @@ int logger_read_part(struct part *p, size_t *offset, const char *buff) {
  *
  * @return The mask containing the values read.
  */
-
 int logger_read_gpart(struct gpart *p, size_t *offset, const char *buff) {
 
   /* Jump to the offset. */
@@ -416,7 +413,6 @@ int logger_read_gpart(struct gpart *p, size_t *offset, const char *buff) {
  *
  * @return The mask containing the values read.
  */
-
 int logger_read_timestamp(unsigned long long int *t, size_t *offset,
                           const char *buff) {
 
@@ -444,3 +440,5 @@ int logger_read_timestamp(unsigned long long int *t, size_t *offset,
   /* Finally, return the mask of the values we just read. */
   return mask;
 }
+
+#endif /* HAVE_POSIX_FALLOCATE */
diff --git a/src/logger.h b/src/logger.h
index 32fae752c2ae13a143809d9df3030dbc06b0942d..596c0903750404d0934e0d3843a5461523700e9e 100644
--- a/src/logger.h
+++ b/src/logger.h
@@ -20,9 +20,11 @@
 #define SWIFT_LOGGER_H
 
 /* Includes. */
-#include "dump.h"
 #include "part.h"
 
+/* Forward declaration */
+struct dump;
+
 /**
  * Logger entries contain messages representing the particle data at a given
  * point in time during the simulation.
diff --git a/src/parallel_io.c b/src/parallel_io.c
index 26686baadb7c040264c20385d0c89cb30e5dc398..e11c43c79badcc40bb76b7eee09348571e1841cc 100644
--- a/src/parallel_io.c
+++ b/src/parallel_io.c
@@ -51,56 +51,35 @@
 #include "units.h"
 #include "xmf.h"
 
+/* Are we timing the i/o? */
+//#define IO_SPEED_MEASUREMENT
+
 /* The current limit of ROMIO (the underlying MPI-IO layer) is 2GB */
 #define HDF5_PARALLEL_IO_MAX_BYTES 2000000000LL
 
 /**
- * @brief Reads a data array from a given HDF5 group.
+ * @brief Reads a chunk of data from an open HDF5 dataset
  *
- * @param grp The group from which to read.
- * @param prop The #io_props of the field to read.
- * @param N The number of particles on that rank.
- * @param N_total The total number of particles.
- * @param offset The offset in the array on disk for this rank.
+ * @param h_data The HDF5 dataset to read from.
+ * @param h_plist_id the parallel HDF5 properties.
+ * @param props The #io_props of the field to read.
+ * @param N The number of particles to read.
+ * @param offset Offset in the array where this mpi task starts reading.
  * @param internal_units The #unit_system used internally.
- * @param ic_units The #unit_system used in the ICs.
+ * @param ic_units The #unit_system used in the ICs.
  */
-void readArray(hid_t grp, const struct io_props prop, size_t N,
-               long long N_total, long long offset,
-               const struct unit_system* internal_units,
-               const struct unit_system* ic_units) {
-
-  const size_t typeSize = io_sizeof_type(prop.type);
-  const size_t copySize = typeSize * prop.dimension;
-  const size_t num_elements = N * prop.dimension;
-
-  /* Check whether the dataspace exists or not */
-  const htri_t exist = H5Lexists(grp, prop.name, 0);
-  if (exist < 0) {
-    error("Error while checking the existence of data set '%s'.", prop.name);
-  } else if (exist == 0) {
-    if (prop.importance == COMPULSORY) {
-      error("Compulsory data set '%s' not present in the file.", prop.name);
-    } else {
-      for (size_t i = 0; i < N; ++i)
-        memset(prop.field + i * prop.partSize, 0, copySize);
-      return;
-    }
-  }
-
-  /* message("Reading %s '%s' array...", */
-  /*         prop.importance == COMPULSORY ? "compulsory" : "optional  ", */
-  /*         prop.name); */
+void readArray_chunk(hid_t h_data, hid_t h_plist_id,
+                     const struct io_props props, size_t N, long long offset,
+                     const struct unit_system* internal_units,
+                     const struct unit_system* ic_units) {
 
-  /* Open data space in file */
-  const hid_t h_data = H5Dopen2(grp, prop.name, H5P_DEFAULT);
-  if (h_data < 0) error("Error while opening data space '%s'.", prop.name);
+  const size_t typeSize = io_sizeof_type(props.type);
+  const size_t copySize = typeSize * props.dimension;
+  const size_t num_elements = N * props.dimension;
 
-  /* Check data type */
-  const hid_t h_type = H5Dget_type(h_data);
-  if (h_type < 0) error("Unable to retrieve data type from the file");
-  /* if (!H5Tequal(h_type, hdf5_type(type))) */
-  /*   error("Non-matching types between the code and the file"); */
+  /* Can't handle reads of more than 2GB */
+  if (N * props.dimension * typeSize > HDF5_PARALLEL_IO_MAX_BYTES)
+    error("Dataset too large to be written in one pass!");
 
   /* Allocate temporary buffer */
   void* temp = malloc(num_elements * typeSize);
@@ -109,10 +88,10 @@ void readArray(hid_t grp, const struct io_props prop, size_t N,
   /* Prepare information for hyper-slab */
   hsize_t shape[2], offsets[2];
   int rank;
-  if (prop.dimension > 1) {
+  if (props.dimension > 1) {
     rank = 2;
     shape[0] = N;
-    shape[1] = prop.dimension;
+    shape[1] = props.dimension;
     offsets[0] = offset;
     offsets[1] = 0;
   } else {
@@ -130,27 +109,23 @@ void readArray(hid_t grp, const struct io_props prop, size_t N,
   const hid_t h_filespace = H5Dget_space(h_data);
   H5Sselect_hyperslab(h_filespace, H5S_SELECT_SET, offsets, NULL, shape, NULL);
 
-  /* Set collective reading properties */
-  const hid_t h_plist_id = H5Pcreate(H5P_DATASET_XFER);
-  H5Pset_dxpl_mpio(h_plist_id, H5FD_MPIO_COLLECTIVE);
-
   /* Read HDF5 dataspace in temporary buffer */
   /* Dirty version that happens to work for vectors but should be improved */
   /* Using HDF5 dataspaces would be better */
-  const hid_t h_err = H5Dread(h_data, io_hdf5_type(prop.type), h_memspace,
+  const hid_t h_err = H5Dread(h_data, io_hdf5_type(props.type), h_memspace,
                               h_filespace, h_plist_id, temp);
   if (h_err < 0) {
-    error("Error while reading data array '%s'.", prop.name);
+    error("Error while reading data array '%s'.", props.name);
   }
 
   /* Unit conversion if necessary */
   const double factor =
-      units_conversion_factor(ic_units, internal_units, prop.units);
-  if (factor != 1. && exist != 0) {
+      units_conversion_factor(ic_units, internal_units, props.units);
+  if (factor != 1.) {
 
     /* message("Converting ! factor=%e", factor); */
 
-    if (io_is_double_precision(prop.type)) {
+    if (io_is_double_precision(props.type)) {
       double* temp_d = temp;
       for (size_t i = 0; i < num_elements; ++i) temp_d[i] *= factor;
     } else {
@@ -162,13 +137,98 @@ void readArray(hid_t grp, const struct io_props prop, size_t N,
   /* Copy temporary buffer to particle data */
   char* temp_c = temp;
   for (size_t i = 0; i < N; ++i)
-    memcpy(prop.field + i * prop.partSize, &temp_c[i * copySize], copySize);
+    memcpy(props.field + i * props.partSize, &temp_c[i * copySize], copySize);
 
   /* Free and close everything */
   free(temp);
-  H5Pclose(h_plist_id);
   H5Sclose(h_filespace);
   H5Sclose(h_memspace);
+}
+
+/**
+ * @brief Reads a data array from a given HDF5 group.
+ *
+ * @param grp The group from which to read.
+ * @param props The #io_props of the field to read.
+ * @param N The number of particles on that rank.
+ * @param N_total The total number of particles.
+ * @param mpi_rank The MPI rank of this node.
+ * @param offset The offset in the array on disk for this rank.
+ * @param internal_units The #unit_system used internally.
+ * @param ic_units The #unit_system used in the ICs.
+ */
+void readArray(hid_t grp, struct io_props props, size_t N, long long N_total,
+               int mpi_rank, long long offset,
+               const struct unit_system* internal_units,
+               const struct unit_system* ic_units) {
+
+  const size_t typeSize = io_sizeof_type(props.type);
+  const size_t copySize = typeSize * props.dimension;
+
+  /* Check whether the dataspace exists or not */
+  const htri_t exist = H5Lexists(grp, props.name, 0);
+  if (exist < 0) {
+    error("Error while checking the existence of data set '%s'.", props.name);
+  } else if (exist == 0) {
+    if (props.importance == COMPULSORY) {
+      error("Compulsory data set '%s' not present in the file.", props.name);
+    } else {
+      for (size_t i = 0; i < N; ++i)
+        memset(props.field + i * props.partSize, 0, copySize);
+      return;
+    }
+  }
+
+  /* Open data space in file */
+  const hid_t h_data = H5Dopen2(grp, props.name, H5P_DEFAULT);
+  if (h_data < 0) error("Error while opening data space '%s'.", props.name);
+
+  /* Check data type */
+  const hid_t h_type = H5Dget_type(h_data);
+  if (h_type < 0) error("Unable to retrieve data type from the file");
+  /* if (!H5Tequal(h_type, hdf5_type(type))) */
+  /*   error("Non-matching types between the code and the file"); */
+
+  /* Create property list for collective dataset read. */
+  const hid_t h_plist_id = H5Pcreate(H5P_DATASET_XFER);
+  H5Pset_dxpl_mpio(h_plist_id, H5FD_MPIO_COLLECTIVE);
+
+  /* Given the limitations of ROM-IO we will need to read the data in chunks
+     of HDF5_PARALLEL_IO_MAX_BYTES bytes per node until all the nodes are done. */
+  char redo = 1;
+  while (redo) {
+
+    /* Maximal number of elements */
+    const size_t max_chunk_size =
+        HDF5_PARALLEL_IO_MAX_BYTES / (props.dimension * typeSize);
+
+    /* Read a chunk */
+    const size_t this_chunk = (N > max_chunk_size) ? max_chunk_size : N;
+    readArray_chunk(h_data, h_plist_id, props, this_chunk, offset,
+                    internal_units, ic_units);
+
+    /* Compute how many items are left */
+    if (N > max_chunk_size) {
+      N -= max_chunk_size;
+      props.field += max_chunk_size * props.partSize;
+      offset += max_chunk_size;
+      redo = 1;
+    } else {
+      N = 0;
+      offset += 0;
+      redo = 0;
+    }
+
+    /* Do we need to run again ? */
+    MPI_Allreduce(MPI_IN_PLACE, &redo, 1, MPI_SIGNED_CHAR, MPI_MAX,
+                  MPI_COMM_WORLD);
+
+    if (redo && mpi_rank == 0)
+      message("Need to redo one iteration for array '%s'", props.name);
+  }
+
+  /* Close everything */
+  H5Pclose(h_plist_id);
   H5Tclose(h_type);
   H5Dclose(h_data);
 }
@@ -195,7 +255,6 @@ void writeArray_chunk(struct engine* e, hid_t h_data, hid_t h_plist_id,
                       const struct unit_system* snapshot_units) {
 
   const size_t typeSize = io_sizeof_type(props.type);
-  const size_t copySize = typeSize * props.dimension;
   const size_t num_elements = N * props.dimension;
 
   /* Can't handle writes of more than 2GB */
@@ -206,44 +265,24 @@ void writeArray_chunk(struct engine* e, hid_t h_data, hid_t h_plist_id,
 
   /* Allocate temporary buffer */
   void* temp = malloc(num_elements * typeSize);
-  if (temp == NULL) error("Unable to allocate memory for temporary buffer");
-
-  /* Copy particle data to temporary buffer */
-  if (props.convert_part == NULL &&
-      props.convert_gpart == NULL) { /* No conversion */
+  if (posix_memalign((void**)&temp, IO_BUFFER_ALIGNMENT,
+                     num_elements * typeSize) != 0)
+    error("Unable to allocate temporary i/o buffer");
 
-    char* temp_c = temp;
-    for (size_t i = 0; i < N; ++i)
-      memcpy(&temp_c[i * copySize], props.field + i * props.partSize, copySize);
+#ifdef IO_SPEED_MEASUREMENT
+  MPI_Barrier(MPI_COMM_WORLD);
+  ticks tic = getticks();
+#endif
 
-  } else if (props.convert_part != NULL) { /* conversion (for parts)*/
+  /* Copy the particle data to the temporary buffer */
+  io_copy_temp_buffer(temp, e, props, N, internal_units, snapshot_units);
 
-    float* temp_f = temp;
-    for (size_t i = 0; i < N; ++i)
-      temp_f[i] = props.convert_part(e, &props.parts[i]);
-
-  } else if (props.convert_gpart != NULL) { /* conversion (for gparts)*/
-
-    float* temp_f = temp;
-    for (size_t i = 0; i < N; ++i)
-      temp_f[i] = props.convert_gpart(e, &props.gparts[i]);
-  }
-
-  /* Unit conversion if necessary */
-  const double factor =
-      units_conversion_factor(internal_units, snapshot_units, props.units);
-  if (factor != 1.) {
-
-    /* message("Converting ! factor=%e", factor); */
-
-    if (io_is_double_precision(props.type)) {
-      double* temp_d = temp;
-      for (size_t i = 0; i < num_elements; ++i) temp_d[i] *= factor;
-    } else {
-      float* temp_f = temp;
-      for (size_t i = 0; i < num_elements; ++i) temp_f[i] *= factor;
-    }
-  }
+#ifdef IO_SPEED_MEASUREMENT
+  MPI_Barrier(MPI_COMM_WORLD);
+  if(engine_rank == 0)
+    message("Copying for '%s' took %.3f %s." , props.name,
+	    clocks_from_ticks(getticks() - tic), clocks_getunit());
+#endif
 
   /* Create data space */
   const hid_t h_memspace = H5Screate(H5S_SIMPLE);
@@ -286,10 +325,14 @@ void writeArray_chunk(struct engine* e, hid_t h_data, hid_t h_plist_id,
   }
 
   /* message("Writing %lld '%s', %zd elements = %zd bytes (int=%d) at offset
-   * %zd", */
-  /* 	  N, props.name, N * props.dimension, N * props.dimension * typeSize, */
+   * %zd", N, props.name, N * props.dimension, N * props.dimension * typeSize, */
   /* 	  (int)(N * props.dimension * typeSize), offset); */
 
+#ifdef IO_SPEED_MEASUREMENT
+  MPI_Barrier(MPI_COMM_WORLD);
+  tic = getticks();
+#endif
+
   /* Write temporary buffer to HDF5 dataspace */
   h_err = H5Dwrite(h_data, io_hdf5_type(props.type), h_memspace, h_filespace,
                    h_plist_id, temp);
@@ -297,6 +340,18 @@ void writeArray_chunk(struct engine* e, hid_t h_data, hid_t h_plist_id,
     error("Error while writing data array '%s'.", props.name);
   }
 
+#ifdef IO_SPEED_MEASUREMENT
+  MPI_Barrier(MPI_COMM_WORLD);
+  ticks toc = getticks();
+  float ms = clocks_from_ticks(toc - tic);
+  int megaBytes = N * props.dimension * typeSize / (1024 * 1024);
+  int total = 0;
+  MPI_Reduce(&megaBytes, &total, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+  if (engine_rank == 0)
+    message("H5Dwrite for '%s' (%d MB) took %.3f %s (speed = %f MB/s).",
+            props.name, total, ms, clocks_getunit(), total / (ms / 1000.));
+#endif
+
   /* Free and close everything */
   free(temp);
   H5Sclose(h_memspace);
@@ -328,6 +383,10 @@ void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile,
 
   const size_t typeSize = io_sizeof_type(props.type);
 
+#ifdef IO_SPEED_MEASUREMENT
+  const ticks tic = getticks();
+#endif
+
   /* Work out properties of the array in the file */
   int rank;
   hsize_t shape_total[2];
@@ -440,6 +499,13 @@ void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile,
   H5Pclose(h_prop);
   H5Dclose(h_data);
   H5Pclose(h_plist_id);
+
+#ifdef IO_SPEED_MEASUREMENT
+  MPI_Barrier(MPI_COMM_WORLD);
+  if(engine_rank == 0)
+    message("'%s' took %.3f %s." , props.name,
+	    clocks_from_ticks(getticks() - tic), clocks_getunit());
+#endif
 }
 
 /**
@@ -560,7 +626,7 @@ void read_ic_parallel(char* fileName, const struct unit_system* internal_units,
   /* Read the unit system used in the ICs */
   struct unit_system* ic_units = malloc(sizeof(struct unit_system));
   if (ic_units == NULL) error("Unable to allocate memory for IC unit system");
-  io_read_unit_system(h_file, ic_units);
+  io_read_unit_system(h_file, ic_units, mpi_rank);
 
   /* Tell the user if a conversion will be needed */
   if (mpi_rank == 0) {
@@ -678,14 +744,16 @@ void read_ic_parallel(char* fileName, const struct unit_system* internal_units,
         break;
 
       default:
-        message("Particle Type %d not yet supported. Particles ignored", ptype);
+        if (mpi_rank == 0)
+          message("Particle Type %d not yet supported. Particles ignored",
+                  ptype);
     }
 
     /* Read everything */
     if (!dry_run)
       for (int i = 0; i < num_fields; ++i)
-        readArray(h_grp, list[i], Nparticles, N_total[ptype], offset[ptype],
-                  internal_units, ic_units);
+        readArray(h_grp, list[i], Nparticles, N_total[ptype], mpi_rank,
+                  offset[ptype], internal_units, ic_units);
 
     /* Close particle group */
     H5Gclose(h_grp);
@@ -768,16 +836,42 @@ void write_output_parallel(struct engine* e, const char* baseName,
   /* Prepare the XMF file for the new entry */
   if (mpi_rank == 0) xmfFile = xmf_prepare_file(baseName);
 
-  /* Open HDF5 file */
+  /* Prepare some file-access properties */
   hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS);
-  H5Pset_fapl_mpio(plist_id, comm, info);
+
+  /* Set some MPI-IO parameters */
+  // MPI_Info_set(info, "IBM_largeblock_io", "true");
+  MPI_Info_set(info, "romio_cb_write", "enable");
+  MPI_Info_set(info, "romio_ds_write", "disable");
+
+  /* Activate parallel i/o */
+  hid_t h_err = H5Pset_fapl_mpio(plist_id, comm, info);
+  if (h_err < 0) error("Error setting parallel i/o");
+  
+  /* Align on 4k pages. */
+  h_err = H5Pset_alignment(plist_id, 1024, 4096);
+  if (h_err < 0) error("Error setting Hdf5 alignment");
+
+  /* Disable meta-data cache eviction */
+  H5AC_cache_config_t mdc_config;
+  mdc_config.version = H5AC__CURR_CACHE_CONFIG_VERSION;
+  h_err = H5Pget_mdc_config(plist_id, &mdc_config);
+  if (h_err < 0) error("Error getting the MDC config");
+
+  mdc_config.evictions_enabled = 0; /* false */
+  mdc_config.incr_mode = H5C_incr__off;
+  mdc_config.decr_mode = H5C_decr__off;
+  mdc_config.flash_incr_mode = H5C_flash_incr__off;
+  h_err = H5Pset_mdc_config(plist_id, &mdc_config);
+  if (h_err < 0) error("Error setting the MDC config");
+
+  /* Open HDF5 file with the chosen parameters */
   h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id);
   if (h_file < 0) {
     error("Error while opening file '%s'.", fileName);
   }
 
-  /* Compute offset in the file and total number of
-   * particles */
+  /* Compute offset in the file and total number of particles */
   size_t N[swift_type_count] = {Ngas, Ndm, 0, 0, Nstars, 0};
   long long N_total[swift_type_count] = {0};
   long long offset[swift_type_count] = {0};
@@ -790,8 +884,7 @@ void write_output_parallel(struct engine* e, const char* baseName,
   MPI_Bcast(&N_total, 6, MPI_LONG_LONG_INT, mpi_size - 1, comm);
 
   /* Now everybody konws its offset and the total number of
-   * particles of each
-   * type */
+   * particles of each type */
 
   /* Write the part of the XMF file corresponding to this
    * specific output */
diff --git a/src/partition.c b/src/partition.c
index d9aade781e824defedb05f5a201fec8401da7ee2..51231b5c01e7fd0d3d8d689d5f41fa1017a85537 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -63,10 +63,15 @@ const char *initial_partition_name[] = {
 
 /* Simple descriptions of repartition types for reports. */
 const char *repartition_name[] = {
-    "no", "METIS edge and vertex time weighted cells",
-    "METIS particle count vertex weighted cells",
-    "METIS time edge weighted cells",
-    "METIS particle count vertex and time edge cells"};
+    "no",
+    "METIS edge and vertex task cost weights",
+    "METIS particle count vertex weights",
+    "METIS task cost edge weights",
+    "METIS particle count vertex and task cost edge weights",
+    "METIS vertex task costs and edge delta timebin weights",
+    "METIS particle count vertex and edge delta timebin weights",
+    "METIS edge delta timebin weights",
+};
 
 /* Local functions, if needed. */
 static int check_complete(struct space *s, int verbose, int nregions);
@@ -236,14 +241,14 @@ static void graph_init_metis(struct space *s, idx_t *adjncy, idx_t *xadj) {
  * @param counts the number of particles per cell. Should be
  *               allocated as size s->nr_parts.
  */
-static void accumulate_counts(struct space *s, int *counts) {
+static void accumulate_counts(struct space *s, double *counts) {
 
   struct part *parts = s->parts;
   int *cdim = s->cdim;
   double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]};
   double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
 
-  bzero(counts, sizeof(int) * s->nr_cells);
+  bzero(counts, sizeof(double) * s->nr_cells);
 
   for (size_t k = 0; k < s->nr_parts; k++) {
     for (int j = 0; j < 3; j++) {
@@ -274,6 +279,9 @@ static void accumulate_counts(struct space *s, int *counts) {
 static void split_metis(struct space *s, int nregions, int *celllist) {
 
   for (int i = 0; i < s->nr_cells; i++) s->cells_top[i].nodeID = celllist[i];
+
+  /* To check or visualise the partition dump all the cells. */
+  /* dumpCellRanks("metis_partition", s->cells_top, s->nr_cells); */
 }
 #endif
 
@@ -300,15 +308,16 @@ static int indexvalcmp(const void *p1, const void *p2) {
  * @param s the space of cells to partition.
  * @param nregions the number of regions required in the partition.
  * @param vertexw weights for the cells, sizeof number of cells if used,
- *        NULL for unit weights.
+ *        NULL for unit weights. Need to be in the range of idx_t.
  * @param edgew weights for the graph edges between all cells, sizeof number
  *        of cells * 26 if used, NULL for unit weights. Need to be packed
- *        in CSR format, so same as adjncy array.
+ *        in CSR format, so same as adjncy array. Need to be in the range of
+ *        idx_t.
  * @param celllist on exit this contains the ids of the selected regions,
  *        sizeof number of cells.
  */
-static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew,
-                       int *celllist) {
+static void pick_metis(struct space *s, int nregions, double *vertexw,
+                       double *edgew, int *celllist) {
 
   /* Total number of cells. */
   int ncells = s->cdim[0] * s->cdim[1] * s->cdim[2];
@@ -345,23 +354,56 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew,
   /* Init the vertex weights array. */
   if (vertexw != NULL) {
     for (int k = 0; k < ncells; k++) {
-      if (vertexw[k] > 0) {
+      if (vertexw[k] > 1) {
         weights_v[k] = vertexw[k];
       } else {
         weights_v[k] = 1;
       }
     }
+
+#ifdef SWIFT_DEBUG_CHECKS
+    /* Check weights are all in range. */
+    int failed = 0;
+    for (int k = 0; k < ncells; k++) {
+      if ((idx_t)vertexw[k] < 0) {
+        message("Input vertex weight out of range: %ld", (long)vertexw[k]);
+        failed++;
+      }
+      if (weights_v[k] < 1) {
+        message("Used vertex weight  out of range: %" PRIDX, weights_v[k]);
+        failed++;
+      }
+    }
+    if (failed > 0) error("%d vertex weights are out of range", failed);
+#endif
   }
 
   /* Init the edges weights array. */
   if (edgew != NULL) {
     for (int k = 0; k < 26 * ncells; k++) {
-      if (edgew[k] > 0) {
+      if (edgew[k] > 1) {
         weights_e[k] = edgew[k];
       } else {
         weights_e[k] = 1;
       }
     }
+
+#ifdef SWIFT_DEBUG_CHECKS
+    /* Check weights are all in range. */
+    int failed = 0;
+    for (int k = 0; k < 26 * ncells; k++) {
+
+      if ((idx_t)edgew[k] < 0) {
+        message("Input edge weight out of range: %ld", (long)edgew[k]);
+        failed++;
+      }
+      if (weights_e[k] < 1) {
+        message("Used edge weight out of range: %" PRIDX, weights_e[k]);
+        failed++;
+      }
+    }
+    if (failed > 0) error("%d edge weights are out of range", failed);
+#endif
   }
 
   /* Set the METIS options. */
@@ -380,8 +422,8 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew,
   idx_t objval;
 
   /* Dump graph in METIS format */
-  /* dumpMETISGraph("metis_graph", idx_ncells, one, xadj, adjncy,
-   *                weights_v, NULL, weights_e);
+  /*dumpMETISGraph("metis_graph", idx_ncells, one, xadj, adjncy,
+   *               weights_v, NULL, weights_e);
    */
   if (METIS_PartGraphKway(&idx_ncells, &one, xadj, adjncy, weights_v, NULL,
                           weights_e, &idx_nregions, NULL, NULL, options,
@@ -465,31 +507,28 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew,
 
 #if defined(WITH_MPI) && defined(HAVE_METIS)
 /**
- * @brief Repartition the cells amongst the nodes using task timings
- *        as edge weights and vertex weights also from task timings
+ * @brief Repartition the cells amongst the nodes using task costs
+ *        as edge weights and vertex weights also from task costs
  *        or particle cells counts.
  *
  * @param partweights whether particle counts will be used as vertex weights.
  * @param bothweights whether vertex and edge weights will be used, otherwise
  *                    only edge weights will be used.
+ * @param timebins use timebins as edge weights.
  * @param nodeID our nodeID.
  * @param nr_nodes the number of nodes.
  * @param s the space of cells holding our local particles.
  * @param tasks the completed tasks from the last engine step for our node.
  * @param nr_tasks the number of tasks.
  */
-static void repart_edge_metis(int partweights, int bothweights, int nodeID,
-                              int nr_nodes, struct space *s, struct task *tasks,
-                              int nr_tasks) {
+static void repart_edge_metis(int partweights, int bothweights, int timebins,
+                              int nodeID, int nr_nodes, struct space *s,
+                              struct task *tasks, int nr_tasks) {
 
   /* Create weight arrays using task ticks for vertices and edges (edges
    * assume the same graph structure as used in the part_ calls). */
   int nr_cells = s->nr_cells;
   struct cell *cells = s->cells_top;
-  float wscale = 1.f, wscale_buff = 0.0;
-  int wtot = 0;
-  int wmax = 1e9 / nr_nodes;
-  int wmin;
 
   /* Allocate and fill the adjncy indexing array defining the graph of
    * cells. */
@@ -499,16 +538,16 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
   graph_init_metis(s, inds, NULL);
 
   /* Allocate and init weights. */
-  int *weights_v = NULL;
-  int *weights_e = NULL;
+  double *weights_v = NULL;
+  double *weights_e = NULL;
   if (bothweights) {
-    if ((weights_v = (int *)malloc(sizeof(int) * nr_cells)) == NULL)
+    if ((weights_v = (double *)malloc(sizeof(double) * nr_cells)) == NULL)
       error("Failed to allocate vertex weights arrays.");
-    bzero(weights_v, sizeof(int) * nr_cells);
+    bzero(weights_v, sizeof(double) * nr_cells);
   }
-  if ((weights_e = (int *)malloc(sizeof(int) * 26 * nr_cells)) == NULL)
+  if ((weights_e = (double *)malloc(sizeof(double) * 26 * nr_cells)) == NULL)
     error("Failed to allocate edge weights arrays.");
-  bzero(weights_e, sizeof(int) * 26 * nr_cells);
+  bzero(weights_e, sizeof(double) * 26 * nr_cells);
 
   /* Generate task weights for vertices. */
   int taskvweights = (bothweights && !partweights);
@@ -521,19 +560,8 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
     /* Skip un-interesting tasks. */
     if (t->cost == 0) continue;
 
-    /* Get the task weight. */
-    int w = t->cost * wscale;
-
-    /* Do we need to re-scale? */
-    wtot += w;
-    while (wtot > wmax) {
-      wscale /= 2;
-      wtot /= 2;
-      w /= 2;
-      for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5;
-      if (taskvweights)
-        for (int k = 0; k < nr_cells; k++) weights_v[k] *= 0.5;
-    }
+    /* Get the task weight based on costs. */
+    double w = (double)t->cost;
 
     /* Get the top-level cells involved. */
     struct cell *ci, *cj;
@@ -552,9 +580,9 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
     if (t->type == task_type_ghost || t->type == task_type_kick1 ||
         t->type == task_type_kick2 || t->type == task_type_timestep ||
         t->type == task_type_drift_part || t->type == task_type_drift_gpart) {
+
       /* Particle updates add only to vertex weight. */
       if (taskvweights) weights_v[cid] += w;
-
     }
 
     /* Self interaction? */
@@ -575,64 +603,72 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
 
       }
 
-      /* Distinct cells with local ci? */
-      else if (ci->nodeID == nodeID) {
+      /* Distinct cells. */
+      else {
         /* Index of the jth cell. */
         int cjd = cj - cells;
 
-        /* Add half of weight to each cell. */
-        if (taskvweights) {
-          if (ci->nodeID == nodeID) weights_v[cid] += 0.5 * w;
+        /* Local cells add weight to vertices. */
+        if (taskvweights && ci->nodeID == nodeID) {
+          weights_v[cid] += 0.5 * w;
           if (cj->nodeID == nodeID) weights_v[cjd] += 0.5 * w;
         }
 
-        /* Add weights to edge. */
-        int kk;
-        for (kk = 26 * cid; inds[kk] != cjd; kk++)
-          ;
-        weights_e[kk] += w;
-        for (kk = 26 * cjd; inds[kk] != cid; kk++)
-          ;
-        weights_e[kk] += w;
+        if (timebins) {
+          /* Add weights to edge for all cells based on the expected
+           * interaction time (calculated as the time to the last expected
+           * time) as we want to avoid having active cells on the edges, so we
+           * cut for that. Note that weight is added to the local and remote
+           * cells, as we want to keep both away from any cuts; this can
+           * overflow int, so take care. */
+          int dti = num_time_bins - get_time_bin(ci->ti_end_min);
+          int dtj = num_time_bins - get_time_bin(cj->ti_end_min);
+          double dt = (double)(1 << dti) + (double)(1 << dtj);
+
+          /* ci */
+          int kk;
+          for (kk = 26 * cid; inds[kk] != cjd; kk++)
+            ;
+          weights_e[kk] += dt;
+
+          /* cj */
+          for (kk = 26 * cjd; inds[kk] != cid; kk++)
+            ;
+          weights_e[kk] += dt;
+        } else {
+
+          /* Add weights from task costs to the edge. */
+
+          /* ci */
+          int kk;
+          for (kk = 26 * cid; inds[kk] != cjd; kk++)
+            ;
+          weights_e[kk] += w;
+
+          /* cj */
+          for (kk = 26 * cjd; inds[kk] != cid; kk++)
+            ;
+          weights_e[kk] += w;
+        }
       }
     }
   }
 
   /* Re-calculate the vertices if using particle counts. */
-  if (partweights && bothweights) {
-    accumulate_counts(s, weights_v);
-
-    /*  Rescale to balance times. */
-    float vwscale = (float)wtot / (float)nr_tasks;
-    for (int k = 0; k < nr_cells; k++) {
-      weights_v[k] *= vwscale;
-    }
-  }
-
-  /* Get the minimum scaling and re-scale if necessary. */
-  int res;
-  if ((res = MPI_Allreduce(&wscale, &wscale_buff, 1, MPI_FLOAT, MPI_MIN,
-                           MPI_COMM_WORLD)) != MPI_SUCCESS)
-    mpi_error(res, "Failed to allreduce the weight scales.");
-
-  if (wscale_buff != wscale) {
-    float scale = wscale_buff / wscale;
-    for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale;
-    if (bothweights)
-      for (int k = 0; k < nr_cells; k++) weights_v[k] *= scale;
-  }
+  if (partweights && bothweights) accumulate_counts(s, weights_v);
 
   /* Merge the weights arrays across all nodes. */
+  int res;
   if (bothweights) {
     if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_v, weights_v,
-                          nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) !=
+                          nr_cells, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD)) !=
         MPI_SUCCESS)
       mpi_error(res, "Failed to allreduce vertex weights.");
   }
 
   if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e,
-                        26 * nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) !=
-      MPI_SUCCESS)
+                        26 * nr_cells, MPI_DOUBLE, MPI_SUM, 0,
+                        MPI_COMM_WORLD)) != MPI_SUCCESS)
     mpi_error(res, "Failed to allreduce edge weights.");
 
   /* Allocate cell list for the partition. */
@@ -641,39 +677,72 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
 
   /* As of here, only one node needs to compute the partition. */
   if (nodeID == 0) {
-    /* Final rescale of all weights to avoid a large range. Large ranges
-     * have been seen to cause an incomplete graph. */
-    wmin = wmax;
-    wmax = 0;
-    for (int k = 0; k < 26 * nr_cells; k++) {
-      wmax = weights_e[k] > wmax ? weights_e[k] : wmax;
-      wmin = weights_e[k] < wmin ? weights_e[k] : wmin;
-    }
+
+    /* We need to rescale the weights into the range of an integer for METIS
+     * (really range of idx_t). Also we would like the range of vertex and
+     * edge weights to be similar so they balance. */
+    double wminv = 0.0;
+    double wmaxv = 0.0;
     if (bothweights) {
+      wminv = weights_v[0];
+      wmaxv = weights_v[0];
       for (int k = 0; k < nr_cells; k++) {
-        wmax = weights_v[k] > wmax ? weights_v[k] : wmax;
-        wmin = weights_v[k] < wmin ? weights_v[k] : wmin;
+        wmaxv = weights_v[k] > wmaxv ? weights_v[k] : wmaxv;
+        wminv = weights_v[k] < wminv ? weights_v[k] : wminv;
       }
     }
 
-    if ((wmax - wmin) > metis_maxweight) {
-      wscale = metis_maxweight / (wmax - wmin);
-      for (int k = 0; k < 26 * nr_cells; k++) {
-        weights_e[k] = (weights_e[k] - wmin) * wscale + 1;
-      }
-      if (bothweights) {
+    double wmine = weights_e[0];
+    double wmaxe = weights_e[0];
+    for (int k = 0; k < 26 * nr_cells; k++) {
+      wmaxe = weights_e[k] > wmaxe ? weights_e[k] : wmaxe;
+      wmine = weights_e[k] < wmine ? weights_e[k] : wmine;
+    }
+
+    if (bothweights) {
+
+      /* Make range the same in both weights systems. */
+      if ((wmaxv - wminv) > (wmaxe - wmine)) {
+        double wscale = 1.0;
+        if ((wmaxe - wmine) > 0.0) {
+          wscale = (wmaxv - wminv) / (wmaxe - wmine);
+        }
+        for (int k = 0; k < 26 * nr_cells; k++) {
+          weights_e[k] = (weights_e[k] - wmine) * wscale + wminv;
+        }
+        wmine = wminv;
+        wmaxe = wmaxv;
+
+      } else {
+        double wscale = 1.0;
+        if ((wmaxv - wminv) > 0.0) {
+          wscale = (wmaxe - wmine) / (wmaxv - wminv);
+        }
         for (int k = 0; k < nr_cells; k++) {
-          weights_v[k] = (weights_v[k] - wmin) * wscale + 1;
+          weights_v[k] = (weights_v[k] - wminv) * wscale + wmine;
         }
+        wminv = wmine;
+        wmaxv = wmaxe;
+      }
+
+      /* Scale to the METIS range. */
+      double wscale = 1.0;
+      if ((wmaxv - wminv) > 0.0) {
+         wscale = (metis_maxweight - 1.0) / (wmaxv - wminv);
+      }
+      for (int k = 0; k < nr_cells; k++) {
+        weights_v[k] = (weights_v[k] - wminv) * wscale + 1.0;
       }
     }
 
-    /* Make sure there are no zero weights. */
-    for (int k = 0; k < 26 * nr_cells; k++)
-      if (weights_e[k] == 0) weights_e[k] = 1;
-    if (bothweights)
-      for (int k = 0; k < nr_cells; k++)
-        if (weights_v[k] == 0) weights_v[k] = 1;
+    /* Scale to the METIS range. */
+    double wscale = 1.0;
+    if ((wmaxe - wmine) > 0.0) {
+      wscale = (metis_maxweight - 1.0) / (wmaxe - wmine);
+    }
+    for (int k = 0; k < 26 * nr_cells; k++) {
+      weights_e[k] = (weights_e[k] - wmine) * wscale + 1.0;
+    }
 
     /* And partition, use both weights or not as requested. */
     if (bothweights)
@@ -736,8 +805,8 @@ static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) {
 
   /* Use particle counts as vertex weights. */
   /* Space for particles per cell counts, which will be used as weights. */
-  int *weights = NULL;
-  if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL)
+  double *weights = NULL;
+  if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL)
     error("Failed to allocate weights buffer.");
 
   /* Check each particle and accumulate the counts per cell. */
@@ -745,8 +814,8 @@ static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) {
 
   /* Get all the counts from all the nodes. */
   int res;
-  if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, MPI_SUM,
-                           MPI_COMM_WORLD)) != MPI_SUCCESS)
+  if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE,
+                           MPI_SUM, MPI_COMM_WORLD)) != MPI_SUCCESS)
     mpi_error(res, "Failed to allreduce particle cell weights.");
 
   /* Main node does the partition calculation. */
@@ -787,35 +856,32 @@ void partition_repartition(struct repartition *reparttype, int nodeID,
 
 #if defined(WITH_MPI) && defined(HAVE_METIS)
 
-  if (reparttype->type == REPART_METIS_BOTH ||
-      reparttype->type == REPART_METIS_EDGE ||
-      reparttype->type == REPART_METIS_VERTEX_EDGE) {
+  if (reparttype->type == REPART_METIS_VERTEX_COSTS_EDGE_COSTS) {
+    repart_edge_metis(0, 1, 0, nodeID, nr_nodes, s, tasks, nr_tasks);
 
-    int partweights;
-    int bothweights;
-    if (reparttype->type == REPART_METIS_VERTEX_EDGE)
-      partweights = 1;
-    else
-      partweights = 0;
+  } else if (reparttype->type == REPART_METIS_EDGE_COSTS) {
+    repart_edge_metis(0, 0, 0, nodeID, nr_nodes, s, tasks, nr_tasks);
 
-    if (reparttype->type == REPART_METIS_BOTH)
-      bothweights = 1;
-    else
-      bothweights = 0;
+  } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS_EDGE_COSTS) {
+    repart_edge_metis(1, 1, 0, nodeID, nr_nodes, s, tasks, nr_tasks);
 
-    repart_edge_metis(partweights, bothweights, nodeID, nr_nodes, s, tasks,
-                      nr_tasks);
+  } else if (reparttype->type == REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS) {
+    repart_edge_metis(0, 1, 1, nodeID, nr_nodes, s, tasks, nr_tasks);
 
-  } else if (reparttype->type == REPART_METIS_VERTEX) {
+  } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS) {
+    repart_edge_metis(1, 1, 1, nodeID, nr_nodes, s, tasks, nr_tasks);
 
+  } else if (reparttype->type == REPART_METIS_EDGE_TIMEBINS) {
+    repart_edge_metis(0, 0, 1, nodeID, nr_nodes, s, tasks, nr_tasks);
+
+  } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS) {
     repart_vertex_metis(s, nodeID, nr_nodes);
 
   } else if (reparttype->type == REPART_NONE) {
-
     /* Doing nothing. */
 
   } else {
-    error("Unknown repartition type");
+    error("Impossible repartition type");
   }
 #else
   error("SWIFT was not compiled with METIS support.");
@@ -884,17 +950,17 @@ void partition_initial_partition(struct partition *initial_partition,
 
     /* Space for particles per cell counts, which will be used as weights or
      * not. */
-    int *weights = NULL;
+    double *weights = NULL;
     if (initial_partition->type == INITPART_METIS_WEIGHT) {
-      if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL)
+      if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL)
         error("Failed to allocate weights buffer.");
-      bzero(weights, sizeof(int) * s->nr_cells);
+      bzero(weights, sizeof(double) * s->nr_cells);
 
       /* Check each particle and accumilate the counts per cell. */
       accumulate_counts(s, weights);
 
       /* Get all the counts from all the nodes. */
-      if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, MPI_SUM,
+      if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE, MPI_SUM,
                         MPI_COMM_WORLD) != MPI_SUCCESS)
         error("Failed to allreduce particle cell weights.");
     }
@@ -969,10 +1035,10 @@ void partition_init(struct partition *partition,
 
 /* Defaults make use of METIS if available */
 #ifdef HAVE_METIS
-  const char *default_repart = "task_weights";
+  const char *default_repart = "costs/costs";
   const char *default_part = "simple_metis";
 #else
-  const char *default_repart = "none";
+  const char *default_repart = "none/none";
   const char *default_part = "grid";
 #endif
 
@@ -1029,32 +1095,40 @@ void partition_init(struct partition *partition,
   parser_get_opt_param_string(params, "DomainDecomposition:repartition_type",
                               part_type, default_repart);
 
-  switch (part_type[0]) {
-    case 'n':
-      repartition->type = REPART_NONE;
-      break;
+  if (strcmp("none/none", part_type) == 0) {
+    repartition->type = REPART_NONE;
+
 #ifdef HAVE_METIS
-    case 't':
-      repartition->type = REPART_METIS_BOTH;
-      break;
-    case 'e':
-      repartition->type = REPART_METIS_EDGE;
-      break;
-    case 'p':
-      repartition->type = REPART_METIS_VERTEX;
-      break;
-    case 'h':
-      repartition->type = REPART_METIS_VERTEX_EDGE;
-      break;
-    default:
-      message("Invalid choice of re-partition type '%s'.", part_type);
-      error(
-          "Permitted values are: 'none', 'task_weights', 'particle_weights',"
-          "'edge_task_weights' or 'hybrid_weights'");
+  } else if (strcmp("costs/costs", part_type) == 0) {
+    repartition->type = REPART_METIS_VERTEX_COSTS_EDGE_COSTS;
+
+  } else if (strcmp("counts/none", part_type) == 0) {
+    repartition->type = REPART_METIS_VERTEX_COUNTS;
+
+  } else if (strcmp("none/costs", part_type) == 0) {
+    repartition->type = REPART_METIS_EDGE_COSTS;
+
+  } else if (strcmp("counts/costs", part_type) == 0) {
+    repartition->type = REPART_METIS_VERTEX_COUNTS_EDGE_COSTS;
+
+  } else if (strcmp("costs/time", part_type) == 0) {
+    repartition->type = REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS;
+
+  } else if (strcmp("counts/time", part_type) == 0) {
+    repartition->type = REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS;
+
+  } else if (strcmp("none/time", part_type) == 0) {
+    repartition->type = REPART_METIS_EDGE_TIMEBINS;
+  } else {
+    message("Invalid choice of re-partition type '%s'.", part_type);
+    error(
+        "Permitted values are: 'none/none', 'costs/costs',"
+        "'counts/none', 'none/costs', 'counts/costs', "
+        "'costs/time', 'counts/time' or 'none/time'");
 #else
-    default:
-      message("Invalid choice of re-partition type '%s'.", part_type);
-      error("Permitted values are: 'none' when compiled without METIS.");
+  } else {
+    message("Invalid choice of re-partition type '%s'.", part_type);
+    error("Permitted values are: 'none/none' when compiled without METIS.");
 #endif
   }
 
diff --git a/src/partition.h b/src/partition.h
index 03523b165a930b085224e458ac0dd8c8232a578d..c3eade190c9514efb4c44011e3990745e20846fd 100644
--- a/src/partition.h
+++ b/src/partition.h
@@ -43,10 +43,13 @@ struct partition {
 /* Repartition type to use. */
 enum repartition_type {
   REPART_NONE = 0,
-  REPART_METIS_BOTH,
-  REPART_METIS_VERTEX,
-  REPART_METIS_EDGE,
-  REPART_METIS_VERTEX_EDGE
+  REPART_METIS_VERTEX_COSTS_EDGE_COSTS,
+  REPART_METIS_VERTEX_COUNTS,
+  REPART_METIS_EDGE_COSTS,
+  REPART_METIS_VERTEX_COUNTS_EDGE_COSTS,
+  REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS,
+  REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS,
+  REPART_METIS_EDGE_TIMEBINS
 };
 
 /* Repartition preferences. */
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index 8e5a9e609c02b1697c6e8a052127d0a60e88961c..39647a2bb1f4c76914a946e49138519983639134 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -26,7 +26,8 @@
 /* Local headers. */
 #include "active.h"
 
-#ifdef WITH_VECTORIZATION
+#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
+
 static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);
 
 /**
@@ -384,8 +385,7 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
  * @param rshift cutoff shift
  * @param hi_max_raw Maximal smoothing length in cell ci
  * @param hj_max_raw Maximal smoothing length in cell cj
- * @param hi_max Maximal smoothing length in cell ci scaled by kernel_gamma
- * @param hj_max Maximal smoothing length in cell cj scaled by kernel_gamma
+ * @param h_max Maximal smoothing length in both cells scaled by kernel_gamma
  * @param di_max Maximal position on the axis that can interact in cell ci
  * @param dj_min Minimal position on the axis that can interact in cell ci
  * @param max_index_i array to hold the maximum distances of pi particles into
@@ -402,9 +402,8 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj,
                                   const struct entry *restrict sort_j,
                                   const float dx_max, const float rshift,
                                   const double hi_max_raw,
-                                  const double hj_max_raw, const double hi_max,
-                                  const double hj_max, const double di_max,
-                                  const double dj_min, int *max_index_i,
+                                  const double hj_max_raw, const double h_max,
+                                  const double di_max, const double dj_min, int *max_index_i,
                                   int *max_index_j, int *init_pi, int *init_pj,
                                   const timebin_t max_active_bin) {
 
@@ -419,7 +418,7 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj,
   first_pi = ci->count;
   int active_id = first_pi - 1;
   while (first_pi > 0 &&
-         sort_i[first_pi - 1].d + dx_max + max(hi_max, hj_max) > dj_min) {
+         sort_i[first_pi - 1].d + dx_max + h_max - rshift > dj_min) {
     first_pi--;
     /* Store the index of the particle if it is active. */
     if (part_is_active_no_debug(&parts_i[sort_i[first_pi].i], max_active_bin))
@@ -469,7 +468,7 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj,
   last_pj = -1;
   active_id = last_pj;
   while (last_pj < cj->count &&
-         sort_j[last_pj + 1].d - max(hj_max, hi_max) - dx_max < di_max) {
+      sort_j[last_pj + 1].d - h_max - dx_max < di_max) {
     last_pj++;
     /* Store the index of the particle if it is active. */
     if (part_is_active_no_debug(&parts_j[sort_j[last_pj].i], max_active_bin))
@@ -515,7 +514,7 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj,
   *init_pj = last_pj;
 }
 
-#endif /* WITH_VECTORIZATION */
+#endif /* WITH_VECTORIZATION && GADGET2_SPH */
 
 /**
  * @brief Compute the cell self-interaction (non-symmetric) using vector
@@ -527,7 +526,8 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj,
 __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
     struct runner *r, struct cell *restrict c) {
 
-#ifdef WITH_VECTORIZATION
+#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
+
   /* Get some local variables */
   const struct engine *e = r->e;
   const timebin_t max_active_bin = e->max_active_bin;
@@ -541,20 +541,25 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
 
   if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell.");
 
+#ifdef SWIFT_DEBUG_CHECKS
+  for (int i = 0; i < count; i++) {
+    /* Check that particles have been drifted to the current time */
+    if (parts[i].ti_drift != e->ti_current)
+      error("Particle pi not drifted to current time");
+  }
+#endif
+
   /* Get the particle cache from the runner and re-allocate
    * the cache if it is not big enough for the cell. */
   struct cache *restrict cell_cache = &r->ci_cache;
 
-  if (cell_cache->count < count) {
-    cache_init(cell_cache, count);
-  }
+  if (cell_cache->count < count) cache_init(cell_cache, count);
 
   /* Read the particles from the cell and store them locally in the cache. */
   cache_read_particles(c, cell_cache);
 
   /* Create secondary cache to store particle interactions. */
   struct c2_cache int_cache;
-  int icount = 0, icount_align = 0;
 
   /* Loop over the particles in the cell. */
   for (int pid = 0; pid < count; pid++) {
@@ -607,6 +612,10 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       }
     }
 
+    /* The number of interactions for pi and the padded version of it to
+     * make it a multiple of VEC_SIZE. */
+    int icount = 0, icount_align = 0;
+
     /* Find all of particle pi's interacions and store needed values in the
      * secondary cache.*/
     for (int pjd = 0; pjd < count_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
@@ -727,6 +736,11 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
   } /* loop over all particles. */
 
   TIMER_TOC(timer_doself_density);
+
+#else
+
+  error("Incorrectly calling vectorized Gadget-2 functions!");
+
 #endif /* WITH_VECTORIZATION */
 }
 
@@ -744,25 +758,23 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec(
     struct runner *r, struct cell *restrict c, struct part *restrict parts,
     int *restrict ind, int pi_count) {
 
-#ifdef WITH_VECTORIZATION
+#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
+
   const int count = c->count;
 
-  TIMER_TIC
+  TIMER_TIC;
 
   /* Get the particle cache from the runner and re-allocate
    * the cache if it is not big enough for the cell. */
   struct cache *restrict cell_cache = &r->ci_cache;
 
-  if (cell_cache->count < count) {
-    cache_init(cell_cache, count);
-  }
+  if (cell_cache->count < count) cache_init(cell_cache, count);
 
   /* Read the particles from the cell and store them locally in the cache. */
   cache_read_particles(c, cell_cache);
 
   /* Create secondary cache to store particle interactions. */
   struct c2_cache int_cache;
-  int icount = 0, icount_align = 0;
 
   /* Loop over the subset of particles in the parts that need updating. */
   for (int pid = 0; pid < pi_count; pid++) {
@@ -819,6 +831,10 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec(
       }
     }
 
+    /* The number of interactions for pi and the padded version of it to
+     * make it a multiple of VEC_SIZE. */
+    int icount = 0, icount_align = 0;
+
     /* Find all of particle pi's interacions and store needed values in the
      * secondary cache.*/
     for (int pjd = 0; pjd < count_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
@@ -941,6 +957,11 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec(
   } /* loop over all particles. */
 
   TIMER_TOC(timer_doself_subset);
+
+#else
+
+  error("Incorrectly calling vectorized Gadget-2 functions!");
+
 #endif /* WITH_VECTORIZATION */
 }
 
@@ -954,13 +975,10 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec(
 __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
     struct runner *r, struct cell *restrict c) {
 
-#ifdef WITH_VECTORIZATION
-  const struct engine *e = r->e;
-  struct part *restrict pi;
-  int count_align;
+#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
 
+  const struct engine *e = r->e;
   const timebin_t max_active_bin = e->max_active_bin;
-
   struct part *restrict parts = c->parts;
   const int count = c->count;
 
@@ -970,31 +988,28 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
 
   if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell.");
 
+#ifdef SWIFT_DEBUG_CHECKS
+  for (int i = 0; i < count; i++) {
+    /* Check that particles have been drifted to the current time */
+    if (parts[i].ti_drift != e->ti_current)
+      error("Particle pi not drifted to current time");
+  }
+#endif
+
   /* Get the particle cache from the runner and re-allocate
    * the cache if it is not big enough for the cell. */
   struct cache *restrict cell_cache = &r->ci_cache;
 
-  if (cell_cache->count < count) {
-    cache_init(cell_cache, count);
-  }
+  if (cell_cache->count < count) cache_init(cell_cache, count);
 
   /* Read the particles from the cell and store them locally in the cache. */
   cache_read_force_particles(c, cell_cache);
 
-#ifdef SWIFT_DEBUG_CHECKS
-  for (int i = 0; i < count; i++) {
-    pi = &c->parts[i];
-    /* Check that particles have been drifted to the current time */
-    if (pi->ti_drift != e->ti_current)
-      error("Particle pi not drifted to current time");
-  }
-#endif
-
   /* Loop over the particles in the cell. */
   for (int pid = 0; pid < count; pid++) {
 
     /* Get a pointer to the ith particle. */
-    pi = &parts[pid];
+    struct part *restrict pi = &parts[pid];
 
     /* Is the ith particle active? */
     if (!part_is_active_no_debug(pi, max_active_bin)) continue;
@@ -1031,7 +1046,7 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
     vector v_entropy_dtSum = vector_setzero();
 
     /* Pad cache if there is a serial remainder. */
-    count_align = count;
+    int count_align = count;
     int rem = count % VEC_SIZE;
     if (rem != 0) {
       int pad = VEC_SIZE - rem;
@@ -1129,6 +1144,11 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
   } /* loop over all particles. */
 
   TIMER_TOC(timer_doself_force);
+
+#else
+
+  error("Incorrectly calling vectorized Gadget-2 functions!");
+
 #endif /* WITH_VECTORIZATION */
 }
 
@@ -1146,7 +1166,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
                                 struct cell *cj, const int sid,
                                 const double *shift) {
 
-#ifdef WITH_VECTORIZATION
+#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
+
   const struct engine *restrict e = r->e;
   const timebin_t max_active_bin = e->max_active_bin;
 
@@ -1173,6 +1194,16 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   const int active_ci = cell_is_active(ci, e);
   const int active_cj = cell_is_active(cj, e);
 
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that particles have been drifted to the current time */
+  for (int pid = 0; pid < count_i; pid++)
+    if (parts_i[pid].ti_drift != e->ti_current)
+      error("Particle pi not drifted to current time");
+  for (int pjd = 0; pjd < count_j; pjd++)
+    if (parts_j[pjd].ti_drift != e->ti_current)
+      error("Particle pj not drifted to current time");
+#endif
+
   /* Count number of particles that are in range and active*/
   int numActive = 0;
 
@@ -1206,16 +1237,12 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   struct cache *restrict ci_cache = &r->ci_cache;
   struct cache *restrict cj_cache = &r->cj_cache;
 
-  if (ci_cache->count < count_i) {
-    cache_init(ci_cache, count_i);
-  }
-  if (cj_cache->count < count_j) {
-    cache_init(cj_cache, count_j);
-  }
+  if (ci_cache->count < count_i) cache_init(ci_cache, count_i);
+  if (cj_cache->count < count_j) cache_init(cj_cache, count_j);
 
   int first_pi, last_pj;
-  int *max_index_i __attribute__((aligned(sizeof(int) * VEC_SIZE)));
-  int *max_index_j __attribute__((aligned(sizeof(int) * VEC_SIZE)));
+  int *restrict max_index_i SWIFT_CACHE_ALIGN;
+  int *restrict max_index_j SWIFT_CACHE_ALIGN;
 
   max_index_i = r->ci_cache.max_index;
   max_index_j = r->cj_cache.max_index;
@@ -1228,8 +1255,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
                               &first_pi, &last_pj, max_active_bin);
 
   /* Limits of the outer loops. */
-  int first_pi_loop = first_pi;
-  int last_pj_loop = last_pj;
+  const int first_pi_loop = first_pi;
+  const int last_pj_loop_end = last_pj + 1;
 
   /* Take the max/min of both values calculated to work out how many particles
    * to read into the cache. */
@@ -1262,7 +1289,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       if (di_test < dj_min) continue;
 
       /* Determine the exit iteration of the interaction loop. */
-      const int exit_iteration = max_index_i[pid];
+      const int exit_iteration_end = max_index_i[pid] + 1;
 
       /* Fill particle pi vectors. */
       const vector v_pix = vector_set1(ci_cache->x[ci_cache_idx]);
@@ -1292,7 +1319,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       /* Loop over the parts in cj. Making sure to perform an iteration of the
        * loop even if exit_iteration_align is zero and there is only one
        * particle to interact with.*/
-      for (int pjd = 0; pjd <= exit_iteration; pjd += VEC_SIZE) {
+      for (int pjd = 0; pjd < exit_iteration_end; pjd += VEC_SIZE) {
 
         /* Get the cache index to the jth particle. */
         const int cj_cache_idx = pjd;
@@ -1363,7 +1390,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   if (active_cj) {
 
     /* Loop over the parts in cj until nothing is within range in ci. */
-    for (int pjd = 0; pjd <= last_pj_loop; pjd++) {
+    for (int pjd = 0; pjd < last_pj_loop_end; pjd++) {
 
       /* Get a hold of the jth part in cj. */
       struct part *restrict pj = &parts_j[sort_j[pjd].i];
@@ -1486,6 +1513,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
   TIMER_TOC(timer_dopair_density);
 
+#else
+
+  error("Incorrectly calling vectorized Gadget-2 functions!");
+
 #endif /* WITH_VECTORIZATION */
 }
 
@@ -1503,7 +1534,8 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
                               struct cell *cj, const int sid,
                               const double *shift) {
 
-#ifdef WITH_VECTORIZATION
+#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
+
   const struct engine *restrict e = r->e;
   const timebin_t max_active_bin = e->max_active_bin;
 
@@ -1520,7 +1552,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
   /* Get some other useful values. */
   const int count_i = ci->count;
   const int count_j = cj->count;
-  const double hi_max = ci->h_max * kernel_gamma - rshift;
+  const double hi_max = ci->h_max * kernel_gamma;
   const double hj_max = cj->h_max * kernel_gamma;
   const double hi_max_raw = ci->h_max;
   const double hj_max_raw = cj->h_max;
@@ -1532,6 +1564,16 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
   const int active_ci = cell_is_active(ci, e);
   const int active_cj = cell_is_active(cj, e);
 
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that particles have been drifted to the current time */
+  for (int pid = 0; pid < count_i; pid++)
+    if (parts_i[pid].ti_drift != e->ti_current)
+      error("Particle pi not drifted to current time");
+  for (int pjd = 0; pjd < count_j; pjd++)
+    if (parts_j[pjd].ti_drift != e->ti_current)
+      error("Particle pj not drifted to current time");
+#endif
+
   /* Check if any particles are active and in range */
   int numActive = 0;
 
@@ -1541,7 +1583,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
 
   if (active_ci) {
     for (int pid = count_i - 1;
-         pid >= 0 && sort_i[pid].d + h_max + dx_max > dj_min; pid--) {
+         pid >= 0 && sort_i[pid].d + h_max + dx_max - rshift > dj_min; pid--) {
       struct part *restrict pi = &parts_i[sort_i[pid].i];
       if (part_is_active(pi, e)) {
         numActive++;
@@ -1569,16 +1611,12 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
   struct cache *restrict ci_cache = &r->ci_cache;
   struct cache *restrict cj_cache = &r->cj_cache;
 
-  if (ci_cache->count < count_i) {
-    cache_init(ci_cache, count_i);
-  }
-  if (cj_cache->count < count_j) {
-    cache_init(cj_cache, count_j);
-  }
+  if (ci_cache->count < count_i) cache_init(ci_cache, count_i);
+  if (cj_cache->count < count_j) cache_init(cj_cache, count_j);
 
   int first_pi, last_pj;
-  int *max_index_i SWIFT_CACHE_ALIGN;
-  int *max_index_j SWIFT_CACHE_ALIGN;
+  int *restrict max_index_i SWIFT_CACHE_ALIGN;
+  int *restrict max_index_j SWIFT_CACHE_ALIGN;
 
   max_index_i = r->ci_cache.max_index;
   max_index_j = r->cj_cache.max_index;
@@ -1587,7 +1625,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
   /* Also find the first pi that interacts with any particle in cj and the last
    * pj that interacts with any particle in ci. */
   populate_max_index_no_cache_force(ci, cj, sort_i, sort_j, dx_max, rshift,
-                                    hi_max_raw, hj_max_raw, hi_max, hj_max,
+                                    hi_max_raw, hj_max_raw, h_max,
                                     di_max, dj_min, max_index_i, max_index_j,
                                     &first_pi, &last_pj, max_active_bin);
 
@@ -1599,7 +1637,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
 
   /* Limits of the outer loops. */
   const int first_pi_loop = first_pi;
-  const int last_pj_loop = last_pj;
+  const int last_pj_loop_end = last_pj + 1;
 
   /* Take the max/min of both values calculated to work out how many particles
    * to read into the cache. */
@@ -1632,7 +1670,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
       if (di_test < dj_min) continue;
 
       /* Determine the exit iteration of the interaction loop. */
-      const int exit_iteration = max_index_i[pid];
+      const int exit_iteration_end = max_index_i[pid] + 1;
 
       /* Fill particle pi vectors. */
       const vector v_pix = vector_set1(ci_cache->x[ci_cache_idx]);
@@ -1665,7 +1703,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
       /* Loop over the parts in cj. Making sure to perform an iteration of the
        * loop even if exit_iteration_align is zero and there is only one
        * particle to interact with.*/
-      for (int pjd = 0; pjd <= exit_iteration; pjd += VEC_SIZE) {
+      for (int pjd = 0; pjd < exit_iteration_end; pjd += VEC_SIZE) {
 
         /* Get the cache index to the jth particle. */
         const int cj_cache_idx = pjd;
@@ -1746,7 +1784,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
   if (active_cj) {
 
     /* Loop over the parts in cj until nothing is within range in ci. */
-    for (int pjd = 0; pjd <= last_pj_loop; pjd++) {
+    for (int pjd = 0; pjd < last_pj_loop_end; pjd++) {
 
       /* Get a hold of the jth part in cj. */
       struct part *restrict pj = &parts_j[sort_j[pjd].i];
@@ -1878,5 +1916,9 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
     TIMER_TOC(timer_dopair_density);
   }
 
+#else
+
+  error("Incorrectly calling vectorized Gadget-2 functions!");
+
 #endif /* WITH_VECTORIZATION */
 }
diff --git a/src/scheduler.c b/src/scheduler.c
index ea2c6bde010dd387b355e961903411c0b18a41bf..5af9e9c9e19248838bb98c3ee2305d7dc54324f0 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -50,6 +50,7 @@
 #include "space.h"
 #include "task.h"
 #include "timers.h"
+#include "version.h"
 
 /**
  * @brief Re-set the list of active tasks.
@@ -111,6 +112,125 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta,
   atomic_inc(&s->completed_unlock_writes);
 }
 
+/**
+ * @brief Write a dot file with the task dependencies.
+ *
+ * Run plot_task_dependencies.sh for an example of how to use it
+ * to generate the figure.
+ *
+ * @param s The #scheduler we are working in.
+ * @param verbose Are we verbose about this?
+ */
+void scheduler_write_dependencies(struct scheduler *s, int verbose) {
+
+  const ticks tic = getticks();
+
+  /* Conservative number of dependencies per task type */
+  const int max_nber_dep = 128;
+
+  /* Number of possible relations between tasks */
+  const int nber_relation =
+      2 * task_type_count * task_subtype_count * max_nber_dep;
+
+  /* To get the table of max_nber_dep for a task:
+   * ind = (ta * task_subtype_count + sa) * max_nber_dep * 2
+   * where ta is the value of task_type and sa is the value of
+   * task_subtype  */
+  int *table = malloc(nber_relation * sizeof(int));
+  if (table == NULL)
+    error("Error allocating memory for task-dependency graph.");
+
+  /* Reset everything */
+  for (int i = 0; i < nber_relation; i++) table[i] = -1;
+
+  /* Create file */
+  char filename[200] = "dependency_graph.dot";
+  FILE *f = fopen(filename, "w");
+  if (f == NULL) error("Error opening dependency graph file.");
+
+  /* Write header */
+  fprintf(f, "digraph task_dep {\n");
+  fprintf(f, "label=\"Task dependencies for SWIFT %s\"", git_revision());
+  fprintf(f, "\t compound=true;\n");
+  fprintf(f, "\t ratio=0.66;\n");
+  fprintf(f, "\t node[nodesep=0.15];\n");
+
+  /* loop over all tasks */
+  for (int i = 0; i < s->nr_tasks; i++) {
+    const struct task *ta = &s->tasks[i];
+
+    /* and their dependencies */
+    for (int j = 0; j < ta->nr_unlock_tasks; j++) {
+      const struct task *tb = ta->unlock_tasks[j];
+
+      /* check if dependency already written */
+      int written = 0;
+
+      /* Current index */
+      int ind = ta->type * task_subtype_count + ta->subtype;
+      ind *= 2 * max_nber_dep;
+
+      int k = 0;
+      int *cur = &table[ind];
+      while (k < max_nber_dep) {
+
+        /* not written yet */
+        if (cur[0] == -1) {
+          cur[0] = tb->type;
+          cur[1] = tb->subtype;
+          break;
+        }
+
+        /* already written */
+        if (cur[0] == tb->type && cur[1] == tb->subtype) {
+          written = 1;
+          break;
+        }
+
+        k += 1;
+        cur = &cur[3];
+      }
+
+      /* max_nber_dep is too small */
+      if (k == max_nber_dep)
+        error("Not enough memory, please increase max_nber_dep");
+
+      /* Not written yet => write it */
+      if (!written) {
+
+        /* text to write */
+        char ta_name[200];
+        char tb_name[200];
+
+        /* construct line */
+        if (ta->subtype == task_subtype_none)
+          sprintf(ta_name, "%s", taskID_names[ta->type]);
+        else
+          sprintf(ta_name, "\"%s %s\"", taskID_names[ta->type],
+                  subtaskID_names[ta->subtype]);
+
+        if (tb->subtype == task_subtype_none)
+          sprintf(tb_name, "%s", taskID_names[tb->type]);
+        else
+          sprintf(tb_name, "\"%s %s\"", taskID_names[tb->type],
+                  subtaskID_names[tb->subtype]);
+
+        /* Write to the file */
+        fprintf(f, "\t %s->%s;\n", ta_name, tb_name);
+      }
+    }
+  }
+
+  /* Be clean */
+  fprintf(f, "}");
+  fclose(f);
+  free(table);
+
+  if (verbose)
+    message("Printing task graph took %.3f %s.",
+            clocks_from_ticks(getticks() - tic), clocks_getunit());
+}
+
 /**
  * @brief Split a hydrodynamic task if too large.
  *
@@ -1317,26 +1437,49 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         if (t->subtype == task_subtype_tend) {
           t->buff = malloc(sizeof(struct pcell_step) * t->ci->pcell_size);
           cell_pack_end_step(t->ci, t->buff);
-          err = MPI_Isend(
-              t->buff, t->ci->pcell_size * sizeof(struct pcell_step), MPI_BYTE,
-              t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+          if ((t->ci->pcell_size * sizeof(struct pcell_step)) >
+              s->mpi_message_limit)
+            err = MPI_Isend(
+                t->buff, t->ci->pcell_size * sizeof(struct pcell_step),
+                MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+          else
+            err = MPI_Issend(
+                t->buff, t->ci->pcell_size * sizeof(struct pcell_step),
+                MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
         } else if (t->subtype == task_subtype_xv ||
                    t->subtype == task_subtype_rho ||
                    t->subtype == task_subtype_gradient) {
-          err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type,
-                          t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+          if ((t->ci->count * sizeof(struct part)) > s->mpi_message_limit)
+            err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type,
+                            t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+          else
+            err = MPI_Issend(t->ci->parts, t->ci->count, part_mpi_type,
+                             t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
           // message( "sending %i parts with tag=%i from %i to %i." ,
           //     t->ci->count , t->flags , s->nodeID , t->cj->nodeID );
           // fflush(stdout);
         } else if (t->subtype == task_subtype_gpart) {
-          err = MPI_Isend(t->ci->gparts, t->ci->gcount, gpart_mpi_type,
-                          t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+          if ((t->ci->gcount * sizeof(struct gpart)) > s->mpi_message_limit)
+            err = MPI_Isend(t->ci->gparts, t->ci->gcount, gpart_mpi_type,
+                            t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+          else
+            err = MPI_Issend(t->ci->gparts, t->ci->gcount, gpart_mpi_type,
+                             t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
         } else if (t->subtype == task_subtype_spart) {
-          err = MPI_Isend(t->ci->sparts, t->ci->scount, spart_mpi_type,
-                          t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+          if ((t->ci->scount * sizeof(struct spart)) > s->mpi_message_limit)
+            err = MPI_Isend(t->ci->sparts, t->ci->scount, spart_mpi_type,
+                            t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+          else
+            err = MPI_Issend(t->ci->sparts, t->ci->scount, spart_mpi_type,
+                             t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
         } else if (t->subtype == task_subtype_multipole) {
-          err = MPI_Isend(t->ci->multipole, 1, multipole_mpi_type,
-                          t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+          if ((t->ci->scount * sizeof(struct gravity_tensors)) >
+              s->mpi_message_limit)
+            err = MPI_Isend(t->ci->multipole, 1, multipole_mpi_type,
+                            t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+          else
+            err = MPI_Issend(t->ci->multipole, 1, multipole_mpi_type,
+                             t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
         } else {
           error("Unknown communication sub-type");
         }
diff --git a/src/scheduler.h b/src/scheduler.h
index c5ccbf43f3048dfb47d2d3eb7a7db6634b646700..1a75544de12b8402e553e3ae2b84e2d8a65c56e8 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -103,6 +103,10 @@ struct scheduler {
   /* The node we are working on. */
   int nodeID;
 
+  /* Maximum size of task messages, in bytes, to send using non-buffered
+   * MPI. */
+  size_t mpi_message_limit;
+
   /* 'Pointer' to the seed for the random number generator */
   pthread_key_t local_seed_pointer;
 };
@@ -168,5 +172,6 @@ void scheduler_dump_queue(struct scheduler *s);
 void scheduler_print_tasks(const struct scheduler *s, const char *fileName);
 void scheduler_clean(struct scheduler *s);
 void scheduler_free_tasks(struct scheduler *s);
+void scheduler_write_dependencies(struct scheduler *s, int verbose);
 
 #endif /* SWIFT_SCHEDULER_H */
diff --git a/src/serial_io.c b/src/serial_io.c
index eb1e0e23fb34fd8d6a21230d9e38cfe82c47df1d..f8e3927c4a09a8a94afff5287a42f2b10afac9b4 100644
--- a/src/serial_io.c
+++ b/src/serial_io.c
@@ -176,9 +176,9 @@ void readArray(hid_t grp, const struct io_props props, size_t N,
  * Routines writing an output file
  *-----------------------------------------------------------------------------*/
 
-void prepareArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile,
-                  char* partTypeGroupName, const struct io_props props,
-                  unsigned long long N_total,
+void prepareArray(const struct engine* e, hid_t grp, char* fileName,
+                  FILE* xmfFile, char* partTypeGroupName,
+                  const struct io_props props, unsigned long long N_total,
                   const struct unit_system* internal_units,
                   const struct unit_system* snapshot_units) {
 
@@ -282,14 +282,14 @@ void prepareArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile,
  * @todo A better version using HDF5 hyper-slabs to write the file directly from
  * the part array will be written once the structures have been stabilized.
  */
-void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile,
-                char* partTypeGroupName, const struct io_props props, size_t N,
-                long long N_total, int mpi_rank, long long offset,
+void writeArray(const struct engine* e, hid_t grp, char* fileName,
+                FILE* xmfFile, char* partTypeGroupName,
+                const struct io_props props, size_t N, long long N_total,
+                int mpi_rank, long long offset,
                 const struct unit_system* internal_units,
                 const struct unit_system* snapshot_units) {
 
   const size_t typeSize = io_sizeof_type(props.type);
-  const size_t copySize = typeSize * props.dimension;
   const size_t num_elements = N * props.dimension;
 
   /* message("Writing '%s' array...", props.name); */
@@ -300,45 +300,13 @@ void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile,
                  internal_units, snapshot_units);
 
   /* Allocate temporary buffer */
-  void* temp = malloc(num_elements * io_sizeof_type(props.type));
-  if (temp == NULL) error("Unable to allocate memory for temporary buffer");
-
-  /* Copy particle data to temporary buffer */
-  if (props.convert_part == NULL &&
-      props.convert_gpart == NULL) { /* No conversion */
-
-    char* temp_c = temp;
-    for (size_t i = 0; i < N; ++i)
-      memcpy(&temp_c[i * copySize], props.field + i * props.partSize, copySize);
-
-  } else if (props.convert_part != NULL) { /* conversion (for parts)*/
-
-    float* temp_f = temp;
-    for (size_t i = 0; i < N; ++i)
-      temp_f[i] = props.convert_part(e, &props.parts[i]);
-
-  } else if (props.convert_gpart != NULL) { /* conversion (for gparts)*/
-
-    float* temp_f = temp;
-    for (size_t i = 0; i < N; ++i)
-      temp_f[i] = props.convert_gpart(e, &props.gparts[i]);
-  }
-
-  /* Unit conversion if necessary */
-  const double factor =
-      units_conversion_factor(internal_units, snapshot_units, props.units);
-  if (factor != 1.) {
-
-    /* message("Converting ! factor=%e", factor); */
+  void* temp = NULL;
+  if (posix_memalign((void**)&temp, IO_BUFFER_ALIGNMENT,
+                     num_elements * typeSize) != 0)
+    error("Unable to allocate temporary i/o buffer");
 
-    if (io_is_double_precision(props.type)) {
-      double* temp_d = temp;
-      for (size_t i = 0; i < num_elements; ++i) temp_d[i] *= factor;
-    } else {
-      float* temp_f = temp;
-      for (size_t i = 0; i < num_elements; ++i) temp_f[i] *= factor;
-    }
-  }
+  /* Copy the particle data to the temporary buffer */
+  io_copy_temp_buffer(temp, e, props, N, internal_units, snapshot_units);
 
   /* Construct information for the hyper-slab */
   int rank;
@@ -512,7 +480,7 @@ void read_ic_serial(char* fileName, const struct unit_system* internal_units,
 
     /* Read the unit system used in the ICs */
     if (ic_units == NULL) error("Unable to allocate memory for IC unit system");
-    io_read_unit_system(h_file, ic_units);
+    io_read_unit_system(h_file, ic_units, mpi_rank);
 
     if (units_are_equal(ic_units, internal_units)) {
 
@@ -656,8 +624,9 @@ void read_ic_serial(char* fileName, const struct unit_system* internal_units,
             break;
 
           default:
-            message("Particle Type %d not yet supported. Particles ignored",
-                    ptype);
+            if (mpi_rank == 0)
+              message("Particle Type %d not yet supported. Particles ignored",
+                      ptype);
         }
 
         /* Read everything */
diff --git a/src/single_io.c b/src/single_io.c
index 194563352dff5570b8703f828fac95bccbf7409f..a8199af6a84d3f1bdffd746e5a1a49e6e2518dca 100644
--- a/src/single_io.c
+++ b/src/single_io.c
@@ -169,57 +169,25 @@ void readArray(hid_t h_grp, const struct io_props prop, size_t N,
  * @todo A better version using HDF5 hyper-slabs to write the file directly from
  * the part array will be written once the structures have been stabilized.
  */
-void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile,
-                char* partTypeGroupName, const struct io_props props, size_t N,
+void writeArray(const struct engine* e, hid_t grp, char* fileName,
+                FILE* xmfFile, char* partTypeGroupName,
+                const struct io_props props, size_t N,
                 const struct unit_system* internal_units,
                 const struct unit_system* snapshot_units) {
 
   const size_t typeSize = io_sizeof_type(props.type);
-  const size_t copySize = typeSize * props.dimension;
   const size_t num_elements = N * props.dimension;
 
   /* message("Writing '%s' array...", props.name); */
 
   /* Allocate temporary buffer */
-  void* temp = malloc(num_elements * io_sizeof_type(props.type));
-  if (temp == NULL) error("Unable to allocate memory for temporary buffer");
-
-  /* Copy particle data to temporary buffer */
-  if (props.convert_part == NULL &&
-      props.convert_gpart == NULL) { /* No conversion */
-
-    char* temp_c = temp;
-    for (size_t i = 0; i < N; ++i)
-      memcpy(&temp_c[i * copySize], props.field + i * props.partSize, copySize);
-
-  } else if (props.convert_part != NULL) { /* conversion (for parts)*/
-
-    float* temp_f = temp;
-    for (size_t i = 0; i < N; ++i)
-      temp_f[i] = props.convert_part(e, &props.parts[i]);
-
-  } else if (props.convert_gpart != NULL) { /* conversion (for gparts)*/
-
-    float* temp_f = temp;
-    for (size_t i = 0; i < N; ++i)
-      temp_f[i] = props.convert_gpart(e, &props.gparts[i]);
-  }
-
-  /* Unit conversion if necessary */
-  const double factor =
-      units_conversion_factor(internal_units, snapshot_units, props.units);
-  if (factor != 1.) {
-
-    /* message("Converting ! factor=%e", factor); */
+  void* temp = NULL;
+  if (posix_memalign((void**)&temp, IO_BUFFER_ALIGNMENT,
+                     num_elements * typeSize) != 0)
+    error("Unable to allocate temporary i/o buffer");
 
-    if (io_is_double_precision(props.type)) {
-      double* temp_d = temp;
-      for (size_t i = 0; i < num_elements; ++i) temp_d[i] *= factor;
-    } else {
-      float* temp_f = temp;
-      for (size_t i = 0; i < num_elements; ++i) temp_f[i] *= factor;
-    }
-  }
+  /* Copy the particle data to the temporary buffer */
+  io_copy_temp_buffer(temp, e, props, N, internal_units, snapshot_units);
 
   /* Create data space */
   const hid_t h_space = H5Screate(H5S_SIMPLE);
@@ -419,7 +387,7 @@ void read_ic_single(char* fileName, const struct unit_system* internal_units,
   /* Read the unit system used in the ICs */
   struct unit_system* ic_units = malloc(sizeof(struct unit_system));
   if (ic_units == NULL) error("Unable to allocate memory for IC unit system");
-  io_read_unit_system(h_file, ic_units);
+  io_read_unit_system(h_file, ic_units, 0);
 
   /* Tell the user if a conversion will be needed */
   if (units_are_equal(ic_units, internal_units)) {
diff --git a/src/space.c b/src/space.c
index c9db8c84ac3a04c2fc82fcce620e14dc2070b107..b9fe639b195226da3d3168581ebd26f3273d28eb 100644
--- a/src/space.c
+++ b/src/space.c
@@ -2551,7 +2551,7 @@ void space_synchronize_particle_positions(struct space *s) {
  *
  * Calls hydro_first_init_part() on all the particles
  */
-void space_init_parts(struct space *s) {
+void space_first_init_parts(struct space *s) {
 
   const size_t nr_parts = s->nr_parts;
   struct part *restrict p = s->parts;
@@ -2583,7 +2583,7 @@ void space_init_parts(struct space *s) {
  *
  * Calls cooling_init_xpart() on all the particles
  */
-void space_init_xparts(struct space *s) {
+void space_first_init_xparts(struct space *s) {
 
   const size_t nr_parts = s->nr_parts;
   struct part *restrict p = s->parts;
@@ -2600,7 +2600,7 @@ void space_init_xparts(struct space *s) {
  *
  * Calls gravity_first_init_gpart() on all the particles
  */
-void space_init_gparts(struct space *s) {
+void space_first_init_gparts(struct space *s) {
 
   const size_t nr_gparts = s->nr_gparts;
   struct gpart *restrict gp = s->gparts;
@@ -2631,7 +2631,7 @@ void space_init_gparts(struct space *s) {
  *
  * Calls star_first_init_spart() on all the particles
  */
-void space_init_sparts(struct space *s) {
+void space_first_init_sparts(struct space *s) {
 
   const size_t nr_sparts = s->nr_sparts;
   struct spart *restrict sp = s->sparts;
@@ -2657,6 +2657,88 @@ void space_init_sparts(struct space *s) {
   }
 }
 
+void space_init_parts_mapper(void *restrict map_data, int count,
+                             void *restrict extra_data) {
+
+  struct part *restrict parts = map_data;
+  const struct hydro_space *restrict hs = extra_data;
+  for (int k = 0; k < count; k++) hydro_init_part(&parts[k], hs);
+}
+
+/**
+ * @brief Calls the #part initialisation function on all particles in the space.
+ *
+ * @param s The #space.
+ * @param verbose Are we talkative?
+ */
+void space_init_parts(struct space *s, int verbose) {
+
+  const ticks tic = getticks();
+
+  if (s->nr_parts > 0)
+    threadpool_map(&s->e->threadpool, space_init_parts_mapper, s->parts,
+                   s->nr_parts, sizeof(struct part), 0, &s->hs);
+  if (verbose)
+    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
+            clocks_getunit());
+}
+
+void space_init_gparts_mapper(void *restrict map_data, int count,
+                              void *restrict extra_data) {
+
+  struct gpart *gparts = map_data;
+  for (int k = 0; k < count; k++) gravity_init_gpart(&gparts[k]);
+}
+
+/**
+ * @brief Calls the #gpart initialisation function on all particles in the
+ * space.
+ *
+ * @param s The #space.
+ * @param verbose Are we talkative?
+ */
+void space_init_gparts(struct space *s, int verbose) {
+
+  const ticks tic = getticks();
+
+  if (s->nr_gparts > 0)
+    threadpool_map(&s->e->threadpool, space_init_gparts_mapper, s->gparts,
+                   s->nr_gparts, sizeof(struct gpart), 0, NULL);
+  if (verbose)
+    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
+            clocks_getunit());
+}
+
+void space_convert_quantities_mapper(void *restrict map_data, int count,
+                                     void *restrict extra_data) {
+  struct space *s = extra_data;
+  struct part *restrict parts = map_data;
+  const ptrdiff_t index = parts - s->parts;
+  struct xpart *restrict xparts = s->xparts + index;
+  for (int k = 0; k < count; k++)
+    hydro_convert_quantities(&parts[k], &xparts[k]);
+}
+
+/**
+ * @brief Calls the #part quantities conversion function on all particles in the
+ * space.
+ *
+ * @param s The #space.
+ * @param verbose Are we talkative?
+ */
+void space_convert_quantities(struct space *s, int verbose) {
+
+  const ticks tic = getticks();
+
+  if (s->nr_parts > 0)
+    threadpool_map(&s->e->threadpool, space_convert_quantities_mapper, s->parts,
+                   s->nr_parts, sizeof(struct part), 0, s);
+
+  if (verbose)
+    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
+            clocks_getunit());
+}
+
 /**
  * @brief Split the space into cells given the array of particles.
  *
@@ -2845,11 +2927,16 @@ void space_init(struct space *s, const struct swift_params *params,
 
   hydro_space_init(&s->hs, s);
 
+  ticks tic = getticks();
+  if (verbose) message("first init...");
   /* Set the particles in a state where they are ready for a run */
-  space_init_parts(s);
-  space_init_xparts(s);
-  space_init_gparts(s);
-  space_init_sparts(s);
+  space_first_init_parts(s);
+  space_first_init_xparts(s);
+  space_first_init_gparts(s);
+  space_first_init_sparts(s);
+  if (verbose)
+    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
+            clocks_getunit());
 
   /* Init the space lock. */
   if (lock_init(&s->lock) != 0) error("Failed to create space spin-lock.");
@@ -2871,7 +2958,8 @@ void space_replicate(struct space *s, int replicate, int verbose) {
 
   if (replicate < 1) error("Invalid replicate value: %d", replicate);
 
-  message("Replicating space %d times along each axis.", replicate);
+  if (verbose)
+    message("Replicating space %d times along each axis.", replicate);
 
   const int factor = replicate * replicate * replicate;
 
diff --git a/src/space.h b/src/space.h
index 1ea841affece7b4863edf20468d1d71caec93ab8..c49dc37dc0df27cd3647044f219e36c299dcd73b 100644
--- a/src/space.h
+++ b/src/space.h
@@ -221,9 +221,9 @@ void space_synchronize_particle_positions(struct space *s);
 void space_do_parts_sort();
 void space_do_gparts_sort();
 void space_do_sparts_sort();
-void space_init_parts(struct space *s);
-void space_init_gparts(struct space *s);
-void space_init_sparts(struct space *s);
+void space_init_parts(struct space *s, int verbose);
+void space_init_gparts(struct space *s, int verbose);
+void space_convert_quantities(struct space *s, int verbose);
 void space_link_cleanup(struct space *s);
 void space_check_drift_point(struct space *s, integertime_t ti_drift,
                              int multipole);
diff --git a/src/swift.h b/src/swift.h
index a5556730ae965109385257c0c38bfc34277223d4..33a0425154d45e030443bc7f2c405377ef6a39e2 100644
--- a/src/swift.h
+++ b/src/swift.h
@@ -32,6 +32,7 @@
 #include "cooling.h"
 #include "cycle.h"
 #include "debug.h"
+#include "dump.h"
 #include "engine.h"
 #include "error.h"
 #include "gravity.h"
@@ -40,6 +41,7 @@
 #include "hydro.h"
 #include "hydro_properties.h"
 #include "lock.h"
+#include "logger.h"
 #include "map.h"
 #include "multipole.h"
 #include "parallel_io.h"
diff --git a/src/threadpool.c b/src/threadpool.c
index 465756f71d88df81921a880edf8cdb1ee17f6026..0ef69f9895d3d127a330b105ec385d97975fc82f 100644
--- a/src/threadpool.c
+++ b/src/threadpool.c
@@ -169,10 +169,10 @@ void *threadpool_runner(void *data) {
   while (1) {
 
     /* Let the controller know that this thread is waiting. */
-    pthread_barrier_wait(&tp->wait_barrier);
+    swift_barrier_wait(&tp->wait_barrier);
 
     /* Wait for the controller. */
-    pthread_barrier_wait(&tp->run_barrier);
+    swift_barrier_wait(&tp->run_barrier);
 
     /* If no map function is specified, just die. We use this as a mechanism
        to shut down threads without leaving the barriers in an invalid state. */
@@ -212,8 +212,8 @@ void threadpool_init(struct threadpool *tp, int num_threads) {
   if (num_threads == 1) return;
 
   /* Init the barriers. */
-  if (pthread_barrier_init(&tp->wait_barrier, NULL, num_threads) != 0 ||
-      pthread_barrier_init(&tp->run_barrier, NULL, num_threads) != 0)
+  if (swift_barrier_init(&tp->wait_barrier, NULL, num_threads) != 0 ||
+      swift_barrier_init(&tp->run_barrier, NULL, num_threads) != 0)
     error("Failed to initialize barriers.");
 
   /* Set the task counter to zero. */
@@ -237,7 +237,7 @@ void threadpool_init(struct threadpool *tp, int num_threads) {
   }
 
   /* Wait for all the threads to be up and running. */
-  pthread_barrier_wait(&tp->wait_barrier);
+  swift_barrier_wait(&tp->wait_barrier);
 }
 
 /**
@@ -288,13 +288,13 @@ void threadpool_map(struct threadpool *tp, threadpool_map_function map_function,
   tp->num_threads_running = 0;
 
   /* Wait for all the threads to be up and running. */
-  pthread_barrier_wait(&tp->run_barrier);
+  swift_barrier_wait(&tp->run_barrier);
 
   /* Do some work while I'm at it. */
   threadpool_chomp(tp, tp->num_threads - 1);
 
   /* Wait for all threads to be done. */
-  pthread_barrier_wait(&tp->wait_barrier);
+  swift_barrier_wait(&tp->wait_barrier);
 
 #ifdef SWIFT_DEBUG_THREADPOOL
   /* Log the total call time to thread id -1. */
@@ -321,15 +321,15 @@ void threadpool_clean(struct threadpool *tp) {
      * and waiting for all the threads to terminate. This ensures that no
      * thread is still waiting at a barrier. */
     tp->map_function = NULL;
-    pthread_barrier_wait(&tp->run_barrier);
+    swift_barrier_wait(&tp->run_barrier);
     for (int k = 0; k < tp->num_threads - 1; k++) {
       void *retval;
       pthread_join(tp->threads[k], &retval);
     }
 
     /* Release the barriers. */
-    if (pthread_barrier_destroy(&tp->wait_barrier) != 0 ||
-        pthread_barrier_destroy(&tp->run_barrier) != 0)
+    if (swift_barrier_destroy(&tp->wait_barrier) != 0 ||
+        swift_barrier_destroy(&tp->run_barrier) != 0)
       error("Failed to destroy threadpool barriers.");
 
     /* Clean up memory. */
diff --git a/src/threadpool.h b/src/threadpool.h
index 019403f658a22d36c4a6e1ec1ae1fdc47c62658d..9d19b56836330f59093933bbed5900727142f4a2 100644
--- a/src/threadpool.h
+++ b/src/threadpool.h
@@ -22,10 +22,11 @@
 /* Config parameters. */
 #include "../config.h"
 
-/* Some standard headers. */
+/* Standard headers */
 #include <pthread.h>
 
 /* Local includes. */
+#include "barrier.h"
 #include "cycle.h"
 
 /* Local defines. */
@@ -70,8 +71,8 @@ struct threadpool {
   pthread_t *threads;
 
   /* This is where threads go to rest. */
-  pthread_barrier_t wait_barrier;
-  pthread_barrier_t run_barrier;
+  swift_barrier_t wait_barrier;
+  swift_barrier_t run_barrier;
 
   /* Current map data and count. */
   void *map_data, *map_extra_data;
diff --git a/src/units.c b/src/units.c
index 13d3e5dd35b14c325527ff35703712258e273ef3..fef137971e58c51027da357c11007ccc0946e4b7 100644
--- a/src/units.c
+++ b/src/units.c
@@ -587,3 +587,17 @@ double units_conversion_factor(const struct unit_system* from,
 
   return units_general_conversion_factor(from, to, baseUnitsExp);
 }
+
+/**
+ * @brief print a #unit_system
+ *
+ * @param us The #unit_system
+ */
+void units_print(const struct unit_system* us) {
+  message("Units:");
+  message("\tUnit Mass:        %g", us->UnitMass_in_cgs);
+  message("\tUnit Length:      %g", us->UnitLength_in_cgs);
+  message("\tUnit Time:        %g", us->UnitTime_in_cgs);
+  message("\tUnit Current:     %g", us->UnitCurrent_in_cgs);
+  message("\tUnit Temperature: %g", us->UnitTemperature_in_cgs);
+}
diff --git a/src/units.h b/src/units.h
index a5765495f9f52159ab70a1072c1f8571ddcdf14b..657b29c070f3816293c18edfc46ad6b960ec9b33 100644
--- a/src/units.h
+++ b/src/units.h
@@ -137,4 +137,6 @@ double units_conversion_factor(const struct unit_system* from,
                                const struct unit_system* to,
                                enum unit_conversion_factor unit);
 
+void units_print(const struct unit_system* us);
+
 #endif /* SWIFT_UNITS_H */
diff --git a/src/version.c b/src/version.c
index 46c31103c953ce2ff70b9e346f88470008dd8266..f4177e5c83c776ea063ad32fd00895199c94b182 100644
--- a/src/version.c
+++ b/src/version.c
@@ -332,6 +332,22 @@ const char *fftw3_version(void) {
   return version;
 }
 
+/**
+ * @brief return the thread barrier used in SWIFT.
+ *
+ * @result description of the thread barriers
+ */
+const char *thread_barrier_version(void) {
+
+  static char version[256] = {0};
+#if defined(HAVE_PTHREAD_BARRIERS)
+  sprintf(version, "%s", "pthread");
+#else
+  sprintf(version, "%s", "homemade");
+#endif
+  return version;
+}
+
 /**
  * @brief Prints a greeting message to the standard output containing code
  * version and revision number
diff --git a/src/version.h b/src/version.h
index 60998958098c1a37198cdbb3729982835f6e4f62..1af76b647b2d401bb4a7998b281864fb3e63a0c8 100644
--- a/src/version.h
+++ b/src/version.h
@@ -34,6 +34,7 @@ const char* mpi_version(void);
 const char* metis_version(void);
 const char* hdf5_version(void);
 const char* fftw3_version(void);
+const char* thread_barrier_version(void);
 void greetings(void);
 
 #endif /* SWIFT_VERSION_H */
diff --git a/tests/test125cells.c b/tests/test125cells.c
index 62ded5c1df47c9d40759dbc7aa239bb4a8a47dc4..bf1c219fab9927b81bb087684f6ad0d747e958bc 100644
--- a/tests/test125cells.c
+++ b/tests/test125cells.c
@@ -474,8 +474,10 @@ int main(int argc, char *argv[]) {
   unsigned long long cpufreq = 0;
   clocks_set_cpufreq(cpufreq);
 
-  /* Choke on FP-exceptions */
+/* Choke on FP-exceptions */
+#ifdef HAVE_FE_ENABLE_EXCEPT
   feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+#endif
 
   /* Get some randomness going */
   srand(0);
diff --git a/tests/test27cells.c b/tests/test27cells.c
index c153fc6f57dc976e1f00767bae275a6a3de24c00..f9e3ba1dd78e67a11414c88efe3c604b02b1aa16 100644
--- a/tests/test27cells.c
+++ b/tests/test27cells.c
@@ -333,8 +333,10 @@ int main(int argc, char *argv[]) {
   unsigned long long cpufreq = 0;
   clocks_set_cpufreq(cpufreq);
 
-  /* Choke on FP-exceptions */
+/* Choke on FP-exceptions */
+#ifdef HAVE_FE_ENABLE_EXCEPT
   feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+#endif
 
   /* Get some randomness going */
   srand(0);
diff --git a/tests/testActivePair.c b/tests/testActivePair.c
index c8e0233c2c317be321ed8b923a46a98e0905b36c..fd2244db971f6a5f7cfeb62b9661c33b7ee88145 100644
--- a/tests/testActivePair.c
+++ b/tests/testActivePair.c
@@ -433,8 +433,10 @@ int main(int argc, char *argv[]) {
   unsigned long long cpufreq = 0;
   clocks_set_cpufreq(cpufreq);
 
-  /* Choke on FP-exceptions */
+/* Choke on FP-exceptions */
+#ifdef HAVE_FE_ENABLE_EXCEPT
   feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+#endif
 
   /* Generate a RNG seed from time. */
   unsigned int seed = time(NULL);
diff --git a/tests/testDump.c b/tests/testDump.c
index 7343af49f654582de444681ec291311d41251dca..b1406ba5f5e6652c6e2727d6b161215f4e905d62 100644
--- a/tests/testDump.c
+++ b/tests/testDump.c
@@ -20,6 +20,8 @@
 /* Config parameters. */
 #include "../config.h"
 
+#ifdef HAVE_POSIX_FALLOCATE /* Are we on a sensible platform? */
+
 /* Some standard headers. */
 #include <errno.h>
 #include <fcntl.h>
@@ -30,11 +32,8 @@
 #include <sys/types.h>
 #include <unistd.h>
 
-/* This object's header. */
-#include "../src/dump.h"
-
 /* Local headers. */
-#include "../src/threadpool.h"
+#include "swift.h"
 
 void dump_mapper(void *map_data, int num_elements, void *extra_data) {
   struct dump *d = (struct dump *)extra_data;
@@ -49,10 +48,14 @@ int main(int argc, char *argv[]) {
 
   /* Some constants. */
   const int num_threads = 4;
-  const char *filename = "/tmp/dump_test.out";
   const int num_runs = 20;
   const int chunk_size = 1000;
 
+  /* Use a time-stamped filename so concurrent test runs do not clash. */
+  char filename[256];
+  const long long now = (long long)time(NULL);
+  sprintf(filename, "/tmp/SWIFT_dump_test_%lld.out", now);
+
   /* Prepare a threadpool to write to the dump. */
   struct threadpool t;
   threadpool_init(&t, num_threads);
@@ -82,6 +85,15 @@ int main(int argc, char *argv[]) {
   /* Clean the threads */
   threadpool_clean(&t);
 
+  /* Be clean */
+  remove(filename);
+
   /* Return a happy number. */
   return 0;
 }
+
+#else
+
+int main(int argc, char *argv[]) { return 0; }
+
+#endif /* HAVE_POSIX_FALLOCATE */
diff --git a/tests/testFFT.c b/tests/testFFT.c
index 4ddd030ece95bf26cbfe41f2408be7c3e0c50535..ba737935afc4568a1509d2c43a3534b60a6ef867 100644
--- a/tests/testFFT.c
+++ b/tests/testFFT.c
@@ -76,6 +76,8 @@ int main() {
   engine.max_active_bin = num_time_bins;
   engine.gravity_properties = &gravity_properties;
   engine.nr_threads = 1;
+  engine.nodeID = 0;
+  engine_rank = 0;
 
   struct runner runner;
   runner.e = &engine;
diff --git a/tests/testLogger.c b/tests/testLogger.c
index ec3b33b6a9e38741e41b4678681e7afe9b9a7950..9ec08607383fdb192b7ba994e4af506fde12fea9 100644
--- a/tests/testLogger.c
+++ b/tests/testLogger.c
@@ -20,18 +20,16 @@
 /* Config parameters. */
 #include "../config.h"
 
+#ifdef HAVE_POSIX_FALLOCATE /* Are we on a sensible platform? */
+
 /* Some standard headers. */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
-/* This object's header. */
-#include "../src/logger.h"
-
 /* Local headers. */
-#include "../src/dump.h"
-#include "../src/part.h"
+#include "swift.h"
 
 void test_log_parts(struct dump *d) {
 
@@ -223,7 +221,9 @@ void test_log_timestamps(struct dump *d) {
 int main(int argc, char *argv[]) {
 
   /* Some constants. */
-  const char *filename = "/tmp/dump_test.out";
+  char filename[256];
+  const long long now = (long long)time(NULL);
+  sprintf(filename, "/tmp/SWIFT_logger_test_%lld.out", now);
 
   /* Prepare a dump. */
   struct dump d;
@@ -241,7 +241,15 @@ int main(int argc, char *argv[]) {
   /* Finalize the dump. */
   dump_close(&d);
 
+  /* Be clean */
+  remove(filename);
+
   /* Return a happy number. */
-  printf("PASS\n");
   return 0;
 }
+
+#else
+
+int main(int argc, char *argv[]) { return 0; }
+
+#endif /* HAVE_POSIX_FALLOCATE */
diff --git a/tests/testPeriodicBC.c b/tests/testPeriodicBC.c
index 1afe829ce70c5d02f565e8fd002ba414abcb7388..ded767c5d657c0a05dc227d1ee8facbfa5f4de63 100644
--- a/tests/testPeriodicBC.c
+++ b/tests/testPeriodicBC.c
@@ -391,8 +391,10 @@ int main(int argc, char *argv[]) {
   unsigned long long cpufreq = 0;
   clocks_set_cpufreq(cpufreq);
 
-  /* Choke on FP-exceptions */
+/* Choke on FP-exceptions */
+#ifdef HAVE_FE_ENABLE_EXCEPT
   feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+#endif
 
   /* Get some randomness going */
   srand(0);
diff --git a/tests/testSymmetry.c b/tests/testSymmetry.c
index 73c5708a6add174b88f26cc716a716fa2ad81709..68a878b05c3fc298ef7d0b159df9997d2ef1f82d 100644
--- a/tests/testSymmetry.c
+++ b/tests/testSymmetry.c
@@ -28,8 +28,10 @@
 
 int main(int argc, char *argv[]) {
 
-  /* Choke if need be */
+/* Choke on FPEs */
+#ifdef HAVE_FE_ENABLE_EXCEPT
   feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+#endif
 
 #if defined(SHADOWFAX_SPH)
   /* Initialize the Voronoi simulation box */