diff --git a/examples/Makefile.am b/examples/Makefile.am
index b2c09e54800c556ba03bdb1e2436af3144263dcb..200d340be168651e6dfc91f4b89a0a343d34d9da 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -63,13 +63,15 @@ swift_fixdt_mpi_LDADD =  ../src/.libs/libswiftsim_mpi.a $(HDF5_LDFLAGS) $(HDF5_L
 
 # Scripts to generate ICs
 EXTRA_DIST = UniformBox/makeIC.py \
-             PerturbedBox/makeIC.py \
+	     UniformDMBox/makeIC.py \
+	     PerturbedBox/makeIC.py \
 	     SedovBlast/makeIC.py SedovBlast/makeIC_fcc.py SedovBlast/solution.py \
 	     SodShock/makeIC.py SodShock/solution.py SodShock/glass_001.hdf5 SodShock/glass_002.hdf5 SodShock/rhox.py \
 	     CosmoVolume/getIC.sh \
 	     BigCosmoVolume/makeIC.py \
 	     BigPerturbedBox/makeIC_fcc.py \
-             GreshoVortex/makeIC.py GreshoVortex/solution.py
+             GreshoVortex/makeIC.py GreshoVortex/solution.py \
+             MultiTypes/makeIC.py
 
 
 # Scripts to plot task graphs
diff --git a/examples/MultiTypes/makeIC.py b/examples/MultiTypes/makeIC.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a41910c22c260086b5384b248a5c86ab6340a5e
--- /dev/null
+++ b/examples/MultiTypes/makeIC.py
@@ -0,0 +1,136 @@
+###############################################################################
+ # This file is part of SWIFT.
+ # Copyright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
+ #                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ # 
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ # 
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ # GNU General Public License for more details.
+ # 
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ # 
+ ##############################################################################
+
+import h5py
+import sys
+from numpy import *
+
+# Generates a SWIFT IC file containing a Cartesian distribution of gas particles
+# (density rhoGas) and DM particles (density rhoDM) in a periodic box
+
+# Parameters
+periodic = 1             # 1 For periodic box
+boxSize = 1.
+Lgas = int(sys.argv[1])  # Number of gas particles along one axis
+rhoGas = 2.              # Gas density
+P = 1.                   # Gas pressure
+gamma = 5./3.            # Gas adiabatic index
+rhoDM = 1.               # DM density
+Ldm = int(sys.argv[2])   # Number of DM particles along one axis
+
+fileName = "multiTypes.hdf5"
+
+#---------------------------------------------------
+numGas = Lgas**3
+massGas = boxSize**3 * rhoGas / numGas
+internalEnergy = P / ((gamma - 1.)*rhoGas)
+
+numDM = Ldm**3
+massDM = boxSize**3 * rhoDM / numDM
+
+#--------------------------------------------------
+
+# File
+file = h5py.File(fileName, 'w')
+
+# Header
+grp = file.create_group("/Header")
+grp.attrs["BoxSize"] = boxSize
+grp.attrs["NumPart_Total"] =  [numGas, numDM, 0, 0, 0, 0]
+grp.attrs["NumPart_Total_HighWord"] = [0, 0, 0, 0, 0, 0]
+grp.attrs["NumPart_ThisFile"] = [numGas, numDM, 0, 0, 0, 0]
+grp.attrs["Time"] = 0.0
+grp.attrs["NumFilesPerSnapshot"] = 1
+grp.attrs["MassTable"] = [0.0, massDM, 0.0, 0.0, 0.0, 0.0]
+grp.attrs["Flag_Entropy_ICs"] = 0
+
+
+# Runtime parameters
+grp = file.create_group("/RuntimePars")
+grp.attrs["PeriodicBoundariesOn"] = periodic
+
+
+# Gas Particle group
+grp = file.create_group("/PartType0")
+
+v  = zeros((numGas, 3))
+ds = grp.create_dataset('Velocities', (numGas, 3), 'f')
+ds[()] = v
+v = zeros(1)
+
+m = full((numGas, 1), massGas)
+ds = grp.create_dataset('Masses', (numGas,1), 'f')
+ds[()] = m
+m = zeros(1)
+
+h = full((numGas, 1), 1.1255 * boxSize / Lgas)
+ds = grp.create_dataset('SmoothingLength', (numGas,1), 'f')
+ds[()] = h
+h = zeros(1)
+
+u = full((numGas, 1), internalEnergy)
+ds = grp.create_dataset('InternalEnergy', (numGas,1), 'f')
+ds[()] = u
+u = zeros(1)
+
+ids = linspace(0, numGas, numGas, endpoint=False).reshape((numGas,1))
+ds = grp.create_dataset('ParticleIDs', (numGas, 1), 'L')
+ds[()] = ids + 1
+x      = ids % Lgas
+y      = ((ids - x) / Lgas) % Lgas
+z      = (ids - x - Lgas * y) / Lgas**2
+coords = zeros((numGas, 3))
+coords[:,0] = z[:,0] * boxSize / Lgas + boxSize / (2*Lgas)
+coords[:,1] = y[:,0] * boxSize / Lgas + boxSize / (2*Lgas)
+coords[:,2] = x[:,0] * boxSize / Lgas + boxSize / (2*Lgas)
+ds = grp.create_dataset('Coordinates', (numGas, 3), 'd')
+ds[()] = coords
+
+
+
+
+
+# DM Particle group
+grp = file.create_group("/PartType1")
+
+v  = zeros((numDM, 3))
+ds = grp.create_dataset('Velocities', (numDM, 3), 'f')
+ds[()] = v
+v = zeros(1)
+
+m = full((numDM, 1), massDM)
+ds = grp.create_dataset('Masses', (numDM,1), 'f')
+ds[()] = m
+m = zeros(1)
+
+ids = linspace(0, numDM, numDM, endpoint=False).reshape((numDM,1))
+ds = grp.create_dataset('ParticleIDs', (numDM, 1), 'L')
+ds[()] = ids + Lgas**3 + 1
+x      = ids % Ldm
+y      = ((ids - x) / Ldm) % Ldm
+z      = (ids - x - Ldm * y) / Ldm**2
+coords = zeros((numDM, 3))
+coords[:,0] = z[:,0] * boxSize / Ldm + boxSize / (2*Ldm)
+coords[:,1] = y[:,0] * boxSize / Ldm + boxSize / (2*Ldm)
+coords[:,2] = x[:,0] * boxSize / Ldm + boxSize / (2*Ldm)
+ds = grp.create_dataset('Coordinates', (numDM, 3), 'd')
+ds[()] = coords
+
+file.close()
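
A quick way to check the generated ICs is to read the file back and compare the
header against the particle arrays. The snippet below is a minimal sanity-check
sketch, not part of the patch; it assumes the multiTypes.hdf5 file produced by
the script above.

import h5py
import numpy as np

with h5py.File("multiTypes.hdf5", "r") as f:
    header = f["/Header"].attrs
    boxSize = header["BoxSize"]
    nGas, nDM = header["NumPart_ThisFile"][0], header["NumPart_ThisFile"][1]

    # Coordinate arrays must match the header counts.
    gas_pos = f["/PartType0/Coordinates"][...]
    dm_pos = f["/PartType1/Coordinates"][...]
    assert gas_pos.shape == (nGas, 3) and dm_pos.shape == (nDM, 3)

    # All positions must lie inside the periodic box.
    assert np.all((gas_pos >= 0.0) & (gas_pos < boxSize))
    assert np.all((dm_pos >= 0.0) & (dm_pos < boxSize))

    # Particle IDs must be unique across both particle types.
    ids = np.concatenate([f["/PartType0/ParticleIDs"][...].ravel(),
                          f["/PartType1/ParticleIDs"][...].ravel()])
    assert ids.size == np.unique(ids).size

print("multiTypes.hdf5 looks consistent")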
diff --git a/examples/main.c b/examples/main.c
index c88f92a07a747c327692b5e0fbbc7dc07b93ac0c..6bb16d711cb3d44e8843279e76d4cd24f5ce0cc4 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -1,3 +1,4 @@
+
 /*******************************************************************************
  * This file is part of SWIFT.
  * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
@@ -79,7 +80,9 @@ int main(int argc, char *argv[]) {
   int nr_nodes = 1, myrank = 0;
   FILE *file_thread;
   int with_outputs = 1;
-  int verbose = 0, talking;
+  int with_gravity = 0;
+  int engine_policies = 0;
+  int verbose = 0, talking = 0;
   unsigned long long cpufreq = 0;
 
 #ifdef WITH_MPI
@@ -97,12 +100,15 @@ int main(int argc, char *argv[]) {
 #endif
 #endif
 
-/* Choke on FP-exceptions. */
-// feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );
+  /* Choke on FP-exceptions. */
+  // feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );
+
+  /* Initialize CPU frequency, this also starts time. */
+  clocks_set_cpufreq(cpufreq);
 
 #ifdef WITH_MPI
   /* Start by initializing MPI. */
-  int res, prov;
+  int res = 0, prov = 0;
   if ((res = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &prov)) !=
       MPI_SUCCESS)
     error("Call to MPI_Init failed with error %i.", res);
@@ -128,9 +134,6 @@ int main(int argc, char *argv[]) {
          &initial_partition.grid[1], &initial_partition.grid[0]);
 #endif
 
-  /* Initialize CPU frequency, this also starts time. */
-  clocks_set_cpufreq(cpufreq);
-
   /* Greeting message */
   if (myrank == 0) greetings();
 
@@ -156,7 +159,7 @@ int main(int argc, char *argv[]) {
   bzero(&s, sizeof(struct space));
 
   /* Parse the options */
-  while ((c = getopt(argc, argv, "a:c:d:e:f:h:m:oP:q:R:s:t:v:w:y:z:")) != -1)
+  while ((c = getopt(argc, argv, "a:c:d:e:f:gh:m:oP:q:R:s:t:v:w:y:z:")) != -1)
     switch (c) {
       case 'a':
         if (sscanf(optarg, "%lf", &scaling) != 1)
@@ -185,6 +188,9 @@ int main(int argc, char *argv[]) {
       case 'f':
         if (!strcpy(ICfileName, optarg)) error("Error parsing IC file name.");
         break;
+      case 'g':
+        with_gravity = 1;
+        break;
       case 'h':
         if (sscanf(optarg, "%llu", &cpufreq) != 1)
           error("Error parsing CPU frequency.");
@@ -356,11 +362,11 @@ int main(int argc, char *argv[]) {
   if (myrank == 0) clocks_gettime(&tic);
 #if defined(WITH_MPI)
 #if defined(HAVE_PARALLEL_HDF5)
-  read_ic_parallel(ICfileName, dim, &parts, &Ngas, &periodic, myrank, nr_nodes,
-                   MPI_COMM_WORLD, MPI_INFO_NULL);
+  read_ic_parallel(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic,
+                   myrank, nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL);
 #else
-  read_ic_serial(ICfileName, dim, &parts, &Ngas, &periodic, myrank, nr_nodes,
-                 MPI_COMM_WORLD, MPI_INFO_NULL);
+  read_ic_serial(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic,
+                 myrank, nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL);
 #endif
 #else
   read_ic_single(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic);
@@ -376,6 +382,7 @@ int main(int argc, char *argv[]) {
 #if defined(WITH_MPI)
   long long N_long[2] = {Ngas, Ngpart};
   MPI_Reduce(&N_long, &N_total, 2, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+  N_total[1] -= N_total[0];
   if (myrank == 0)
     message("Read %lld gas particles and %lld DM particles from the ICs",
             N_total[0], N_total[1]);
@@ -383,9 +390,34 @@ int main(int argc, char *argv[]) {
   N_total[0] = Ngas;
   N_total[1] = Ngpart - Ngas;
   message("Read %lld gas particles and %lld DM particles from the ICs",
-	  N_total[0], N_total[1]);
+          N_total[0], N_total[1]);
 #endif
 
+  /* MATTHIEU: Temporary fix to preserve master */
+  if (!with_gravity) {
+    free(gparts);
+    for (size_t k = 0; k < Ngas; ++k)
+      parts[k].gpart = NULL;
+    Ngpart = 0;
+#if defined(WITH_MPI)
+    N_long[0] = Ngas;
+    N_long[1] = Ngpart;
+    MPI_Reduce(&N_long, &N_total, 2, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+    if (myrank == 0)
+      message(
+          "AFTER FIX: Read %lld gas particles and %lld DM particles from the "
+          "ICs",
+          N_total[0], N_total[1]);
+#else
+    N_total[0] = Ngas;
+    N_total[1] = Ngpart;
+    message(
+        "AFTER FIX: Read %lld gas particles and %lld DM particles from the ICs",
+        N_total[0], N_total[1]);
+#endif
+  }
+  /* MATTHIEU: End temporary fix */
+
   /* Apply h scaling */
   if (scaling != 1.0)
     for (size_t k = 0; k < Ngas; k++) parts[k].h *= scaling;
@@ -448,12 +480,15 @@ int main(int argc, char *argv[]) {
     message("nr of cells at depth %i is %i.", data[0], data[1]);
   }
 
+  /* Construct the engine policy */
+  engine_policies = ENGINE_POLICY | engine_policy_steal | engine_policy_hydro;
+  if (with_gravity) engine_policies |= engine_policy_external_gravity;
+
   /* Initialize the engine with this space. */
   if (myrank == 0) clocks_gettime(&tic);
   if (myrank == 0) message("nr_nodes is %i.", nr_nodes);
   engine_init(&e, &s, dt_max, nr_threads, nr_queues, nr_nodes, myrank,
-              ENGINE_POLICY | engine_policy_steal | engine_policy_hydro, 0,
-              time_end, dt_min, dt_max, talking);
+              engine_policies, 0, time_end, dt_min, dt_max, talking);
   if (myrank == 0 && verbose) {
     clocks_gettime(&toc);
     message("engine_init took %.3f %s.", clocks_diff(&tic, &toc),
diff --git a/src/Makefile.am b/src/Makefile.am
index 8a4e7cdc4b849e8f99bdf9a3dbda9b30fa986332..f44d47819672d10445fd969fe2ff20dbcb49463b 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -46,9 +46,9 @@ AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
 # Include files for distribution, not installation.
 nobase_noinst_HEADERS = approx_math.h atomic.h cycle.h error.h inline.h kernel.h vector.h \
 		 runner_doiact.h runner_doiact_grav.h units.h intrinsics.h minmax.h \
-		 gravity.h \
-		 gravity/Default/gravity.h gravity/Default/runner_iact_grav.h \
-		 gravity/Default/gravity_part.h \
+		 gravity.h gravity_io.h \
+		 gravity/Default/gravity.h gravity/Default/gravity_iact.h gravity/Default/gravity_io.h \
+		 gravity/Default/gravity_debug.h gravity/Default/gravity_part.h  \
 	 	 hydro.h hydro_io.h \
 		 hydro/Minimal/hydro.h hydro/Minimal/hydro_iact.h hydro/Minimal/hydro_io.h \
                  hydro/Minimal/hydro_debug.h hydro/Minimal/hydro_part.h \
diff --git a/src/common_io.c b/src/common_io.c
index f6a4803333581b69671e3adc223b46122ec5364c..5fb2d9513ec2acc0cd8d389a226b14d427e02539 100644
--- a/src/common_io.c
+++ b/src/common_io.c
@@ -45,6 +45,9 @@
 #include "kernel.h"
 #include "version.h"
 
+const char* particle_type_names[NUM_PARTICLE_TYPES] = {
+    "Gas", "DM", "Boundary", "Dummy", "Star", "BH"};
+
 /**
  * @brief Converts a C data type to the HDF5 equivalent.
  *
@@ -402,52 +405,68 @@ void createXMFfile() {
  *snapshot
  *
  * @param xmfFile The file to write in.
- * @param Nparts The number of particles.
  * @param hdfFileName The name of the HDF5 file corresponding to this output.
  * @param time The current simulation time.
  */
-void writeXMFheader(FILE* xmfFile, long long Nparts, char* hdfFileName,
-                    float time) {
+void writeXMFoutputheader(FILE* xmfFile, char* hdfFileName, float time) {
   /* Write end of file */
 
+  fprintf(xmfFile, "<!-- XMF description for file: %s -->\n", hdfFileName);
   fprintf(xmfFile,
           "<Grid GridType=\"Collection\" CollectionType=\"Spatial\">\n");
   fprintf(xmfFile, "<Time Type=\"Single\" Value=\"%f\"/>\n", time);
-  fprintf(xmfFile, "<Grid Name=\"Gas\" GridType=\"Uniform\">\n");
-  fprintf(xmfFile,
-          "<Topology TopologyType=\"Polyvertex\" Dimensions=\"%lld\"/>\n",
-          Nparts);
-  fprintf(xmfFile, "<Geometry GeometryType=\"XYZ\">\n");
-  fprintf(xmfFile,
-          "<DataItem Dimensions=\"%lld 3\" NumberType=\"Double\" "
-          "Precision=\"8\" "
-          "Format=\"HDF\">%s:/PartType0/Coordinates</DataItem>\n",
-          Nparts, hdfFileName);
-  fprintf(xmfFile, "</Geometry>");
 }
 
 /**
  * @brief Writes the end of the XMF file (closes all open markups)
  *
  * @param xmfFile The file to write in.
+ * @param output The number of this output.
+ * @param time The current simulation time.
  */
-void writeXMFfooter(FILE* xmfFile) {
+void writeXMFoutputfooter(FILE* xmfFile, int output, float time) {
   /* Write end of the section of this time step */
 
-  fprintf(xmfFile, "\n</Grid>\n");
-  fprintf(xmfFile, "</Grid>\n");
-  fprintf(xmfFile, "\n</Grid>\n");
+  fprintf(xmfFile,
+          "\n</Grid> <!-- End of meta-data for output=%03i, time=%f -->\n",
+          output, time);
+  fprintf(xmfFile, "\n</Grid> <!-- timeSeries -->\n");
   fprintf(xmfFile, "</Domain>\n");
   fprintf(xmfFile, "</Xdmf>\n");
 
   fclose(xmfFile);
 }
 
+void writeXMFgroupheader(FILE* xmfFile, char* hdfFileName, size_t N,
+                         enum PARTICLE_TYPE ptype) {
+  fprintf(xmfFile, "\n<Grid Name=\"%s\" GridType=\"Uniform\">\n",
+          particle_type_names[ptype]);
+  fprintf(xmfFile,
+          "<Topology TopologyType=\"Polyvertex\" Dimensions=\"%zi\"/>\n", N);
+  fprintf(xmfFile, "<Geometry GeometryType=\"XYZ\">\n");
+  fprintf(xmfFile,
+          "<DataItem Dimensions=\"%zi 3\" NumberType=\"Double\" "
+          "Precision=\"8\" "
+          "Format=\"HDF\">%s:/PartType%d/Coordinates</DataItem>\n",
+          N, hdfFileName, ptype);
+  fprintf(xmfFile,
+          "</Geometry>\n <!-- Done geometry for %s, start of particle fields "
+          "list -->\n",
+          particle_type_names[ptype]);
+}
+
+void writeXMFgroupfooter(FILE* xmfFile, enum PARTICLE_TYPE ptype) {
+  fprintf(xmfFile, "</Grid> <!-- End of meta-data for parttype=%s -->\n",
+          particle_type_names[ptype]);
+}
+
 /**
  * @brief Writes the lines corresponding to an array of the HDF5 output
  *
  * @param xmfFile The file in which to write
  * @param fileName The name of the HDF5 file associated to this XMF descriptor.
+ * @param partTypeGroupName The name of the group containing the particles in
+ * the HDF5 file.
  * @param name The name of the array in the HDF5 file.
  * @param N The number of particles.
  * @param dim The dimension of the quantity (1 for scalars, 3 for vectors).
@@ -455,21 +474,21 @@ void writeXMFfooter(FILE* xmfFile) {
  *
  * @todo Treat the types in a better way.
  */
-void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N,
-                  int dim, enum DATA_TYPE type) {
+void writeXMFline(FILE* xmfFile, char* fileName, char* partTypeGroupName,
+                  char* name, size_t N, int dim, enum DATA_TYPE type) {
   fprintf(xmfFile,
           "<Attribute Name=\"%s\" AttributeType=\"%s\" Center=\"Node\">\n",
           name, dim == 1 ? "Scalar" : "Vector");
   if (dim == 1)
     fprintf(xmfFile,
-            "<DataItem Dimensions=\"%lld\" NumberType=\"Double\" "
-            "Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n",
-            N, type == FLOAT ? 4 : 8, fileName, name);
+            "<DataItem Dimensions=\"%zi\" NumberType=\"Double\" "
+            "Precision=\"%d\" Format=\"HDF\">%s:%s/%s</DataItem>\n",
+            N, type == FLOAT ? 4 : 8, fileName, partTypeGroupName, name);
   else
     fprintf(xmfFile,
-            "<DataItem Dimensions=\"%lld %d\" NumberType=\"Double\" "
-            "Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n",
-            N, dim, type == FLOAT ? 4 : 8, fileName, name);
+            "<DataItem Dimensions=\"%zi %d\" NumberType=\"Double\" "
+            "Precision=\"%d\" Format=\"HDF\">%s:%s/%s</DataItem>\n",
+            N, dim, type == FLOAT ? 4 : 8, fileName, partTypeGroupName, name);
   fprintf(xmfFile, "</Attribute>\n");
 }
 
@@ -489,7 +508,8 @@ void prepare_dm_gparts(struct gpart* gparts, size_t Ndm) {
   for (size_t i = 0; i < Ndm; ++i) {
 
     /* 0 or negative ids are not allowed */
-    if (gparts[i].id <= 0) error("0 or negative ID for DM particle");
+    if (gparts[i].id <= 0)
+      error("0 or negative ID for DM particle %zd: ID=%lld", i, gparts[i].id);
 
     gparts[i].id = -gparts[i].id;
   }
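
To make the new per-particle-type layout concrete: one output now opens a
collection grid, then writes one uniform grid per particle type (geometry plus
one attribute per array), then closes the collection. The Python sketch below
is illustrative only (simplified format strings, made-up file and array names)
and mirrors the nesting produced by writeXMFoutputheader, writeXMFgroupheader,
writeXMFline, writeXMFgroupfooter and writeXMFoutputfooter.

def xmf_for_output(hdf_file, time, groups):
    out = ['<Grid GridType="Collection" CollectionType="Spatial">',
           '<Time Type="Single" Value="%f"/>' % time]
    for type_name, type_index, n, arrays in groups:
        out.append('<Grid Name="%s" GridType="Uniform">' % type_name)
        out.append('<Topology TopologyType="Polyvertex" Dimensions="%d"/>' % n)
        out.append('<Geometry GeometryType="XYZ"><DataItem Dimensions="%d 3" '
                   'Format="HDF">%s:/PartType%d/Coordinates</DataItem>'
                   '</Geometry>' % (n, hdf_file, type_index))
        for name, dim in arrays:
            out.append('<Attribute Name="%s" AttributeType="%s" Center="Node">'
                       '<DataItem Dimensions="%d %d" Format="HDF">'
                       '%s:/PartType%d/%s</DataItem></Attribute>'
                       % (name, "Scalar" if dim == 1 else "Vector",
                          n, dim, hdf_file, type_index, name))
        out.append('</Grid> <!-- parttype=%s -->' % type_name)
    out.append('</Grid> <!-- timeSeries -->')
    return "\n".join(out)

print(xmf_for_output("output_000.hdf5", 0.0,
                     [("Gas", 0, 1000, [("Density", 1), ("Velocities", 3)]),
                      ("DM", 1, 1000, [("Velocities", 3)])]))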
diff --git a/src/common_io.h b/src/common_io.h
index 2623a03f9a25ce0e650dde4f698da6eb49177e26..4ad0c6fb754c4288a0c731e2b1e2392998719d52 100644
--- a/src/common_io.h
+++ b/src/common_io.h
@@ -70,6 +70,11 @@ enum PARTICLE_TYPE {
   NUM_PARTICLE_TYPES
 };
 
+extern const char* particle_type_names[];
+
+#define FILENAME_BUFFER_SIZE 150
+#define PARTICLE_GROUP_BUFFER_SIZE 20
+
 hid_t hdf5Type(enum DATA_TYPE type);
 size_t sizeOfType(enum DATA_TYPE type);
 
@@ -92,10 +97,13 @@ void writeAttribute_s(hid_t grp, char* name, const char* str);
 
 void createXMFfile();
 FILE* prepareXMFfile();
-void writeXMFfooter(FILE* xmfFile);
-void writeXMFheader(FILE* xmfFile, long long N, char* hdfFileName, float time);
-void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N,
-                  int dim, enum DATA_TYPE type);
+void writeXMFoutputheader(FILE* xmfFile, char* hdfFileName, float time);
+void writeXMFoutputfooter(FILE* xmfFile, int outputCount, float time);
+void writeXMFgroupheader(FILE* xmfFile, char* hdfFileName, size_t N,
+                         enum PARTICLE_TYPE ptype);
+void writeXMFgroupfooter(FILE* xmfFile, enum PARTICLE_TYPE ptype);
+void writeXMFline(FILE* xmfFile, char* fileName, char* partTypeGroupName,
+                  char* name, size_t N, int dim, enum DATA_TYPE type);
 
 void writeCodeDescription(hid_t h_file);
 void writeSPHflavour(hid_t h_file);
diff --git a/src/engine.c b/src/engine.c
index e4d77df941a70a592ff7e68d50780bf341910939..e4872bf17afe17aa808495dc70bd062641b893a8 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -87,14 +87,17 @@ struct link *engine_addlink(struct engine *e, struct link *l, struct task *t) {
 }
 
 /**
- * @brief Generate the ghost and kick tasks for a hierarchy of cells.
+ * @brief Generate the ghost and other O(Npart) tasks for a hierarchy of cells.
+ *
+ * Tasks are only created here. The dependencies will be added later on.
  *
  * @param e The #engine.
  * @param c The #cell.
  * @param super The super #cell.
  */
 
-void engine_mkghosts(struct engine *e, struct cell *c, struct cell *super) {
+void engine_make_ghost_tasks(struct engine *e, struct cell *c,
+                             struct cell *super) {
 
   struct scheduler *s = &e->sched;
 
@@ -128,7 +131,8 @@ void engine_mkghosts(struct engine *e, struct cell *c, struct cell *super) {
   /* Recurse. */
   if (c->split)
     for (int k = 0; k < 8; k++)
-      if (c->progeny[k] != NULL) engine_mkghosts(e, c->progeny[k], super);
+      if (c->progeny[k] != NULL)
+        engine_make_ghost_tasks(e, c->progeny[k], super);
 }
 
 /**
@@ -256,13 +260,11 @@ void engine_redistribute(struct engine *e) {
   }
 
   /* Verify that all parts are in the right place. */
-  /* for ( k = 0 ; k < nr_parts ; k++ ) {
-      cid = cell_getid( cdim , parts_new[k].x[0]*ih[0] , parts_new[k].x[1]*ih[1]
-     , parts_new[k].x[2]*ih[2] );
+  /* for ( int k = 0 ; k < nr_parts ; k++ ) {
+      int cid = cell_getid( cdim , parts_new[k].x[0]*ih[0], parts_new[k].x[1]*ih[1], parts_new[k].x[2]*ih[2] );
       if ( cells[ cid ].nodeID != nodeID )
-          error( "Received particle (%i) that does not belong here (nodeID=%i)."
-     , k , cells[ cid ].nodeID );
-      } */
+          error( "Received particle (%i) that does not belong here (nodeID=%i).", k , cells[ cid ].nodeID );
+    } */
 
   /* Set the new part data, free the old. */
   free(parts);
@@ -509,7 +511,7 @@ void engine_exchange_cells(struct engine *e) {
 
   /* Wait for each count to come in and start the recv. */
   for (int k = 0; k < nr_proxies; k++) {
-    int pid;
+    int pid = MPI_UNDEFINED;
     if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
         pid == MPI_UNDEFINED)
       error("MPI_Waitany failed.");
@@ -529,7 +531,7 @@ void engine_exchange_cells(struct engine *e) {
 
   /* Wait for each pcell array to come in from the proxies. */
   for (int k = 0; k < nr_proxies; k++) {
-    int pid;
+    int pid = MPI_UNDEFINED;
     if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
         pid == MPI_UNDEFINED)
       error("MPI_Waitany failed.");
@@ -633,7 +635,7 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
 
   /* Wait for each count to come in and start the recv. */
   for (int k = 0; k < e->nr_proxies; k++) {
-    int pid;
+    int pid = MPI_UNDEFINED;
     if (MPI_Waitany(e->nr_proxies, reqs_in, &pid, MPI_STATUS_IGNORE) !=
             MPI_SUCCESS ||
         pid == MPI_UNDEFINED)
@@ -689,7 +691,7 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
      parts from the proxies. */
   size_t count = 0;
   for (int k = 0; k < 2 * (nr_in + nr_out); k++) {
-    int err, pid;
+    int err = 0, pid = MPI_UNDEFINED;
     if ((err = MPI_Waitany(2 * e->nr_proxies, reqs_in, &pid,
                            MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
       char buff[MPI_MAX_ERROR_STRING];
@@ -735,43 +737,42 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
 }
 
 /**
- * @brief Fill the #space's task list.
+ * @brief Constructs the top-level pair tasks for the first hydro loop over
+ * neighbours
  *
- * @param e The #engine we are working with.
+ * Here we construct all the tasks for all possible neighbouring non-empty
+ * local cells in the hierarchy. No dependencies are being added thus far.
+ * Additional loops over neighbours can later be added by simply duplicating
+ * all the tasks created by this function.
+ *
+ * @param e The #engine.
  */
-
-void engine_maketasks(struct engine *e) {
+void engine_make_hydroloop_tasks(struct engine *e) {
 
   struct space *s = e->s;
   struct scheduler *sched = &e->sched;
-  struct cell *cells = s->cells;
-  const int nr_cells = s->nr_cells;
   const int nodeID = e->nodeID;
   const int *cdim = s->cdim;
-  const ticks tic = getticks();
-
-  /* Re-set the scheduler. */
-  scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell);
-
-  /* Add the space sorting tasks. */
-  for (int i = 0; i < e->nr_threads; i++) {
-    scheduler_addtask(sched, task_type_part_sort, task_subtype_none, i, 0, NULL,
-                      NULL, 0);
-    scheduler_addtask(sched, task_type_gpart_sort, task_subtype_none, i, 0,
-                      NULL, NULL, 0);
-  }
+  struct cell *cells = s->cells;
 
   /* Run through the highest level of cells and add pairs. */
-  for (int i = 0; i < cdim[0]; i++)
-    for (int j = 0; j < cdim[1]; j++)
+  for (int i = 0; i < cdim[0]; i++) {
+    for (int j = 0; j < cdim[1]; j++) {
       for (int k = 0; k < cdim[2]; k++) {
-        int cid = cell_getid(cdim, i, j, k);
-        if (cells[cid].count == 0) continue;
+
+        /* Get the cell */
+        const int cid = cell_getid(cdim, i, j, k);
         struct cell *ci = &cells[cid];
+
+        /* Skip cells without hydro particles */
         if (ci->count == 0) continue;
+
+        /* If the cell is local, build a self-interaction */
         if (ci->nodeID == nodeID)
           scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0,
                             ci, NULL, 0);
+
+        /* Now loop over all the neighbours of this cell */
         for (int ii = -1; ii < 2; ii++) {
           int iii = i + ii;
           if (!s->periodic && (iii < 0 || iii >= cdim[0])) continue;
@@ -784,67 +785,43 @@ void engine_maketasks(struct engine *e) {
               int kkk = k + kk;
               if (!s->periodic && (kkk < 0 || kkk >= cdim[2])) continue;
               kkk = (kkk + cdim[2]) % cdim[2];
-              int cjd = cell_getid(cdim, iii, jjj, kkk);
+
+              /* Get the neighbouring cell */
+              const int cjd = cell_getid(cdim, iii, jjj, kkk);
               struct cell *cj = &cells[cjd];
+
+              /* Is that neighbour local and does it have particles? */
               if (cid >= cjd || cj->count == 0 ||
                   (ci->nodeID != nodeID && cj->nodeID != nodeID))
                 continue;
-              int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))];
+
+              /* Construct the pair task */
+              const int sid =
+                  sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))];
               scheduler_addtask(sched, task_type_pair, task_subtype_density,
                                 sid, 0, ci, cj, 1);
             }
           }
         }
       }
+    }
+  }
+}
 
-  /* /\* Add the gravity mm tasks. *\/ */
-  /* for (int i = 0; i < nr_cells; i++) */
-  /*   if (cells[i].gcount > 0) { */
-  /*     scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0,
-   */
-  /*                       &cells[i], NULL, 0); */
-  /*     for (int j = i + 1; j < nr_cells; j++) */
-  /*       if (cells[j].gcount > 0) */
-  /*         scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1,
-   * 0, */
-  /*                           &cells[i], &cells[j], 0); */
-  /* } */
-
-  /* Split the tasks. */
-  scheduler_splittasks(sched);
-
-  /* Allocate the list of cell-task links. The maximum number of links
-     is the number of cells (s->tot_cells) times the number of neighbours (27)
-     times the number of interaction types (2, density and force). */
-  if (e->links != NULL) free(e->links);
-  e->size_links = s->tot_cells * 27 * 2;
-  if ((e->links = malloc(sizeof(struct link) * e->size_links)) == NULL)
-    error("Failed to allocate cell-task links.");
-  e->nr_links = 0;
+/**
+ * @brief Counts the tasks associated with each cell and constructs the links
+ *
+ * For each hydrodynamic task, construct the links with the corresponding cell.
+ * Similarly, construct the dependencies for all the sorting tasks.
+ *
+ * @param e The #engine.
+ */
+void engine_count_and_link_tasks(struct engine *e) {
 
-  /* /\* Add the gravity up/down tasks at the top-level cells and push them
-   * down. *\/ */
-  /* for (int k = 0; k < nr_cells; k++) */
-  /*   if (cells[k].nodeID == nodeID && cells[k].gcount > 0) { */
-
-  /*     /\* Create tasks at top level. *\/ */
-  /*     struct task *up = */
-  /*         scheduler_addtask(sched, task_type_grav_up, task_subtype_none, 0,
-   * 0, */
-  /*                           &cells[k], NULL, 0); */
-  /*     struct task *down = */
-  /*         scheduler_addtask(sched, task_type_grav_down, task_subtype_none, 0,
-   * 0, */
-  /*                           &cells[k], NULL, 0); */
-
-  /*     /\* Push tasks down the cell hierarchy. *\/ */
-  /*     engine_addtasks_grav(e, &cells[k], up, down); */
-  /*   } */
+  struct scheduler *sched = &e->sched;
+  const int nr_tasks = sched->nr_tasks;
 
-  /* Count the number of tasks associated with each cell and
-     store the density tasks in each cell, and make each sort
-     depend on the sorts of its progeny. */
-  for (int k = 0; k < sched->nr_tasks; k++) {
+  for (int k = 0; k < nr_tasks; k++) {
 
     /* Get the current task. */
     struct task *t = &sched->tasks[k];
@@ -899,16 +876,27 @@ void engine_maketasks(struct engine *e) {
     /*   } */
     /* } */
   }
+}
 
-  /* Append a ghost task to each cell, and add kick tasks to the
-     super cells. */
-  for (int k = 0; k < nr_cells; k++) engine_mkghosts(e, &cells[k], NULL);
+/**
+ * @brief Duplicates the first hydro loop and constructs all the
+ * dependencies for the hydro part
+ *
+ * This is done by looping over all the previously constructed tasks
+ * and adding another task involving the same cells but this time
+ * corresponding to the second hydro loop over neighbours.
+ * With all the relevant tasks for a given cell available, we construct
+ * all the dependencies for that cell.
+ *
+ * @param e The #engine.
+ */
+void engine_make_extra_hydroloop_tasks(struct engine *e) {
 
-  /* Run through the tasks and make force tasks for each density task.
-     Each force task depends on the cell ghosts and unlocks the kick task
-     of its super-cell. */
-  int sched_nr_tasks = sched->nr_tasks;
-  for (int k = 0; k < sched_nr_tasks; k++) {
+  struct scheduler *sched = &e->sched;
+  const int nodeID = e->nodeID;
+  const int nr_tasks = sched->nr_tasks;
+
+  for (int k = 0; k < nr_tasks; k++) {
 
     /* Get a pointer to the task. */
     struct task *t = &sched->tasks[k];
@@ -918,20 +906,39 @@ void engine_maketasks(struct engine *e) {
 
     /* Self-interaction? */
     if (t->type == task_type_self && t->subtype == task_subtype_density) {
-      scheduler_addunlock(sched, t->ci->super->init, t);
-      scheduler_addunlock(sched, t, t->ci->super->ghost);
+
+      /* Start by constructing the task for the second hydro loop */
       struct task *t2 = scheduler_addtask(
           sched, task_type_self, task_subtype_force, 0, 0, t->ci, NULL, 0);
-      scheduler_addunlock(sched, t->ci->super->ghost, t2);
-      scheduler_addunlock(sched, t2, t->ci->super->kick);
+
+      /* Add the link between the new loop and the cell */
       t->ci->force = engine_addlink(e, t->ci->force, t2);
       atomic_inc(&t->ci->nr_force);
+
+      /* Now, build all the dependencies for the hydro */
+      /* init --> t (density loop) --> ghost --> t2 (force loop) --> kick */
+      scheduler_addunlock(sched, t->ci->super->init, t);
+      scheduler_addunlock(sched, t, t->ci->super->ghost);
+      scheduler_addunlock(sched, t->ci->super->ghost, t2);
+      scheduler_addunlock(sched, t2, t->ci->super->kick);
     }
 
     /* Otherwise, pair interaction? */
     else if (t->type == task_type_pair && t->subtype == task_subtype_density) {
+
+      /* Start by constructing the task for the second hydro loop */
       struct task *t2 = scheduler_addtask(
           sched, task_type_pair, task_subtype_force, 0, 0, t->ci, t->cj, 0);
+
+      /* Add the link between the new loop and both cells */
+      t->ci->force = engine_addlink(e, t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
+      t->cj->force = engine_addlink(e, t->cj->force, t2);
+      atomic_inc(&t->cj->nr_force);
+
+      /* Now, build all the dependencies for the hydro for the cells */
+      /* that are local and are not descendants of the same super-cell */
+      /* init --> t (density loop) --> ghost --> t2 (force loop) --> kick */
       if (t->ci->nodeID == nodeID) {
         scheduler_addunlock(sched, t->ci->super->init, t);
         scheduler_addunlock(sched, t, t->ci->super->ghost);
@@ -944,17 +951,27 @@ void engine_maketasks(struct engine *e) {
         scheduler_addunlock(sched, t->cj->super->ghost, t2);
         scheduler_addunlock(sched, t2, t->cj->super->kick);
       }
-      t->ci->force = engine_addlink(e, t->ci->force, t2);
-      atomic_inc(&t->ci->nr_force);
-      t->cj->force = engine_addlink(e, t->cj->force, t2);
-      atomic_inc(&t->cj->nr_force);
     }
 
     /* Otherwise, sub interaction? */
     else if (t->type == task_type_sub && t->subtype == task_subtype_density) {
+
+      /* Start by constructing the task for the second hydro loop */
       struct task *t2 =
           scheduler_addtask(sched, task_type_sub, task_subtype_force, t->flags,
                             0, t->ci, t->cj, 0);
+
+      /* Add the link between the new loop and both cells */
+      t->ci->force = engine_addlink(e, t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
+      if (t->cj != NULL) {
+        t->cj->force = engine_addlink(e, t->cj->force, t2);
+        atomic_inc(&t->cj->nr_force);
+      }
+
+      /* Now, build all the dependencies for the hydro for the cells */
+      /* that are local and are not descendants of the same super-cell */
+      /* init --> t (density loop) --> ghost --> t2 (force loop) --> kick */
       if (t->ci->nodeID == nodeID) {
         scheduler_addunlock(sched, t, t->ci->super->ghost);
         scheduler_addunlock(sched, t->ci->super->ghost, t2);
@@ -966,40 +983,163 @@ void engine_maketasks(struct engine *e) {
         scheduler_addunlock(sched, t->cj->super->ghost, t2);
         scheduler_addunlock(sched, t2, t->cj->super->kick);
       }
-      t->ci->force = engine_addlink(e, t->ci->force, t2);
-      atomic_inc(&t->ci->nr_force);
-      if (t->cj != NULL) {
-        t->cj->force = engine_addlink(e, t->cj->force, t2);
-        atomic_inc(&t->cj->nr_force);
-      }
     }
 
     /* /\* Kick tasks should rely on the grav_down tasks of their cell. *\/ */
     /* else if (t->type == task_type_kick && t->ci->grav_down != NULL) */
     /*   scheduler_addunlock(sched, t->ci->grav_down, t); */
   }
+}
 
-/* Add the communication tasks if MPI is being used. */
-#ifdef WITH_MPI
+/**
+ * @brief Constructs the top-level pair tasks for the gravity M-M interactions
+ *
+ * A correct implementation is still lacking here.
+ *
+ * @param e The #engine.
+ */
+void engine_make_gravityinteraction_tasks(struct engine *e) {
 
-  /* Loop over the proxies. */
-  for (int pid = 0; pid < e->nr_proxies; pid++) {
+  struct space *s = e->s;
+  struct scheduler *sched = &e->sched;
+  const int nr_cells = s->nr_cells;
+  struct cell *cells = s->cells;
+
+  /* Loop over all cells. */
+  for (int i = 0; i < nr_cells; i++) {
 
-    /* Get a handle on the proxy. */
-    struct proxy *p = &e->proxies[pid];
+    /* If it has gravity particles, add a self-task */
+    if (cells[i].gcount > 0) {
+      scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0,
+                        &cells[i], NULL, 0);
 
-    /* Loop through the proxy's incoming cells and add the
-       recv tasks. */
-    for (int k = 0; k < p->nr_cells_in; k++)
-      engine_addtasks_recv(e, p->cells_in[k], NULL, NULL);
+      /* Loop over all remaining cells */
+      for (int j = i + 1; j < nr_cells; j++) {
 
-    /* Loop through the proxy's outgoing cells and add the
-       send tasks. */
-    for (int k = 0; k < p->nr_cells_out; k++)
-      engine_addtasks_send(e, p->cells_out[k], p->cells_in[0]);
+        /* If that other cell has gravity parts, add a pair interaction */
+        if (cells[j].gcount > 0) {
+          scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0,
+                            &cells[i], &cells[j], 0);
+        }
+      }
+    }
   }
+}
 
-#endif
+/**
+ * @brief Constructs the gravity tasks building the multipoles and propagating
+ * them to the children
+ *
+ * A correct implementation is still lacking here.
+ *
+ * @param e The #engine.
+ */
+void engine_make_gravityrecursive_tasks(struct engine *e) {
+
+  struct space *s = e->s;
+  struct scheduler *sched = &e->sched;
+  const int nodeID = e->nodeID;
+  const int nr_cells = s->nr_cells;
+  struct cell *cells = s->cells;
+
+  for (int k = 0; k < nr_cells; k++) {
+
+    /* Only do this for local cells containing gravity particles */
+    if (cells[k].nodeID == nodeID && cells[k].gcount > 0) {
+
+      /* Create tasks at top level. */
+      struct task *up =
+          scheduler_addtask(sched, task_type_grav_up, task_subtype_none, 0, 0,
+                            &cells[k], NULL, 0);
+      struct task *down =
+          scheduler_addtask(sched, task_type_grav_down, task_subtype_none, 0, 0,
+                            &cells[k], NULL, 0);
+
+      /* Push tasks down the cell hierarchy. */
+      engine_addtasks_grav(e, &cells[k], up, down);
+    }
+  }
+}
+
+/**
+ * @brief Fill the #space's task list.
+ *
+ * @param e The #engine we are working with.
+ */
+void engine_maketasks(struct engine *e) {
+
+  struct space *s = e->s;
+  struct scheduler *sched = &e->sched;
+  struct cell *cells = s->cells;
+  const int nr_cells = s->nr_cells;
+  const ticks tic = getticks();
+
+  /* Re-set the scheduler. */
+  scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell);
+
+  /* Add the space sorting tasks. */
+  for (int i = 0; i < e->nr_threads; i++)
+    scheduler_addtask(sched, task_type_psort, task_subtype_none, i, 0, NULL,
+                      NULL, 0);
+
+  /* Construct the first hydro loop over neighbours */
+  engine_make_hydroloop_tasks(e);
+
+  /* Add the gravity mm tasks. */
+  if ((e->policy & engine_policy_self_gravity) == engine_policy_self_gravity)
+    engine_make_gravityinteraction_tasks(e);
+
+  /* Split the tasks. */
+  scheduler_splittasks(sched);
+
+  /* Allocate the list of cell-task links. The maximum number of links
+     is the number of cells (s->tot_cells) times the number of neighbours (27)
+     times the number of interaction types (2, density and force). */
+  if (e->links != NULL) free(e->links);
+  e->size_links = s->tot_cells * 27 * 2;
+  if ((e->links = malloc(sizeof(struct link) * e->size_links)) == NULL)
+    error("Failed to allocate cell-task links.");
+  e->nr_links = 0;
+
+  /* Add the gravity up/down tasks at the top-level cells and push them down. */
+  if ((e->policy & engine_policy_self_gravity) == engine_policy_self_gravity)
+    engine_make_gravityrecursive_tasks(e);
+
+  /* Count the number of tasks associated with each cell and
+     store the density tasks in each cell, and make each sort
+     depend on the sorts of its progeny. */
+  engine_count_and_link_tasks(e);
+
+  /* Append a ghost task to each cell, and add kick tasks to the
+     super cells. */
+  for (int k = 0; k < nr_cells; k++)
+    engine_make_ghost_tasks(e, &cells[k], NULL);
+
+  /* Run through the tasks and make force tasks for each density task.
+     Each force task depends on the cell ghosts and unlocks the kick task
+     of its super-cell. */
+  engine_make_extra_hydroloop_tasks(e);
+
+  /* Add the communication tasks if MPI is being used. */
+  if ((e->policy & engine_policy_mpi) == engine_policy_mpi) {
+
+    /* Loop over the proxies. */
+    for (int pid = 0; pid < e->nr_proxies; pid++) {
+
+      /* Get a handle on the proxy. */
+      struct proxy *p = &e->proxies[pid];
+
+      /* Loop through the proxy's incoming cells and add the
+         recv tasks. */
+      for (int k = 0; k < p->nr_cells_in; k++)
+        engine_addtasks_recv(e, p->cells_in[k], NULL, NULL);
+
+      /* Loop through the proxy's outgoing cells and add the
+         send tasks. */
+      for (int k = 0; k < p->nr_cells_out; k++)
+        engine_addtasks_send(e, p->cells_out[k], p->cells_in[0]);
+    }
+  }
 
   /* Set the unlocks per task. */
   scheduler_set_unlocks(sched);
@@ -1027,9 +1167,10 @@ void engine_maketasks(struct engine *e) {
 int engine_marktasks(struct engine *e) {
 
   struct scheduler *s = &e->sched;
-  const int nr_tasks = s->nr_tasks, *ind = s->tasks_ind;
+  const int ti_end = e->ti_current;
+  const int nr_tasks = s->nr_tasks;
+  const int *const ind = s->tasks_ind;
   struct task *tasks = s->tasks;
-  const float ti_end = e->ti_current;
   const ticks tic = getticks();
 
   /* Much less to do here if we're on a fixed time-step. */
@@ -1195,7 +1336,7 @@ void engine_print_task_counts(struct engine *e) {
 
 void engine_rebuild(struct engine *e) {
 
-  ticks tic = getticks();
+  const ticks tic = getticks();
 
   /* Clear the forcerebuild flag, whatever it was. */
   e->forcerebuild = 0;
@@ -1238,7 +1379,7 @@ void engine_prepare(struct engine *e) {
 
 /* Collect the values of rebuild from all nodes. */
 #ifdef WITH_MPI
-  int buff;
+  int buff = 0;
   if (MPI_Allreduce(&rebuild, &buff, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD) !=
       MPI_SUCCESS)
     error("Failed to aggregate the rebuild flag across nodes.");
@@ -1529,7 +1670,8 @@ void engine_step(struct engine *e) {
 /* Aggregate the data from the different nodes. */
 #ifdef WITH_MPI
   {
-    int in_i[4], out_i[4];
+    int in_i[1], out_i[1];
+    in_i[0] = 0;
     out_i[0] = ti_end_min;
     if (MPI_Allreduce(out_i, in_i, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD) !=
         MPI_SUCCESS)
@@ -1871,6 +2013,7 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
   e->dt_max = dt_max;
   e->file_stats = NULL;
   e->verbose = verbose;
+  e->count_step = 0;
   e->wallclock_time = 0.f;
   engine_rank = nodeID;
 
diff --git a/src/gravity.h b/src/gravity.h
index d5361095c744e2df069ccaf99447dcd58e10dcd6..f737a0ab882592cffb974f66ccbb62fa3d16d408 100644
--- a/src/gravity.h
+++ b/src/gravity.h
@@ -24,6 +24,6 @@
 /* So far only one model here */
 /* Straight-forward import */
 #include "./gravity/Default/gravity.h"
-#include "./gravity/Default/runner_iact_grav.h"
+#include "./gravity/Default/gravity_iact.h"
 
 #endif
diff --git a/src/gravity/Default/runner_iact_grav.h b/src/gravity/Default/gravity_iact.h
similarity index 97%
rename from src/gravity/Default/runner_iact_grav.h
rename to src/gravity/Default/gravity_iact.h
index e62be446e8263bf02e3fd73f902b28cb1c3b16cf..37c9e912fc38cddc1dac3bbd8c69cb11ae219f0d 100644
--- a/src/gravity/Default/runner_iact_grav.h
+++ b/src/gravity/Default/gravity_iact.h
@@ -25,16 +25,9 @@
 #include "kernel.h"
 #include "vector.h"
 
-/**
- * @file  runner_iact_grav.h
- * @brief Gravity interaction functions.
- *
- */
-
 /**
  * @brief Gravity potential
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_grav(
     float r2, float *dx, struct gpart *pi, struct gpart *pj) {
 
diff --git a/src/gravity/Default/gravity_io.h b/src/gravity/Default/gravity_io.h
index d707d69631e65eed8ad21a7fa9601c07d3c71263..129c4b39828ca73d2d80d79edbdaa8ec4d5a9e01 100644
--- a/src/gravity/Default/gravity_io.h
+++ b/src/gravity/Default/gravity_io.h
@@ -48,6 +48,8 @@ __attribute__((always_inline)) INLINE static void darkmatter_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unsued in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ * the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param Ndm The number of DM particles on that MPI rank.
  * @param Ndm_total The total number of g-particles (only used in MPI mode)
@@ -59,17 +61,20 @@ __attribute__((always_inline)) INLINE static void darkmatter_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void darkmatter_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int Ndm, long long Ndm_total,
-    int mpi_rank, long long offset, struct gpart* gparts,
-    struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile,
+    int Ndm, long long Ndm_total, int mpi_rank, long long offset,
+    struct gpart* gparts, struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, Ndm, 3, gparts,
-             Ndm_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, Ndm, 1, gparts,
-             Ndm_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, Ndm, 3, gparts,
-             Ndm_total, mpi_rank, offset, v_full, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, Ndm, 1, gparts,
-             Ndm_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             Ndm, 3, gparts, Ndm_total, mpi_rank, offset, x, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, Ndm,
+             1, gparts, Ndm_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             Ndm, 3, gparts, Ndm_total, mpi_rank, offset, v_full, us,
+             UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, Ndm, 1, gparts, Ndm_total, mpi_rank, offset, id, us,
+             UNIT_CONV_NO_UNITS);
 }
diff --git a/src/gravity/Default/gravity_part.h b/src/gravity/Default/gravity_part.h
index 7ce7b81892582f2a90f7dd07f7f244c0d4ed8afb..aaaab2055d8ce05d77a37d3f40e096bd91d87926 100644
--- a/src/gravity/Default/gravity_part.h
+++ b/src/gravity/Default/gravity_part.h
@@ -50,4 +50,4 @@ struct gpart {
     struct part* part;
   };
 
-} __attribute__((aligned(part_align)));
+} __attribute__((aligned(gpart_align)));
diff --git a/src/hydro/Default/hydro_io.h b/src/hydro/Default/hydro_io.h
index 958bf5a1869718b57678246ff3b1985e54145824..0e9ad46ddc1d4e8c8d3ffdbf3e81262ec49a7092 100644
--- a/src/hydro/Default/hydro_io.h
+++ b/src/hydro/Default/hydro_io.h
@@ -56,6 +56,8 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unsued in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ * the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param N The number of particles on that MPI rank.
  * @param N_total The total number of particles (only used in MPI mode)
@@ -67,26 +69,31 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void hydro_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int N, long long N_total,
-    int mpi_rank, long long offset, struct part* parts, struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile, int N,
+    long long N_total, int mpi_rank, long long offset, struct part* parts,
+    struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts,
-             N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts,
-             N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
-  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, a_hydro, us, UNIT_CONV_ACCELERATION);
-  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             N, 3, parts, N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, N, 1,
+             parts, N_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, u, us,
+             UNIT_CONV_ENERGY_PER_UNIT_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us,
+             UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Acceleration", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, a_hydro, us,
+             UNIT_CONV_ACCELERATION);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Density", FLOAT, N,
+             1, parts, N_total, mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
 }
 
 /**
diff --git a/src/hydro/Gadget2/hydro_io.h b/src/hydro/Gadget2/hydro_io.h
index 17c3d3013644c3572f3c26fc3e270b1c1bc465ed..c1c59dfa4980a2843e7e13bee4c964c9b254cae6 100644
--- a/src/hydro/Gadget2/hydro_io.h
+++ b/src/hydro/Gadget2/hydro_io.h
@@ -56,6 +56,8 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unsued in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ * the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param N The number of particles on that MPI rank.
  * @param N_total The total number of particles (only used in MPI mode)
@@ -67,27 +69,31 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void hydro_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int N, long long N_total,
-    int mpi_rank, long long offset, struct part* parts, struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile, int N,
+    long long N_total, int mpi_rank, long long offset, struct part* parts,
+    struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts,
-             N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, entropy, us,
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             N, 3, parts, N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, N, 1,
+             parts, N_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, entropy, us,
              UNIT_CONV_ENTROPY_PER_UNIT_MASS);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts,
-             N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
-  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, a_hydro, us, UNIT_CONV_ACCELERATION);
-  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us,
+             UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Acceleration", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, a_hydro, us,
+             UNIT_CONV_ACCELERATION);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Density", FLOAT, N,
+             1, parts, N_total, mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
 }
 
 /**
diff --git a/src/hydro/Minimal/hydro_io.h b/src/hydro/Minimal/hydro_io.h
index 2c56fb489ab84ca7c30426b54cf95e26e3821084..afe5de83f423e43b4d2480cca1ac3e84d6c549de 100644
--- a/src/hydro/Minimal/hydro_io.h
+++ b/src/hydro/Minimal/hydro_io.h
@@ -56,6 +56,8 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unsued in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ * the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param N The number of particles on that MPI rank.
  * @param N_total The total number of particles (only used in MPI mode)
@@ -67,26 +69,31 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void hydro_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int N, long long N_total,
-    int mpi_rank, long long offset, struct part* parts, struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile, int N,
+    long long N_total, int mpi_rank, long long offset, struct part* parts,
+    struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts,
-             N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts,
-             N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
-  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, a_hydro, us, UNIT_CONV_ACCELERATION);
-  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             N, 3, parts, N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, N, 1,
+             parts, N_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, u, us,
+             UNIT_CONV_ENERGY_PER_UNIT_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us,
+             UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Acceleration", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, a_hydro, us,
+             UNIT_CONV_ACCELERATION);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Density", FLOAT, N,
+             1, parts, N_total, mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
 }
 
 /**
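
Note on the I/O refactoring above: the per-type writer calls now thread the HDF5 particle-group name down to the XMF writer, and (in the back-ends further below) the field pointer is paired with sizeof(part[0]) instead of a hard-coded sizeof(struct part), so the same copy loop can serialize struct part and struct gpart alike. The following stand-alone sketch illustrates that stride-based gather; the struct and function names (demo_part, gather_field) are hypothetical and only meant to show the pattern, not SWIFT code.

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical particle type used only for this illustration. */
    struct demo_part {
      double x[3];
      float mass;
      long long id;
    };

    /* Gather one field out of an array of structs into a contiguous buffer,
     * given only a pointer to the first occurrence of the field and the
     * stride (size of one struct). */
    static void gather_field(char* first_field, size_t partSize,
                             size_t copySize, int N, char* out) {
      for (int i = 0; i < N; ++i)
        memcpy(out + i * copySize, first_field + i * partSize, copySize);
    }

    int main(void) {
      struct demo_part parts[3] = {{{0., 0., 0.}, 1.f, 1},
                                   {{1., 1., 1.}, 2.f, 2},
                                   {{2., 2., 2.}, 3.f, 3}};
      float masses[3];

      /* Same pattern as the writeArray macro: field pointer + sizeof(part[0]). */
      gather_field((char*)(&parts[0].mass), sizeof(parts[0]), sizeof(float), 3,
                   (char*)masses);

      for (int i = 0; i < 3; ++i) printf("%g\n", masses[i]);
      return 0;
    }
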
diff --git a/src/parallel_io.c b/src/parallel_io.c
index cffa99a0fd75566ec3e850076d15e104504eeb40..0076c225e1c5361287280f8a567c8062aefd914e 100644
--- a/src/parallel_io.c
+++ b/src/parallel_io.c
@@ -178,9 +178,10 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
  *
  * Calls #error() if an error occurs.
  */
-void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                       enum DATA_TYPE type, int N, int dim, long long N_total,
-                       int mpi_rank, long long offset, char* part_c,
+void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
+                       char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                       int N, int dim, long long N_total, int mpi_rank,
+                       long long offset, char* part_c, size_t partSize,
                        struct UnitSystem* us,
                        enum UnitConversionFactor convFactor) {
   hid_t h_data = 0, h_err = 0, h_memspace = 0, h_filespace = 0, h_plist_id = 0;
@@ -189,7 +190,6 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
-  const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
   char buffer[150];
 
@@ -269,7 +269,9 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   }
 
   /* Write XMF description for this data set */
-  if (mpi_rank == 0) writeXMFline(xmfFile, fileName, name, N_total, dim, type);
+  if (mpi_rank == 0)
+    writeXMFline(xmfFile, fileName, partTypeGroupName, name, N_total, dim,
+                 type);
 
   /* Write unit conversion factors for this data set */
   conversionString(buffer, us, convFactor);
@@ -328,14 +330,16 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, \
-                   mpi_rank, offset, field, us, convFactor)                   \
-  writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, N_total,      \
-                    mpi_rank, offset, (char*)(&(part[0]).field), us,          \
-                    convFactor)
+#define writeArray(grp, fileName, xmfFile, pTypeGroupName, name, type, N, dim, \
+                   part, N_total, mpi_rank, offset, field, us, convFactor)     \
+  writeArrayBackEnd(grp, fileName, xmfFile, pTypeGroupName, name, type, N,     \
+                    dim, N_total, mpi_rank, offset, (char*)(&(part[0]).field), \
+                    sizeof(part[0]), us, convFactor)
 
 /* Import the right hydro definition */
 #include "hydro_io.h"
+/* Import the right gravity definition */
+#include "gravity_io.h"
 
 /**
  * @brief Reads an HDF5 initial condition file (GADGET-3 type) in parallel
@@ -357,16 +361,17 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  *
  */
 void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
-                      size_t* N, int* periodic, int mpi_rank, int mpi_size,
-                      MPI_Comm comm, MPI_Info info) {
+                      struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
+                      int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                      MPI_Info info) {
   hid_t h_file = 0, h_grp = 0;
-  double boxSize[3] = {
-      0.0, -1.0, -1.0}; /* GADGET has only cubic boxes (in cosmological mode) */
-  int numParticles[6] = {
-      0}; /* GADGET has 6 particle types. We only keep the type 0*/
-  int numParticles_highWord[6] = {0};
-  long long offset = 0;
-  long long N_total = 0;
+  /* GADGET has only cubic boxes (in cosmological mode) */
+  double boxSize[3] = {0.0, -1.0, -1.0};
+  int numParticles[NUM_PARTICLE_TYPES] = {0};
+  int numParticles_highWord[NUM_PARTICLE_TYPES] = {0};
+  size_t N[NUM_PARTICLE_TYPES] = {0};
+  long long N_total[NUM_PARTICLE_TYPES] = {0};
+  long long offset[NUM_PARTICLE_TYPES] = {0};
 
   /* Open file */
   /* message("Opening file '%s' as IC.", fileName); */
@@ -398,58 +403,116 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
   readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
   readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
 
-  N_total = ((long long)numParticles[0]) +
-            ((long long)numParticles_highWord[0] << 32);
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+    N_total[ptype] = ((long long)numParticles[ptype]) +
+                     ((long long)numParticles_highWord[ptype] << 32);
+
   dim[0] = boxSize[0];
   dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1];
   dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2];
 
-  /* message("Found %d particles in a %speriodic box of size [%f %f %f].",  */
-  /* 	 N_total, (periodic ? "": "non-"), dim[0], dim[1], dim[2]); */
+  /* message("Found %d particles in a %speriodic box of size [%f %f %f].", */
+  /*         N_total, (periodic ? "": "non-"), dim[0], dim[1], dim[2]); */
 
   /* Divide the particles among the tasks. */
-  offset = mpi_rank * N_total / mpi_size;
-  *N = (mpi_rank + 1) * N_total / mpi_size - offset;
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+    offset[ptype] = mpi_rank * N_total[ptype] / mpi_size;
+    N[ptype] = (mpi_rank + 1) * N_total[ptype] / mpi_size - offset[ptype];
+  }
 
   /* Close header */
   H5Gclose(h_grp);
 
-  /* Allocate memory to store particles */
-  if (posix_memalign((void*)parts, part_align, *N * sizeof(struct part)) != 0)
+  /* Allocate memory to store SPH particles */
+  *Ngas = N[0];
+  if (posix_memalign((void*)parts, part_align, (*Ngas) * sizeof(struct part)) !=
+      0)
     error("Error while allocating memory for particles");
-  bzero(*parts, *N * sizeof(struct part));
+  bzero(*parts, *Ngas * sizeof(struct part));
 
-  /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) /
+  /* Allocate memory to store all particles */
+  const size_t Ndm = N[1];
+  *Ngparts = N[1] + N[0];
+  if (posix_memalign((void*)gparts, gpart_align,
+                     *Ngparts * sizeof(struct gpart)) != 0)
+    error(
+        "Error while allocating memory for gravity "
+        "particles");
+  bzero(*gparts, *Ngparts * sizeof(struct gpart));
+
+  /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) /
    * (1024.*1024.)); */
 
-  /* Open SPH particles group */
-  /* message("Reading particle arrays..."); */
-  h_grp = H5Gopen(h_file, "/PartType0", H5P_DEFAULT);
-  if (h_grp < 0) error("Error while opening particle group.\n");
+  /* message("BoxSize = %lf", dim[0]); */
+  /* message("NumPart = [%zd, %zd] Total = %zd", *Ngas, Ndm, *Ngparts); */
 
-  /* Read particle fields into the particle structure */
-  hydro_read_particles(h_grp, *N, N_total, offset, *parts);
+  /* Loop over all particle types */
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
 
-  /* Close particle group */
-  H5Gclose(h_grp);
+    /* Don't do anything if no particle of this kind */
+    if (N_total[ptype] == 0) continue;
+
+    /* Open the particle group in the file */
+    char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+    snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+             ptype);
+    h_grp = H5Gopen(h_file, partTypeGroupName, H5P_DEFAULT);
+    if (h_grp < 0) {
+      error("Error while opening particle group %s.", partTypeGroupName);
+    }
+
+    /* Read particle fields into the particle structure */
+    switch (ptype) {
+
+      case GAS:
+        hydro_read_particles(h_grp, N[ptype], N_total[ptype], offset[ptype],
+                             *parts);
+        break;
+
+      case DM:
+        darkmatter_read_particles(h_grp, N[ptype], N_total[ptype],
+                                  offset[ptype], *gparts);
+        break;
+
+      default:
+        error("Particle Type %d not yet supported. Aborting", ptype);
+    }
+
+    /* Close particle group */
+    H5Gclose(h_grp);
+  }
+
+  /* Prepare the DM particles */
+  prepare_dm_gparts(*gparts, Ndm);
+
+  /* Now duplicate the hydro particle into gparts */
+  duplicate_hydro_gparts(*parts, *gparts, *Ngas, Ndm);
+
+  /* message("Done Reading particles..."); */
 
   /* Close property handler */
   H5Pclose(h_plist_id);
 
   /* Close file */
   H5Fclose(h_file);
-
-  /* message("Done Reading particles..."); */
 }
 
 /**
- * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor
+ * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor
  *
  * @param e The engine containing all the system.
- * @param us The UnitSystem used for the conversion of units in the output
+ * @param us The UnitSystem used for the conversion of units in the output
  *
- * Creates an HDF5 output file and writes the particles contained
- * in the engine. If such a file already exists, it is erased and replaced
+ * Creates an HDF5 output file and writes the particles contained
+ * in the engine. If such a file already exists, it is erased and replaced
  * by the new one.
  * The companion XMF file is also updated accordingly.
  *
@@ -459,23 +522,27 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
 void write_output_parallel(struct engine* e, struct UnitSystem* us,
                            int mpi_rank, int mpi_size, MPI_Comm comm,
                            MPI_Info info) {
-
   hid_t h_file = 0, h_grp = 0, h_grpsph = 0;
-  int N = e->s->nr_parts;
+  const size_t Ngas = e->s->nr_parts;
+  const size_t Ntot = e->s->nr_gparts;
   int periodic = e->s->periodic;
-  unsigned int numParticles[6] = {N, 0};
-  unsigned int numParticlesHighWord[6] = {0};
-  unsigned int flagEntropy[6] = {0};
-  long long N_total = 0, offset = 0;
-  double offset_d = 0., N_d = 0., N_total_d = 0.;
   int numFiles = 1;
   struct part* parts = e->s->parts;
-  FILE* xmfFile = 0;
+  struct gpart* gparts = e->s->gparts;
+  struct gpart* dmparts = NULL;
   static int outputCount = 0;
+  FILE* xmfFile = 0;
+
+  /* Number of particles of each type */
+  // const size_t Ndm = Ntot - Ngas;
+
+  /* MATTHIEU: Temporary fix to preserve master */
+  const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
+  /* MATTHIEU: End temporary fix */
 
   /* File name */
-  char fileName[200];
-  sprintf(fileName, "output_%03i.hdf5", outputCount);
+  char fileName[FILENAME_BUFFER_SIZE];
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "output_%03i.hdf5", outputCount);
 
   /* First time, we need to create the XMF file */
   if (outputCount == 0 && mpi_rank == 0) createXMFfile();
@@ -491,21 +558,26 @@ void write_output_parallel(struct engine* e, struct UnitSystem* us,
     error("Error while opening file '%s'.", fileName);
   }
 
-  /* Compute offset in the file and total number of particles */
-  /* Done using double to allow for up to 2^50=10^15 particles */
-  N_d = (double)N;
-  MPI_Exscan(&N_d, &offset_d, 1, MPI_DOUBLE, MPI_SUM, comm);
-  N_total_d = offset_d + N_d;
-  MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size - 1, comm);
-  if (N_total_d > 1.e15)
-    error(
-        "Error while computing the offset for parallel output: Simulation has "
-        "more than 10^15 particles.\n");
-  N_total = (long long)N_total_d;
-  offset = (long long)offset_d;
+  /* Compute offset in the file and total number of particles */
+  long long N[NUM_PARTICLE_TYPES] = {Ngas, Ndm, 0};
+  long long N_total[NUM_PARTICLE_TYPES] = {0};
+  long long offset[NUM_PARTICLE_TYPES] = {0};
+  MPI_Exscan(&N, &offset, NUM_PARTICLE_TYPES, MPI_LONG_LONG, MPI_SUM, comm);
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+    N_total[ptype] = offset[ptype] + N[ptype];
+
+  /* The last rank now has the correct N_total. Let's
+   * broadcast from there */
+  MPI_Bcast(&N_total, NUM_PARTICLE_TYPES, MPI_LONG_LONG, mpi_size - 1, comm);
 
-  /* Write the part of the XMF file corresponding to this specific output */
-  if (mpi_rank == 0) writeXMFheader(xmfFile, N_total, fileName, e->time);
+  /* Now everybody knows its offset and the total number of particles
+   * of each type */
+
+  /* Write the part of the XMF file corresponding to this
+   * specific output */
+  if (mpi_rank == 0) writeXMFoutputheader(xmfFile, fileName, e->time);
 
   /* Open header to write simulation properties */
   /* message("Writing runtime parameters..."); */
@@ -526,19 +598,28 @@ void write_output_parallel(struct engine* e, struct UnitSystem* us,
 
   /* Print the relevant information and print status */
   writeAttribute(h_grp, "BoxSize", DOUBLE, e->s->dim, 3);
-  writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles, 6);
   double dblTime = e->time;
   writeAttribute(h_grp, "Time", DOUBLE, &dblTime, 1);
 
   /* GADGET-2 legacy values */
-  numParticles[0] = (unsigned int)N_total;
-  writeAttribute(h_grp, "NumPart_Total", UINT, numParticles, 6);
-  numParticlesHighWord[0] = (unsigned int)(N_total >> 32);
+  /* Number of particles of each type */
+  unsigned int numParticles[NUM_PARTICLE_TYPES] = {0};
+  unsigned int numParticlesHighWord[NUM_PARTICLE_TYPES] = {0};
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+    numParticles[ptype] = (unsigned int)N_total[ptype];
+    numParticlesHighWord[ptype] = (unsigned int)(N_total[ptype] >> 32);
+  }
+  writeAttribute(h_grp, "NumPart_ThisFile", LONGLONG, N_total,
+                 NUM_PARTICLE_TYPES);
+  writeAttribute(h_grp, "NumPart_Total", UINT, numParticles,
+                 NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord,
-                 6);
+                 NUM_PARTICLE_TYPES);
   double MassTable[6] = {0., 0., 0., 0., 0., 0.};
-  writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, 6);
-  writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, 6);
+  writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES);
+  unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0};
+  writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy,
+                 NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1);
 
   /* Close header */
@@ -556,21 +637,71 @@ void write_output_parallel(struct engine* e, struct UnitSystem* us,
   /* Print the system of Units */
   writeUnitSystem(h_file, us);
 
-  /* Create SPH particles group */
-  /* message("Writing particle arrays..."); */
-  h_grp =
-      H5Gcreate(h_file, "/PartType0", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
-  if (h_grp < 0) error("Error while creating particle group.\n");
+  /* Loop over all particle types */
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
+
+    /* Don't do anything if no particle of this kind */
+    if (N_total[ptype] == 0) continue;
+
+    /* Add the global information for that particle type to
+     * the XMF meta-file */
+    if (mpi_rank == 0)
+      writeXMFgroupheader(xmfFile, fileName, N_total[ptype], ptype);
+
+    /* Open the particle group in the file */
+    char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+    snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+             ptype);
+    h_grp = H5Gcreate(h_file, partTypeGroupName, H5P_DEFAULT, H5P_DEFAULT,
+                      H5P_DEFAULT);
+    if (h_grp < 0) {
+      error("Error while opening particle group %s.", partTypeGroupName);
+    }
 
-  /* Write particle fields from the particle structure */
-  hydro_write_particles(h_grp, fileName, xmfFile, N, N_total, mpi_rank, offset,
-                        parts, us);
+    /* Write particle fields from the particle structure */
+    switch (ptype) {
 
-  /* Close particle group */
-  H5Gclose(h_grp);
+      case GAS:
+        hydro_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                              N[ptype], N_total[ptype], mpi_rank, offset[ptype],
+                              parts, us);
+
+        break;
+
+      case DM:
+        /* Allocate temporary array */
+        if (posix_memalign((void*)&dmparts, gpart_align,
+                           Ndm * sizeof(struct gpart)) != 0)
+          error(
+              "Error while allocating temporary memory for "
+              "DM particles");
+        bzero(dmparts, Ndm * sizeof(struct gpart));
+
+        /* Collect the DM particles from gpart */
+        collect_dm_gparts(gparts, Ntot, dmparts, Ndm);
+
+        /* Write DM particles */
+        darkmatter_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                                   N[ptype], N_total[ptype], mpi_rank,
+                                   offset[ptype], dmparts, us);
+
+        /* Free temporary array */
+        free(dmparts);
+        break;
+
+      default:
+        error("Particle Type %d not yet supported. Aborting", ptype);
+    }
+
+    /* Close particle group */
+    H5Gclose(h_grp);
+
+    /* Close this particle group in the XMF file as well */
+    if (mpi_rank == 0) writeXMFgroupfooter(xmfFile, ptype);
+  }
 
   /* Write XMF file descriptor */
-  if (mpi_rank == 0) writeXMFfooter(xmfFile);
+  if (mpi_rank == 0) writeXMFoutputfooter(xmfFile, outputCount, e->time);
 
   /* message("Done writing particles..."); */
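
The offset computation in write_output_parallel above replaces the old double-precision workaround with an exclusive scan over the per-type counts: each rank's offset is the sum of the counts held by lower ranks, and the last rank's offset plus its own count is the global total, which is then broadcast from that rank. A minimal stand-alone sketch of the pattern with plain MPI; the type count and the dummy particle numbers are illustrative, not SWIFT's data structures.

    #include <mpi.h>
    #include <stdio.h>

    #define NTYPES 6

    int main(int argc, char* argv[]) {
      MPI_Init(&argc, &argv);

      int rank = 0, size = 0;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &size);

      /* Per-type particle counts on this rank (dummy values for the sketch). */
      long long N[NTYPES] = {1000 + rank, 500, 0, 0, 0, 0};
      long long offset[NTYPES] = {0};
      long long N_total[NTYPES] = {0};

      /* Exclusive scan: offset[t] = sum of N[t] over all lower ranks. */
      MPI_Exscan(N, offset, NTYPES, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);

      /* The MPI standard leaves rank 0's receive buffer undefined; reset it. */
      if (rank == 0)
        for (int t = 0; t < NTYPES; ++t) offset[t] = 0;

      /* The last rank holds offset + N = global total; broadcast from there. */
      for (int t = 0; t < NTYPES; ++t) N_total[t] = offset[t] + N[t];
      MPI_Bcast(N_total, NTYPES, MPI_LONG_LONG, size - 1, MPI_COMM_WORLD);

      printf("Rank %d: offset[0]=%lld, total[0]=%lld\n", rank, offset[0],
             N_total[0]);

      MPI_Finalize();
      return 0;
    }
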
 
diff --git a/src/parallel_io.h b/src/parallel_io.h
index a0589944ec845c712abde1e64e305980748db0e7..663f0aabac44c08682b964512839b925673ea5c5 100644
--- a/src/parallel_io.h
+++ b/src/parallel_io.h
@@ -32,8 +32,9 @@
 #if defined(HAVE_HDF5) && defined(WITH_MPI) && defined(HAVE_PARALLEL_HDF5)
 
 void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
-                      size_t* N, int* periodic, int mpi_rank, int mpi_size,
-                      MPI_Comm comm, MPI_Info info);
+                      struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
+                      int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                      MPI_Info info);
 
 void write_output_parallel(struct engine* e, struct UnitSystem* us,
                            int mpi_rank, int mpi_size, MPI_Comm comm,
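
With the prototype above, the IC reader now hands back the gas particles and the gravity particles as two separate arrays. Below is a hedged sketch of how a caller might invoke the updated function; the wrapper, variable names and IC file name are illustrative, and it assumes an already-initialised MPI environment and a build with parallel HDF5 that provides read_ic_parallel with exactly this prototype.

    #include <mpi.h>
    #include <stddef.h>

    struct part;  /* opaque here; defined in SWIFT's part.h */
    struct gpart;

    void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
                          struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
                          int* periodic, int mpi_rank, int mpi_size,
                          MPI_Comm comm, MPI_Info info);

    void load_ics_example(int mpi_rank, int mpi_size) {
      char icFileName[] = "initial_conditions.hdf5"; /* illustrative name */
      struct part* parts = NULL;
      struct gpart* gparts = NULL;
      size_t Ngas = 0, Ngparts = 0;
      int periodic = 0;
      double dim[3] = {0., 0., 0.};

      read_ic_parallel(icFileName, dim, &parts, &gparts, &Ngas, &Ngparts,
                       &periodic, mpi_rank, mpi_size, MPI_COMM_WORLD,
                       MPI_INFO_NULL);

      /* parts now holds the Ngas gas particles; gparts holds all Ngparts
       * gravity particles (DM first, then entries duplicated from the gas). */
    }
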
diff --git a/src/partition.c b/src/partition.c
index 0f8eb3ebe334d71228510307dd9ccc4e56e234b3..ea25bc132dacf19b7a5c12765d2a39313fc01486 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -424,7 +424,7 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
    * assume the same graph structure as used in the part_ calls). */
   int nr_cells = s->nr_cells;
   struct cell *cells = s->cells;
-  float wscale = 1e-3, vscale = 1e-3, wscale_buff;
+  float wscale = 1e-3, vscale = 1e-3, wscale_buff = 0.0;
   int wtot = 0;
   int wmax = 1e9 / nr_nodes;
   int wmin;
diff --git a/src/proxy.c b/src/proxy.c
index 7d2e546bf945ca18c2195ea2801d1b2058cb2f58..f58847988946c347367a0f172048f8cc96f26914 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -50,11 +50,9 @@ void proxy_cells_exch1(struct proxy *p) {
 
 #ifdef WITH_MPI
 
-  int k, ind;
-
   /* Get the number of pcells we will need to send. */
   p->size_pcells_out = 0;
-  for (k = 0; k < p->nr_cells_out; k++)
+  for (int k = 0; k < p->nr_cells_out; k++)
     p->size_pcells_out += p->cells_out[k]->pcell_size;
 
   /* Send the number of pcells. */
@@ -70,7 +68,7 @@ void proxy_cells_exch1(struct proxy *p) {
   if ((p->pcells_out = malloc(sizeof(struct pcell) * p->size_pcells_out)) ==
       NULL)
     error("Failed to allocate pcell_out buffer.");
-  for (ind = 0, k = 0; k < p->nr_cells_out; k++) {
+  for (int ind = 0, k = 0; k < p->nr_cells_out; k++) {
     memcpy(&p->pcells_out[ind], p->cells_out[k]->pcell,
            sizeof(struct pcell) * p->cells_out[k]->pcell_size);
     ind += p->cells_out[k]->pcell_size;
@@ -131,16 +129,14 @@ void proxy_cells_exch2(struct proxy *p) {
 
 void proxy_addcell_in(struct proxy *p, struct cell *c) {
 
-  int k;
-  struct cell **temp;
-
   /* Check if the cell is already registered with the proxy. */
-  for (k = 0; k < p->nr_cells_in; k++)
+  for (int k = 0; k < p->nr_cells_in; k++)
     if (p->cells_in[k] == c) return;
 
   /* Do we need to grow the number of in cells? */
   if (p->nr_cells_in == p->size_cells_in) {
     p->size_cells_in *= proxy_buffgrow;
+    struct cell **temp;
     if ((temp = malloc(sizeof(struct cell *) * p->size_cells_in)) == NULL)
       error("Failed to allocate incoming cell list.");
     memcpy(temp, p->cells_in, sizeof(struct cell *) * p->nr_cells_in);
@@ -162,16 +158,14 @@ void proxy_addcell_in(struct proxy *p, struct cell *c) {
 
 void proxy_addcell_out(struct proxy *p, struct cell *c) {
 
-  int k;
-  struct cell **temp;
-
   /* Check if the cell is already registered with the proxy. */
-  for (k = 0; k < p->nr_cells_out; k++)
+  for (int k = 0; k < p->nr_cells_out; k++)
     if (p->cells_out[k] == c) return;
 
   /* Do we need to grow the number of out cells? */
   if (p->nr_cells_out == p->size_cells_out) {
     p->size_cells_out *= proxy_buffgrow;
+    struct cell **temp;
     if ((temp = malloc(sizeof(struct cell *) * p->size_cells_out)) == NULL)
       error("Failed to allocate outgoing cell list.");
     memcpy(temp, p->cells_out, sizeof(struct cell *) * p->nr_cells_out);
@@ -278,8 +272,8 @@ void proxy_parts_exch2(struct proxy *p) {
  * @param N The number of parts.
  */
 
-void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts,
-                      int N) {
+void proxy_parts_load(struct proxy *p, const struct part *parts,
+                      const struct xpart *xparts, int N) {
 
   /* Is there enough space in the buffer? */
   if (p->nr_parts_out + N > p->size_parts_out) {
diff --git a/src/proxy.h b/src/proxy.h
index 3cd33e0f0819ee1ecac53213630445b39c809dea..1a0272dd6c948df0130f733bb573477eeff1d0db 100644
--- a/src/proxy.h
+++ b/src/proxy.h
@@ -68,8 +68,8 @@ struct proxy {
 
 /* Function prototypes. */
 void proxy_init(struct proxy *p, int mynodeID, int nodeID);
-void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts,
-                      int N);
+void proxy_parts_load(struct proxy *p, const struct part *parts,
+                      const struct xpart *xparts, int N);
 void proxy_parts_exch1(struct proxy *p);
 void proxy_parts_exch2(struct proxy *p);
 void proxy_addcell_in(struct proxy *p, struct cell *c);
diff --git a/src/queue.c b/src/queue.c
index a7321155100df9225526c2f19fac2b99531307e4..6b788d7376ba4bdc95f1b1d918ab52a9514e7b4a 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -136,9 +136,6 @@ struct task *queue_gettask(struct queue *q, const struct task *prev,
   lock_type *qlock = &q->lock;
   struct task *res = NULL;
 
-  /* If there are no tasks, leave immediately. */
-  if (q->count == 0) return NULL;
-
   /* Grab the task lock. */
   if (blocking) {
     if (lock_lock(qlock) != 0) error("Locking the qlock failed.\n");
@@ -146,6 +143,12 @@ struct task *queue_gettask(struct queue *q, const struct task *prev,
     if (lock_trylock(qlock) != 0) return NULL;
   }
 
+  /* If there are no tasks, leave immediately. */
+  if (q->count == 0) {
+    lock_unlock_blind(qlock);
+    return NULL;
+  }
+
   /* Set some pointers we will use often. */
   int *qtid = q->tid;
   struct task *qtasks = q->tasks;
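
The queue change above moves the empty-queue test inside the critical section: the task count is shared state, so it is only meaningful while the lock is held, and the early-return path now releases the lock it has just acquired. A generic sketch of the same pattern with POSIX locks; the work_queue type and queue_pop function are illustrative, not SWIFT's lock_type or queue API.

    #include <pthread.h>
    #include <stddef.h>

    struct work_queue {
      pthread_mutex_t lock;
      int count;
      void** items;
    };

    /* Pop an item, or return NULL if the queue is empty. The emptiness test
     * happens under the lock, and every exit path releases the lock. */
    void* queue_pop(struct work_queue* q) {

      pthread_mutex_lock(&q->lock);

      if (q->count == 0) {
        pthread_mutex_unlock(&q->lock);
        return NULL;
      }

      void* item = q->items[--q->count];
      pthread_mutex_unlock(&q->lock);
      return item;
    }
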
diff --git a/src/runner.c b/src/runner.c
index 5a7f84c040011cf669be3b81405c88b6057750f3..b4045113bf2a7337a8dd3641cf386d630c759a40 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -664,7 +664,7 @@ void runner_dodrift(struct runner *r, struct cell *c, int timer) {
   const float ti_current = r->e->ti_current;
   struct part *restrict p, *restrict parts = c->parts;
   struct xpart *restrict xp, *restrict xparts = c->xparts;
-  float dx_max = 0.f, h_max = 0.f;
+  float dx_max = 0.f, dx2_max = 0.f, h_max = 0.f;
   float w;
 
   TIMER_TIC
@@ -709,16 +709,18 @@ void runner_dodrift(struct runner *r, struct cell *c, int timer) {
       /* Predict the values of the extra fields */
       hydro_predict_extra(p, xp, ti_old, ti_current, timeBase);
 
-      /* Compute motion since last cell construction */
-      const float dx =
-          sqrtf((p->x[0] - xp->x_old[0]) * (p->x[0] - xp->x_old[0]) +
-                (p->x[1] - xp->x_old[1]) * (p->x[1] - xp->x_old[1]) +
-                (p->x[2] - xp->x_old[2]) * (p->x[2] - xp->x_old[2]));
-      dx_max = fmaxf(dx_max, dx);
+      /* Compute (square of) motion since last cell construction */
+      const float dx2 = (p->x[0] - xp->x_old[0]) * (p->x[0] - xp->x_old[0]) +
+                        (p->x[1] - xp->x_old[1]) * (p->x[1] - xp->x_old[1]) +
+                        (p->x[2] - xp->x_old[2]) * (p->x[2] - xp->x_old[2]);
+      dx2_max = fmaxf(dx2_max, dx2);
 
       /* Maximal smoothing length */
       h_max = fmaxf(p->h, h_max);
     }
+
+    /* Now, get the maximal particle motion from its square */
+    dx_max = sqrtf(dx2_max);
   }
 
   /* Otherwise, aggregate data from children. */
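
The drift loop above now accumulates the maximum of the squared displacements and takes a single square root after the loop; the result is unchanged because sqrtf is monotonic on non-negative values, and one sqrtf per particle is saved. A tiny stand-alone illustration of the identity being exploited (generic data, not the SWIFT particle fields):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const float dx[4] = {0.1f, 0.5f, 0.3f, 0.2f};

      /* Old pattern: one sqrtf per element. */
      float max_a = 0.f;
      for (int i = 0; i < 4; ++i) max_a = fmaxf(max_a, sqrtf(dx[i] * dx[i]));

      /* New pattern: track the maximal square, take a single sqrtf at the
       * end. The results agree because sqrtf is monotonic on [0, inf). */
      float max2 = 0.f;
      for (int i = 0; i < 4; ++i) max2 = fmaxf(max2, dx[i] * dx[i]);
      const float max_b = sqrtf(max2);

      printf("%g %g\n", max_a, max_b);
      return 0;
    }
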
diff --git a/src/scheduler.c b/src/scheduler.c
index 58cfcb7aec7ffe994e396393e4d72b2196c8fff0..327c7b115520ea1ec9ac5d3e22e528ee5f99d572 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -95,32 +95,29 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta,
 
 void scheduler_splittasks(struct scheduler *s) {
 
-  int j, k, ind, sid, tid = 0, redo;
-  struct cell *ci, *cj;
-  double hi, hj, shift[3];
-  struct task *t, *t_old;
-  // float dt_step = s->dt_step;
-  int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0},
-                   {-1, -1, 11, 10, 5, 4, 2, 1},
-                   {-1, -1, -1, 12, 7, 6, 4, 3},
-                   {-1, -1, -1, -1, 8, 7, 5, 4},
-                   {-1, -1, -1, -1, -1, 12, 10, 9},
-                   {-1, -1, -1, -1, -1, -1, 11, 10},
-                   {-1, -1, -1, -1, -1, -1, -1, 12}};
-  float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.1897,
-                         0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.5788};
+  const int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0},
+                         {-1, -1, 11, 10, 5, 4, 2, 1},
+                         {-1, -1, -1, 12, 7, 6, 4, 3},
+                         {-1, -1, -1, -1, 8, 7, 5, 4},
+                         {-1, -1, -1, -1, -1, 12, 10, 9},
+                         {-1, -1, -1, -1, -1, -1, 11, 10},
+                         {-1, -1, -1, -1, -1, -1, -1, 12}};
+  const float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788,
+                               0.4025, 0.1897, 0.4025, 0.1897, 0.4025,
+                               0.5788, 0.4025, 0.5788};
 
   /* Loop through the tasks... */
-  redo = 0;
-  t_old = t = NULL;
+  int tid = 0, redo = 0;
+  struct task *t_old = NULL;
   while (1) {
 
     /* Get a pointer on the task. */
+    struct task *t = t_old;
     if (redo) {
       redo = 0;
-      t = t_old;
     } else {
-      if ((ind = atomic_inc(&tid)) < s->nr_tasks)
+      const int ind = atomic_inc(&tid);
+      if (ind < s->nr_tasks)
         t_old = t = &s->tasks[s->tasks_ind[ind]];
       else
         break;
@@ -163,7 +160,7 @@ void scheduler_splittasks(struct scheduler *s) {
     if (t->type == task_type_self) {
 
       /* Get a handle on the cell involved. */
-      ci = t->ci;
+      struct cell *ci = t->ci;
 
       /* Foreign task? */
       if (ci->nodeID != s->nodeID) {
@@ -189,18 +186,18 @@ void scheduler_splittasks(struct scheduler *s) {
           redo = 1;
 
           /* Add the self task. */
-          for (k = 0; ci->progeny[k] == NULL; k++)
-            ;
-          t->ci = ci->progeny[k];
-          for (k += 1; k < 8; k++)
+          int first_child = 0;
+          while (ci->progeny[first_child] == NULL) first_child++;
+          t->ci = ci->progeny[first_child];
+          for (int k = first_child + 1; k < 8; k++)
             if (ci->progeny[k] != NULL)
               scheduler_addtask(s, task_type_self, t->subtype, 0, 0,
                                 ci->progeny[k], NULL, 0);
 
           /* Make a task for each pair of progeny. */
-          for (j = 0; j < 8; j++)
+          for (int j = 0; j < 8; j++)
             if (ci->progeny[j] != NULL)
-              for (k = j + 1; k < 8; k++)
+              for (int k = j + 1; k < 8; k++)
                 if (ci->progeny[k] != NULL)
                   scheduler_addtask(s, task_type_pair, t->subtype, pts[j][k], 0,
                                     ci->progeny[j], ci->progeny[k], 0);
@@ -213,10 +210,10 @@ void scheduler_splittasks(struct scheduler *s) {
     else if (t->type == task_type_pair) {
 
       /* Get a handle on the cells involved. */
-      ci = t->ci;
-      cj = t->cj;
-      hi = ci->dmin;
-      hj = cj->dmin;
+      struct cell *ci = t->ci;
+      struct cell *cj = t->cj;
+      const double hi = ci->dmin;
+      const double hj = cj->dmin;
 
       /* Foreign task? */
       if (ci->nodeID != s->nodeID && cj->nodeID != s->nodeID) {
@@ -226,7 +223,8 @@ void scheduler_splittasks(struct scheduler *s) {
 
       /* Get the sort ID, use space_getsid and not t->flags
          to make sure we get ci and cj swapped if needed. */
-      sid = space_getsid(s->space, &ci, &cj, shift);
+      double shift[3];
+      int sid = space_getsid(s->space, &ci, &cj, shift);
 
       /* Should this task be split-up? */
       if (ci->split && cj->split &&
@@ -482,9 +480,9 @@ void scheduler_splittasks(struct scheduler *s) {
         /* Replace the current task. */
         t->type = task_type_none;
 
-        for (j = 0; j < 8; j++)
+        for (int j = 0; j < 8; j++)
           if (ci->progeny[j] != NULL)
-            for (k = 0; k < 8; k++)
+            for (int k = 0; k < 8; k++)
               if (cj->progeny[k] != NULL) {
                 t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
                                       ci->progeny[j], cj->progeny[k], 0);
@@ -523,8 +521,8 @@ void scheduler_splittasks(struct scheduler *s) {
     else if (t->type == task_type_grav_mm) {
 
       /* Get a handle on the cells involved. */
-      ci = t->ci;
-      cj = t->cj;
+      struct cell *ci = t->ci;
+      struct cell *cj = t->cj;
 
       /* Self-interaction? */
       if (cj == NULL) {
@@ -548,7 +546,7 @@ void scheduler_splittasks(struct scheduler *s) {
 
             /* Split this task into tasks on its progeny. */
             t->type = task_type_none;
-            for (j = 0; j < 8; j++)
+            for (int j = 0; j < 8; j++)
               if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) {
                 if (t->type == task_type_none) {
                   t->type = task_type_grav_mm;
@@ -557,7 +555,7 @@ void scheduler_splittasks(struct scheduler *s) {
                 } else
                   t = scheduler_addtask(s, task_type_grav_mm, task_subtype_none,
                                         0, 0, ci->progeny[j], NULL, 0);
-                for (k = j + 1; k < 8; k++)
+                for (int k = j + 1; k < 8; k++)
                   if (ci->progeny[k] != NULL && ci->progeny[k]->gcount > 0) {
                     if (t->type == task_type_none) {
                       t->type = task_type_grav_mm;
@@ -596,7 +594,7 @@ void scheduler_splittasks(struct scheduler *s) {
 
           /* Get the opening angle theta. */
           float dx[3], theta;
-          for (k = 0; k < 3; k++) {
+          for (int k = 0; k < 3; k++) {
             dx[k] = fabs(ci->loc[k] - cj->loc[k]);
             if (s->space->periodic && dx[k] > 0.5 * s->space->dim[k])
               dx[k] = -dx[k] + s->space->dim[k];
@@ -617,9 +615,9 @@ void scheduler_splittasks(struct scheduler *s) {
 
               /* Split this task into tasks on its progeny. */
               t->type = task_type_none;
-              for (j = 0; j < 8; j++)
+              for (int j = 0; j < 8; j++)
                 if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) {
-                  for (k = 0; k < 8; k++)
+                  for (int k = 0; k < 8; k++)
                     if (cj->progeny[k] != NULL && cj->progeny[k]->gcount > 0) {
                       if (t->type == task_type_none) {
                         t->type = task_type_grav_mm;
@@ -665,17 +663,14 @@ struct task *scheduler_addtask(struct scheduler *s, int type, int subtype,
                                int flags, int wait, struct cell *ci,
                                struct cell *cj, int tight) {
 
-  int ind;
-  struct task *t;
-
   /* Get the next free task. */
-  ind = atomic_inc(&s->tasks_next);
+  const int ind = atomic_inc(&s->tasks_next);
 
   /* Overflow? */
   if (ind >= s->size) error("Task list overflow.");
 
   /* Get a pointer to the new task. */
-  t = &s->tasks[ind];
+  struct task *t = &s->tasks[ind];
 
   /* Copy the data. */
   t->type = type;
@@ -770,24 +765,24 @@ void scheduler_set_unlocks(struct scheduler *s) {
 
 void scheduler_ranktasks(struct scheduler *s) {
 
-  int i, j = 0, k, temp, left = 0, rank;
-  struct task *t, *tasks = s->tasks;
-  int *tid = s->tasks_ind, nr_tasks = s->nr_tasks;
+  struct task *tasks = s->tasks;
+  int *tid = s->tasks_ind;
+  const int nr_tasks = s->nr_tasks;
 
   /* Run through the tasks and get all the waits right. */
-  for (i = 0, k = 0; k < nr_tasks; k++) {
+  for (int k = 0; k < nr_tasks; k++) {
     tid[k] = k;
-    for (j = 0; j < tasks[k].nr_unlock_tasks; j++)
+    for (int j = 0; j < tasks[k].nr_unlock_tasks; j++)
       tasks[k].unlock_tasks[j]->wait += 1;
   }
 
   /* Main loop. */
-  for (j = 0, rank = 0; left < nr_tasks; rank++) {
+  for (int j = 0, rank = 0, left = 0; left < nr_tasks; rank++) {
 
     /* Load the tids of tasks with no waits. */
-    for (k = left; k < nr_tasks; k++)
+    for (int k = left; k < nr_tasks; k++)
       if (tasks[tid[k]].wait == 0) {
-        temp = tid[j];
+        int temp = tid[j];
         tid[j] = tid[k];
         tid[k] = temp;
         j += 1;
@@ -797,15 +792,16 @@ void scheduler_ranktasks(struct scheduler *s) {
     if (j == left) error("Unsatisfiable task dependencies detected.");
 
     /* Unlock the next layer of tasks. */
-    for (i = left; i < j; i++) {
-      t = &tasks[tid[i]];
+    for (int i = left; i < j; i++) {
+      struct task *t = &tasks[tid[i]];
       t->rank = rank;
       tid[i] = t - tasks;
       if (tid[i] >= nr_tasks) error("Task index overshoot.");
       /* message( "task %i of type %s has rank %i." , i ,
           (t->type == task_type_self) ? "self" : (t->type == task_type_pair) ?
          "pair" : "sort" , rank ); */
-      for (k = 0; k < t->nr_unlock_tasks; k++) t->unlock_tasks[k]->wait -= 1;
+      for (int k = 0; k < t->nr_unlock_tasks; k++)
+        t->unlock_tasks[k]->wait -= 1;
     }
 
     /* The new left (no, not tony). */
@@ -827,8 +823,6 @@ void scheduler_ranktasks(struct scheduler *s) {
 
 void scheduler_reset(struct scheduler *s, int size) {
 
-  int k;
-
   /* Do we need to re-allocate? */
   if (size > s->size) {
 
@@ -855,7 +849,7 @@ void scheduler_reset(struct scheduler *s, int size) {
   s->nr_unlocks = 0;
 
   /* Set the task pointers in the queues. */
-  for (k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks;
+  for (int k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks;
 }
 
 /**
@@ -866,21 +860,23 @@ void scheduler_reset(struct scheduler *s, int size) {
 
 void scheduler_reweight(struct scheduler *s) {
 
-  int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
-  struct task *t, *tasks = s->tasks;
-  int nodeID = s->nodeID;
-  float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.1897,
-                         0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.5788};
-  float wscale = 0.001;
+  const int nr_tasks = s->nr_tasks;
+  int *tid = s->tasks_ind;
+  struct task *tasks = s->tasks;
+  const int nodeID = s->nodeID;
+  const float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788,
+                               0.4025, 0.1897, 0.4025, 0.1897, 0.4025,
+                               0.5788, 0.4025, 0.5788};
+  const float wscale = 0.001;
   // ticks tic;
 
   /* Run through the tasks backwards and set their waits and
      weights. */
   // tic = getticks();
-  for (k = nr_tasks - 1; k >= 0; k--) {
-    t = &tasks[tid[k]];
+  for (int k = nr_tasks - 1; k >= 0; k--) {
+    struct task *t = &tasks[tid[k]];
     t->weight = 0;
-    for (j = 0; j < t->nr_unlock_tasks; j++)
+    for (int j = 0; j < t->nr_unlock_tasks; j++)
       if (t->unlock_tasks[j]->weight > t->weight)
         t->weight = t->unlock_tasks[j]->weight;
     if (!t->implicit && t->tic > 0)
@@ -961,8 +957,9 @@ void scheduler_reweight(struct scheduler *s) {
 void scheduler_start(struct scheduler *s, unsigned int mask,
                      unsigned int submask) {
 
-  int nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
-  struct task *t, *tasks = s->tasks;
+  const int nr_tasks = s->nr_tasks;
+  int *tid = s->tasks_ind;
+  struct task *tasks = s->tasks;
   // ticks tic;
 
   /* Store the masks */
@@ -988,8 +985,7 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
   const int waiting_old = s->waiting;
 
   /* We are going to use the task structure in a modified way to pass
-     information
-     to the task. Don't do this at home !
+     information to the task. Don't do this at home !
      - ci and cj will give the range of tasks to which the waits will be applied
      - the flags will be used to transfer the mask
      - the rank will be used to transfer the submask
@@ -1014,6 +1010,7 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
 
   /* Wait for the rewait tasks to have executed. */
   pthread_mutex_lock(&s->sleep_mutex);
+  pthread_cond_broadcast(&s->sleep_cond);
   while (s->waiting > waiting_old) {
     pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
   }
@@ -1027,7 +1024,7 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
   /* Loop over the tasks and enqueue whoever is ready. */
   // tic = getticks();
   for (int k = 0; k < s->nr_tasks; k++) {
-    t = &tasks[tid[k]];
+    struct task *t = &tasks[tid[k]];
     if (atomic_dec(&t->wait) == 1 && ((1 << t->type) & s->mask) &&
         ((1 << t->subtype) & s->submask) && !t->skip) {
       scheduler_enqueue(s, t);
@@ -1035,6 +1032,11 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
     }
   }
 
+  /* To be safe, fire off one last sleep_cond broadcast under the lock. */
+  pthread_mutex_lock(&s->sleep_mutex);
+  pthread_cond_broadcast(&s->sleep_cond);
+  pthread_mutex_unlock(&s->sleep_mutex);
+
   // message( "enqueueing tasks took %.3f %s." ,
   // clocks_from_ticks( getticks() - tic ), clocks_getunit());
 }
@@ -1048,10 +1050,8 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
 
 void scheduler_enqueue(struct scheduler *s, struct task *t) {
 
+  /* The target queue for this task. */
   int qid = -1;
-#ifdef WITH_MPI
-  int err;
-#endif
 
   /* Fail if this task has already been enqueued before. */
   if (t->rid >= 0) error("Task has already been enqueued.");
@@ -1073,6 +1073,9 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
 
   /* Otherwise, look for a suitable queue. */
   else {
+#ifdef WITH_MPI
+    int err;
+#endif
 
     /* Find the previous owner for each task type, and do
        any pre-processing needed. */
@@ -1095,13 +1098,10 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         break;
       case task_type_recv:
 #ifdef WITH_MPI
-        if ((err = MPI_Irecv(t->ci->parts, t->ci->count, s->part_mpi_type,
-                             t->ci->nodeID, t->flags, MPI_COMM_WORLD,
-                             &t->req)) != MPI_SUCCESS) {
-          char buff[MPI_MAX_ERROR_STRING];
-          int len;
-          MPI_Error_string(err, buff, &len);
-          error("Failed to emit irecv for particle data (%s).", buff);
+        err = MPI_Irecv(t->ci->parts, t->ci->count, s->part_mpi_type,
+                        t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+        if (err != MPI_SUCCESS) {
+          mpi_error(err, "Failed to emit irecv for particle data.");
         }
         // message( "receiving %i parts with tag=%i from %i to %i." ,
         //     t->ci->count , t->flags , t->ci->nodeID , s->nodeID );
@@ -1113,13 +1113,10 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         break;
       case task_type_send:
 #ifdef WITH_MPI
-        if ((err = MPI_Isend(t->ci->parts, t->ci->count, s->part_mpi_type,
-                             t->cj->nodeID, t->flags, MPI_COMM_WORLD,
-                             &t->req)) != MPI_SUCCESS) {
-          char buff[MPI_MAX_ERROR_STRING];
-          int len;
-          MPI_Error_string(err, buff, &len);
-          error("Failed to emit isend for particle data (%s).", buff);
+        err = MPI_Isend(t->ci->parts, t->ci->count, s->part_mpi_type,
+                        t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+        if (err != MPI_SUCCESS) {
+          mpi_error(err, "Failed to emit isend for particle data.");
         }
         // message( "sending %i parts with tag=%i from %i to %i." ,
         //     t->ci->count , t->flags , s->nodeID , t->cj->nodeID );
@@ -1135,7 +1132,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
 
     if (qid >= s->nr_queues) error("Bad computed qid.");
 
-    /* If no previous owner, find the shortest queue. */
+    /* If no previous owner, pick a random queue. */
     if (qid < 0) qid = rand() % s->nr_queues;
 
     /* Increase the waiting counter. */
@@ -1166,7 +1163,7 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) {
   for (int k = 0; k < t->nr_unlock_tasks; k++) {
     struct task *t2 = t->unlock_tasks[k];
 
-    int res = atomic_dec(&t2->wait);
+    const int res = atomic_dec(&t2->wait);
     if (res < 1) {
       error("Negative wait!");
     } else if (res == 1) {
@@ -1205,7 +1202,7 @@ struct task *scheduler_unlock(struct scheduler *s, struct task *t) {
      they are ready. */
   for (int k = 0; k < t->nr_unlock_tasks; k++) {
     struct task *t2 = t->unlock_tasks[k];
-    int res = atomic_dec(&t2->wait);
+    const int res = atomic_dec(&t2->wait);
     if (res < 1) {
       error("Negative wait!");
     } else if (res == 1) {
@@ -1242,7 +1239,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
                                const struct task *prev) {
 
   struct task *res = NULL;
-  int k, nr_queues = s->nr_queues;
+  const int nr_queues = s->nr_queues;
   unsigned int seed = qid;
 
   /* Check qid. */
@@ -1266,10 +1263,10 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
       /* If unsuccessful, try stealing from the other queues. */
       if (s->flags & scheduler_flag_steal) {
         int count = 0, qids[nr_queues];
-        for (k = 0; k < nr_queues; k++)
+        for (int k = 0; k < nr_queues; k++)
           if (s->queues[k].count > 0) qids[count++] = k;
-        for (k = 0; k < scheduler_maxsteal && count > 0; k++) {
-          int ind = rand_r(&seed) % count;
+        for (int k = 0; k < scheduler_maxsteal && count > 0; k++) {
+          const int ind = rand_r(&seed) % count;
           TIMER_TIC
           res = queue_gettask(&s->queues[qids[ind]], prev, 0);
           TIMER_TOC(timer_qsteal);
@@ -1289,7 +1286,10 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
     if (res == NULL) {
 #endif
       pthread_mutex_lock(&s->sleep_mutex);
-      if (s->waiting > 0) pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
+      res = queue_gettask(&s->queues[qid], prev, 1);
+      if (res == NULL && s->waiting > 0) {
+        pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
+      }
       pthread_mutex_unlock(&s->sleep_mutex);
     }
   }
@@ -1368,7 +1368,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
  * @param s The #scheduler
  * @param fileName Name of the file to write to
  */
-void scheduler_print_tasks(struct scheduler *s, char *fileName) {
+void scheduler_print_tasks(const struct scheduler *s, const char *fileName) {
 
   const int nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
   struct task *t, *tasks = s->tasks;
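
Several of the scheduler changes above harden the sleep/wake path: scheduler_start now broadcasts sleep_cond while holding sleep_mutex (once before waiting on the rewait tasks and once after the enqueue loop), and scheduler_gettask re-polls its queue under the mutex before going to sleep, so a task enqueued between the unlocked poll and the wait can no longer be missed. The classic shape of that pattern, sketched with POSIX primitives; produce/consume and the pending counter are illustrative, not the SWIFT types.

    #include <pthread.h>

    static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int pending = 0; /* protected by mutex */

    /* Producer: publish work and wake sleepers while holding the mutex, so a
     * consumer cannot test `pending`, decide to sleep, and miss the wake-up. */
    void produce(void) {
      pthread_mutex_lock(&mutex);
      pending++;
      pthread_cond_broadcast(&cond);
      pthread_mutex_unlock(&mutex);
    }

    /* Consumer: re-check the predicate under the mutex and loop around the
     * wait to tolerate spurious wake-ups. */
    void consume(void) {
      pthread_mutex_lock(&mutex);
      while (pending == 0) pthread_cond_wait(&cond, &mutex);
      pending--;
      pthread_mutex_unlock(&mutex);
    }
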
diff --git a/src/scheduler.h b/src/scheduler.h
index 3f2d8c289d0d691d0d155b20ae0522c5830524aa..9c2a5bd7307817bd536930200ad08e7e37502e08 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -128,7 +128,7 @@ struct task *scheduler_unlock(struct scheduler *s, struct task *t);
 void scheduler_addunlock(struct scheduler *s, struct task *ta, struct task *tb);
 void scheduler_set_unlocks(struct scheduler *s);
 void scheduler_dump_queue(struct scheduler *s);
-void scheduler_print_tasks(struct scheduler *s, char *fileName);
+void scheduler_print_tasks(const struct scheduler *s, const char *fileName);
 void scheduler_do_rewait(struct task *t_begin, struct task *t_end,
                          unsigned int mask, unsigned int submask);
 
diff --git a/src/serial_io.c b/src/serial_io.c
index 8e63db5cfad3a3b50fc7e350bbac6ce09708230a..40bd2b1c8921f4acbfa0950984d6915ebd3d241e 100644
--- a/src/serial_io.c
+++ b/src/serial_io.c
@@ -57,18 +57,18 @@
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
  *interest in the parts array
+ * @param partSize The size in bytes of the particle structure.
  * @param importance If COMPULSORY, the data must be present in the IC file. If
  *OPTIONAL, the array will be zeroed when the data is not present.
  *
  * @todo A better version using HDF5 hyper-slabs to read the file directly into
  *the part array
  * will be written once the structures have been stabilized.
- *
- * Calls #error() if an error occurs.
  */
 void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
                       int dim, long long N_total, long long offset,
-                      char* part_c, enum DATA_IMPORTANCE importance) {
+                      char* part_c, size_t partSize,
+                      enum DATA_IMPORTANCE importance) {
   hid_t h_data = 0, h_err = 0, h_type = 0, h_memspace = 0, h_filespace = 0;
   hsize_t shape[2], offsets[2];
   htri_t exist = 0;
@@ -76,7 +76,6 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
   int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
-  const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
 
   /* Check whether the dataspace exists or not */
@@ -172,9 +171,10 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
  * Routines writing an output file
  *-----------------------------------------------------------------------------*/
 
-void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                  enum DATA_TYPE type, long long N_total, int dim,
-                  struct UnitSystem* us, enum UnitConversionFactor convFactor) {
+void prepareArray(hid_t grp, char* fileName, FILE* xmfFile,
+                  char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                  long long N_total, int dim, struct UnitSystem* us,
+                  enum UnitConversionFactor convFactor) {
   hid_t h_data = 0, h_err = 0, h_space = 0, h_prop = 0;
   int rank = 0;
   hsize_t shape[2];
@@ -234,7 +234,7 @@ void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   }
 
   /* Write XMF description for this data set */
-  writeXMFline(xmfFile, fileName, name, N_total, dim, type);
+  writeXMFline(xmfFile, fileName, partTypeGroupName, name, N_total, dim, type);
 
   /* Write unit conversion factors for this data set */
   conversionString(buffer, us, convFactor);
@@ -255,21 +255,22 @@ void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param grp The group in which to write.
  * @param fileName The name of the file in which the data is written
  * @param xmfFile The FILE used to write the XMF description
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param name The name of the array to write.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
  *interest in the parts array
+ * @param partSize The size in bytes of the particle structure.
  * @param us The UnitSystem currently in use
- * @param convFactor The UnitConversionFactor for this array
- *
- *
- * Calls #error() if an error occurs.
+ * @param convFactor The UnitConversionFactor for this array
  */
-void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                       enum DATA_TYPE type, int N, int dim, long long N_total,
-                       int mpi_rank, long long offset, char* part_c,
+void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
+                       char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                       int N, int dim, long long N_total, int mpi_rank,
+                       long long offset, char* part_c, size_t partSize,
                        struct UnitSystem* us,
                        enum UnitConversionFactor convFactor) {
 
@@ -279,15 +280,14 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
-  const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
 
   /* message("Writing '%s' array...", name); */
 
   /* Prepare the arrays in the file */
   if (mpi_rank == 0)
-    prepareArray(grp, fileName, xmfFile, name, type, N_total, dim, us,
-                 convFactor);
+    prepareArray(grp, fileName, xmfFile, partTypeGroupName, name, type, N_total,
+                 dim, us, convFactor);
 
   /* Allocate temporary buffer */
   temp = malloc(N * dim * sizeOfType(type));
@@ -362,7 +362,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
 #define readArray(grp, name, type, N, dim, part, N_total, offset, field, \
                   importance)                                            \
   readArrayBackEnd(grp, name, type, N, dim, N_total, offset,             \
-                   (char*)(&(part[0]).field), importance)
+                   (char*)(&(part[0]).field), sizeof(part[0]), importance)
 
 /**
  * @brief A helper macro to call the writeArrayBackEnd function more easily.
@@ -371,34 +371,47 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param fileName Unused parameter in non-MPI mode
  * @param xmfFile Unused parameter in non-MPI mode
  * @param name The name of the array to write.
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part A (char*) pointer on the first occurrence of the field of
- *interest
- *in the parts array
+ *interest in the parts array
+ * @param N_total Unused parameter in non-MPI mode
+ * @param mpi_rank Unused parameter in non-MPI mode
+ * @param offset Unused parameter in non-MPI mode
  * @param field The name (code name) of the field to read from.
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, \
-                   mpi_rank, offset, field, us, convFactor)                   \
-  writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, N_total,      \
-                    mpi_rank, offset, (char*)(&(part[0]).field), us,          \
-                    convFactor)
+#define writeArray(grp, fileName, xmfFile, partTypeGroupName, name, type, N,   \
+                   dim, part, N_total, mpi_rank, offset, field, us,            \
+                   convFactor)                                                 \
+  writeArrayBackEnd(grp, fileName, xmfFile, partTypeGroupName, name, type, N,  \
+                    dim, N_total, mpi_rank, offset, (char*)(&(part[0]).field), \
+                    sizeof(part[0]), us, convFactor)
 
 /* Import the right hydro definition */
 #include "hydro_io.h"
+/* Import the right gravity definition */
+#include "gravity_io.h"
 
 /**
  * @brief Reads an HDF5 initial condition file (GADGET-3 type)
  *
  * @param fileName The file to read.
  * @param dim (output) The dimension of the volume read from the file.
- * @param parts (output) The array of #part read from the file.
- * @param N (output) The number of particles read from the file.
+ * @param parts (output) The array of #part (gas particles) read from the file.
+ * @param gparts (output) The array of #gpart read from the file.
+ * @param Ngas (output) The number of #part read from the file on that node.
+ * @param Ngparts (output) The number of #gpart read from the file on that node.
  * @param periodic (output) 1 if the volume is periodic, 0 if not.
+ * @param mpi_rank The MPI rank of this node
+ * @param mpi_size The number of MPI ranks
+ * @param comm The MPI communicator
+ * @param info The MPI information object
  *
  * Opens the HDF5 file fileName and reads the particles contained
  * in the parts array. N is the returned number of particles found
@@ -411,17 +424,18 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  *
  */
 void read_ic_serial(char* fileName, double dim[3], struct part** parts,
-                    size_t* N, int* periodic, int mpi_rank, int mpi_size,
-                    MPI_Comm comm, MPI_Info info) {
+                    struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
+                    int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                    MPI_Info info) {
   hid_t h_file = 0, h_grp = 0;
-  double boxSize[3] = {0.0, -1.0, -1.0};
   /* GADGET has only cubic boxes (in cosmological mode) */
-  int numParticles[6] = {0};
-  /* GADGET has 6 particle types. We only keep the type 0*/
-  int numParticles_highWord[6] = {0};
-  long long offset = 0;
-  long long N_total = 0;
-  int rank;
+  double boxSize[3] = {0.0, -1.0, -1.0};
+  /* GADGET has 6 particle types. We only keep types 0 and 1 for now */
+  int numParticles[NUM_PARTICLE_TYPES] = {0};
+  int numParticles_highWord[NUM_PARTICLE_TYPES] = {0};
+  size_t N[NUM_PARTICLE_TYPES] = {0};
+  long long N_total[NUM_PARTICLE_TYPES] = {0};
+  long long offset[NUM_PARTICLE_TYPES] = {0};
 
   /* First read some information about the content */
   if (mpi_rank == 0) {
@@ -453,8 +467,10 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
     readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
     readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
 
-    N_total = ((long long)numParticles[0]) +
-              ((long long)numParticles_highWord[0] << 32);
+    for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+      N_total[ptype] = ((long long)numParticles[ptype]) +
+                       ((long long)numParticles_highWord[ptype] << 32);
+
     dim[0] = boxSize[0];
     dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1];
     dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2];
@@ -474,22 +490,38 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
 
   /* Now need to broadcast that information to all ranks. */
   MPI_Bcast(periodic, 1, MPI_INT, 0, comm);
-  MPI_Bcast(&N_total, 1, MPI_LONG_LONG, 0, comm);
+  MPI_Bcast(&N_total, NUM_PARTICLE_TYPES, MPI_LONG_LONG, 0, comm);
   MPI_Bcast(dim, 3, MPI_DOUBLE, 0, comm);
 
   /* Divide the particles among the tasks. */
-  offset = mpi_rank * N_total / mpi_size;
-  *N = (mpi_rank + 1) * N_total / mpi_size - offset;
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+    offset[ptype] = mpi_rank * N_total[ptype] / mpi_size;
+    N[ptype] = (mpi_rank + 1) * N_total[ptype] / mpi_size - offset[ptype];
+  }
 
-  /* Allocate memory to store particles */
-  if (posix_memalign((void*)parts, part_align, (*N) * sizeof(struct part)) != 0)
+  /* Allocate memory to store SPH particles */
+  *Ngas = N[0];
+  if (posix_memalign((void*)parts, part_align, (*Ngas) * sizeof(struct part)) !=
+      0)
     error("Error while allocating memory for particles");
-  bzero(*parts, *N * sizeof(struct part));
+  bzero(*parts, *Ngas * sizeof(struct part));
+
+  /* Allocate memory to store all particles */
+  const size_t Ndm = N[1];
+  *Ngparts = N[1] + N[0];
+  if (posix_memalign((void*)gparts, gpart_align,
+                     *Ngparts * sizeof(struct gpart)) != 0)
+    error("Error while allocating memory for gravity particles");
+  bzero(*gparts, *Ngparts * sizeof(struct gpart));
+
   /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) / */
   /* 	  (1024.*1024.)); */
 
+  /* message("BoxSize = %lf", dim[0]); */
+  /* message("NumPart = [%zd, %zd] Total = %zd", *Ngas, Ndm, *Ngparts); */
+
   /* Now loop over ranks and read the data */
-  for (rank = 0; rank < mpi_size; ++rank) {
+  for (int rank = 0; rank < mpi_size; ++rank) {
 
     /* Is it this rank's turn to read ? */
     if (rank == mpi_rank) {
@@ -498,17 +530,41 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
       if (h_file < 0)
         error("Error while opening file '%s' on rank %d.", fileName, mpi_rank);
 
-      /* Open SPH particles group */
-      /* message("Reading particle arrays..."); */
-      h_grp = H5Gopen(h_file, "/PartType0", H5P_DEFAULT);
-      if (h_grp < 0)
-        error("Error while opening particle group on rank %d.\n", mpi_rank);
+      /* Loop over all particle types */
+      for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
 
-      /* Read particle fields into the particle structure */
-      hydro_read_particles(h_grp, *N, N_total, offset, *parts);
+        /* Don't do anything if no particle of this kind */
+        if (N[ptype] == 0) continue;
 
-      /* Close particle group */
-      H5Gclose(h_grp);
+        /* Open the particle group in the file */
+        char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+        snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+                 ptype);
+        h_grp = H5Gopen(h_file, partTypeGroupName, H5P_DEFAULT);
+        if (h_grp < 0) {
+          error("Error while opening particle group %s.", partTypeGroupName);
+        }
+
+        /* Read particle fields into the particle structure */
+        switch (ptype) {
+
+          case GAS:
+            hydro_read_particles(h_grp, N[ptype], N_total[ptype], offset[ptype],
+                                 *parts);
+            break;
+
+          case DM:
+            darkmatter_read_particles(h_grp, N[ptype], N_total[ptype],
+                                      offset[ptype], *gparts);
+            break;
+
+          default:
+            error("Particle Type %d not yet supported. Aborting", ptype);
+        }
+
+        /* Close particle group */
+        H5Gclose(h_grp);
+      }
 
       /* Close file */
       H5Fclose(h_file);
@@ -518,6 +574,12 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
     MPI_Barrier(comm);
   }
 
+  /* Prepare the DM particles */
+  prepare_dm_gparts(*gparts, Ndm);
+
+  /* Now duplicate the hydro particles into gparts */
+  duplicate_hydro_gparts(*parts, *gparts, *Ngas, Ndm);
+
   /* message("Done Reading particles..."); */
 }
 
@@ -525,7 +587,11 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
  * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor
  *
  * @param e The engine containing all the system.
- * @param us The UnitSystem used for the conversion of units in the output
+ * @param us The UnitSystem used for the conversion of units in the output.
+ * @param mpi_rank The MPI rank of this node.
+ * @param mpi_size The number of MPI ranks.
+ * @param comm The MPI communicator.
+ * @param info The MPI information object.
  *
  * Creates an HDF5 output file and writes the particles contained
  * in the engine. If such a file already exists, it is erased and replaced
@@ -538,35 +604,40 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
 void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
                          int mpi_size, MPI_Comm comm, MPI_Info info) {
   hid_t h_file = 0, h_grp = 0, h_grpsph = 0;
-  int N = e->s->nr_parts;
+  const size_t Ngas = e->s->nr_parts;
+  const size_t Ntot = e->s->nr_gparts;
   int periodic = e->s->periodic;
-  int numParticles[6] = {N, 0};
-  int numParticlesHighWord[6] = {0};
-  unsigned int flagEntropy[6] = {0};
-  long long N_total = 0, offset = 0;
-  double offset_d = 0., N_d = 0., N_total_d = 0.;
   int numFiles = 1;
-  int rank = 0;
   struct part* parts = e->s->parts;
-  FILE* xmfFile = 0;
+  struct gpart* gparts = e->s->gparts;
+  struct gpart* dmparts = NULL;
   static int outputCount = 0;
+  FILE* xmfFile = 0;
+
+  /* Number of particles of each type */
+  // const size_t Ndm = Ntot - Ngas;
+
+  /* MATTHIEU: Temporary fix to preserve master */
+  const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
+  /* MATTHIEU: End temporary fix */
 
   /* File name */
-  char fileName[200];
-  sprintf(fileName, "output_%03i.hdf5", outputCount);
+  char fileName[FILENAME_BUFFER_SIZE];
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "output_%03i.hdf5", outputCount);
 
   /* Compute offset in the file and total number of particles */
-  /* Done using double to allow for up to 2^50=10^15 particles */
-  N_d = (double)N;
-  MPI_Exscan(&N_d, &offset_d, 1, MPI_DOUBLE, MPI_SUM, comm);
-  N_total_d = offset_d + N_d;
-  MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size - 1, comm);
-  if (N_total_d > 1.e15)
-    error(
-        "Error while computing the offset for parallel output: Simulation has "
-        "more than 10^15 particles.\n");
-  N_total = (long long)N_total_d;
-  offset = (long long)offset_d;
+  long long N[NUM_PARTICLE_TYPES] = {Ngas, Ndm, 0};
+  long long N_total[NUM_PARTICLE_TYPES] = {0};
+  long long offset[NUM_PARTICLE_TYPES] = {0};
+  MPI_Exscan(N, offset, NUM_PARTICLE_TYPES, MPI_LONG_LONG, MPI_SUM, comm);
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+    N_total[ptype] = offset[ptype] + N[ptype];
+
+  /* The last rank now has the correct N_total. Let's broadcast from there */
+  MPI_Bcast(&N_total, NUM_PARTICLE_TYPES, MPI_LONG_LONG, mpi_size - 1, comm);
+
+  /* Now everybody knows its offset and the total number of particles of each
+   * type */
 
   /* Do common stuff first */
   if (mpi_rank == 0) {
@@ -578,7 +649,7 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
     xmfFile = prepareXMFfile();
 
     /* Write the part corresponding to this specific output */
-    writeXMFheader(xmfFile, N_total, fileName, e->time);
+    writeXMFoutputheader(xmfFile, fileName, e->time);
 
     /* Open file */
     /* message("Opening file '%s'.", fileName); */
@@ -610,15 +681,24 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
     writeAttribute(h_grp, "Time", DOUBLE, &dblTime, 1);
 
     /* GADGET-2 legacy values */
-    numParticles[0] = (unsigned int)N_total;
-    writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles, 6);
-    writeAttribute(h_grp, "NumPart_Total", UINT, numParticles, 6);
-    numParticlesHighWord[0] = (unsigned int)(N_total >> 32);
+    /* Number of particles of each type */
+    unsigned int numParticles[NUM_PARTICLE_TYPES] = {0};
+    unsigned int numParticlesHighWord[NUM_PARTICLE_TYPES] = {0};
+    for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+      numParticles[ptype] = (unsigned int)N_total[ptype];
+      numParticlesHighWord[ptype] = (unsigned int)(N_total[ptype] >> 32);
+    }
+    writeAttribute(h_grp, "NumPart_ThisFile", LONGLONG, N_total,
+                   NUM_PARTICLE_TYPES);
+    writeAttribute(h_grp, "NumPart_Total", UINT, numParticles,
+                   NUM_PARTICLE_TYPES);
     writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord,
-                   6);
+                   NUM_PARTICLE_TYPES);
     double MassTable[6] = {0., 0., 0., 0., 0., 0.};
-    writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, 6);
-    writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, 6);
+    writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES);
+    unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0};
+    writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy,
+                   NUM_PARTICLE_TYPES);
     writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1);
 
     /* Close header */
@@ -636,21 +716,32 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
     /* Print the system of Units */
     writeUnitSystem(h_file, us);
 
-    /* Create SPH particles group */
-    /* message("Writing particle arrays..."); */
-    h_grp =
-        H5Gcreate(h_file, "/PartType0", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
-    if (h_grp < 0) error("Error while creating particle group.\n");
+    /* Loop over all particle types */
+    for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
 
-    /* Close particle group */
-    H5Gclose(h_grp);
+      /* Don't do anything if no particle of this kind */
+      if (N_total[ptype] == 0) continue;
+
+      /* Open the particle group in the file */
+      char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+      snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+               ptype);
+      h_grp = H5Gcreate(h_file, partTypeGroupName, H5P_DEFAULT, H5P_DEFAULT,
+                        H5P_DEFAULT);
+      if (h_grp < 0) {
+        error("Error while creating particle group.\n");
+      }
+
+      /* Close particle group */
+      H5Gclose(h_grp);
+    }
 
     /* Close file */
     H5Fclose(h_file);
   }
 
   /* Now loop over ranks and write the data */
-  for (rank = 0; rank < mpi_size; ++rank) {
+  for (int rank = 0; rank < mpi_size; ++rank) {
 
     /* Is it this rank's turn to write ? */
     if (rank == mpi_rank) {
@@ -659,18 +750,65 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
       if (h_file < 0)
         error("Error while opening file '%s' on rank %d.", fileName, mpi_rank);
 
-      /* Open SPH particles group */
-      /* message("Reading particle arrays..."); */
-      h_grp = H5Gopen(h_file, "/PartType0", H5P_DEFAULT);
-      if (h_grp < 0)
-        error("Error while opening particle group on rank %d.\n", mpi_rank);
+      /* Loop over all particle types */
+      for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
 
-      /* Write particle fields from the particle structure */
-      hydro_write_particles(h_grp, fileName, xmfFile, N, N_total, mpi_rank,
-                            offset, parts, us);
+        /* Don't do anything if no particle of this kind */
+        if (N_total[ptype] == 0) continue;
 
-      /* Close particle group */
-      H5Gclose(h_grp);
+        /* Add the global information for that particle type to the XMF
+         * meta-file */
+        if (mpi_rank == 0)
+          writeXMFgroupheader(xmfFile, fileName, N_total[ptype], ptype);
+
+        /* Open the particle group in the file */
+        char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+        snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+                 ptype);
+        h_grp = H5Gopen(h_file, partTypeGroupName, H5P_DEFAULT);
+        if (h_grp < 0) {
+          error("Error while opening particle group %s.", partTypeGroupName);
+        }
+
+        /* Write particle fields from the particle structure */
+        switch (ptype) {
+
+          case GAS:
+            hydro_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                                  N[ptype], N_total[ptype], mpi_rank,
+                                  offset[ptype], parts, us);
+
+            break;
+
+          case DM:
+            /* Allocate temporary array */
+            if (posix_memalign((void*)&dmparts, gpart_align,
+                               Ndm * sizeof(struct gpart)) != 0)
+              error("Error while allocating temporart memory for DM particles");
+            bzero(dmparts, Ndm * sizeof(struct gpart));
+
+            /* Collect the DM particles from gpart */
+            collect_dm_gparts(gparts, Ntot, dmparts, Ndm);
+
+            /* Write DM particles */
+            darkmatter_write_particles(h_grp, fileName, partTypeGroupName,
+                                       xmfFile, N[ptype], N_total[ptype],
+                                       mpi_rank, offset[ptype], dmparts, us);
+
+            /* Free temporary array */
+            free(dmparts);
+            break;
+
+          default:
+            error("Particle Type %d not yet supported. Aborting", ptype);
+        }
+
+        /* Close particle group */
+        H5Gclose(h_grp);
+
+        /* Close this particle group in the XMF file as well */
+        if (mpi_rank == 0) writeXMFgroupfooter(xmfFile, ptype);
+      }
 
       /* Close file */
       H5Fclose(h_file);
@@ -681,7 +819,7 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
   }
 
   /* Write footer of LXMF file descriptor */
-  if (mpi_rank == 0) writeXMFfooter(xmfFile);
+  if (mpi_rank == 0) writeXMFoutputfooter(xmfFile, outputCount, e->time);
 
   /* message("Done writing particles..."); */
   ++outputCount;
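
The per-type offset logic in write_output_serial above boils down to an exclusive prefix sum over the ranks followed by a broadcast of the grand totals from the last rank. Below is a minimal, standalone sketch of that pattern; the variable names (my_counts, offsets, totals) and the toy particle counts are illustrative and not taken from the SWIFT sources.

/* Sketch: per-type file offsets via an exclusive scan (illustrative only).
 * Build with an MPI C compiler, e.g. mpicc. */
#include <mpi.h>
#include <stdio.h>

#define NUM_TYPES 6

int main(int argc, char *argv[]) {
  MPI_Init(&argc, &argv);
  int rank = 0, size = 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* Pretend each rank holds (rank+1) gas and 2*(rank+1) DM particles. */
  long long my_counts[NUM_TYPES] = {rank + 1, 2 * (rank + 1), 0, 0, 0, 0};
  long long offsets[NUM_TYPES] = {0};
  long long totals[NUM_TYPES] = {0};

  /* Exclusive prefix sum: offsets[t] = sum of my_counts[t] on lower ranks. */
  MPI_Exscan(my_counts, offsets, NUM_TYPES, MPI_LONG_LONG, MPI_SUM,
             MPI_COMM_WORLD);

  /* MPI_Exscan leaves the result on rank 0 undefined, so reset it there. */
  if (rank == 0)
    for (int t = 0; t < NUM_TYPES; ++t) offsets[t] = 0;

  /* The last rank now holds the grand totals; broadcast them from there. */
  for (int t = 0; t < NUM_TYPES; ++t) totals[t] = offsets[t] + my_counts[t];
  MPI_Bcast(totals, NUM_TYPES, MPI_LONG_LONG, size - 1, MPI_COMM_WORLD);

  printf("rank %d: gas offset %lld of %lld total\n", rank, offsets[0],
         totals[0]);

  MPI_Finalize();
  return 0;
}
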
diff --git a/src/serial_io.h b/src/serial_io.h
index 95f09f5977a97a359e978db7a1b71b02030d6a14..5a34d420cfabd88d4147e3f3630e0efe89951c41 100644
--- a/src/serial_io.h
+++ b/src/serial_io.h
@@ -32,8 +32,9 @@
 #if defined(HAVE_HDF5) && defined(WITH_MPI) && !defined(HAVE_PARALLEL_HDF5)
 
 void read_ic_serial(char* fileName, double dim[3], struct part** parts,
-                    size_t* N, int* periodic, int mpi_rank, int mpi_size,
-                    MPI_Comm comm, MPI_Info info);
+                    struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
+                    int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                    MPI_Info info);
 
 void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
                          int mpi_size, MPI_Comm comm, MPI_Info info);
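
The split between *parts and *gparts in the prototype above (Ngas SPH particles, Ngparts = Ngas + Ndm gravity particles) can be pictured with a toy layout: the dark-matter particles fill the first Ndm slots of the gravity array, and one gravity particle per gas particle is appended after them. The structures below are stand-ins and do not reflect the actual fields of struct part or struct gpart.

/* Toy illustration of the gpart layout (not the SWIFT structures). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_part {   /* stand-in for struct part  */
  double x[3];
  float mass;
};

struct toy_gpart {  /* stand-in for struct gpart */
  double x[3];
  float mass;
  long long link;   /* here: >= 0 for a DM id, < 0 for a link to a gas particle */
};

int main(void) {
  const size_t Ngas = 3, Ndm = 2, Ntot = Ngas + Ndm;

  struct toy_part *parts = calloc(Ngas, sizeof(*parts));
  struct toy_gpart *gparts = calloc(Ntot, sizeof(*gparts));
  if (parts == NULL || gparts == NULL) return 1;

  /* DM particles are read directly into gparts[0 .. Ndm). */
  for (size_t i = 0; i < Ndm; i++) {
    gparts[i].mass = 2.0f;
    gparts[i].link = (long long)i;
  }

  /* Gas particles live in parts[] and are duplicated into gparts[Ndm ..). */
  for (size_t i = 0; i < Ngas; i++) {
    parts[i].mass = 1.0f;
    memcpy(gparts[Ndm + i].x, parts[i].x, sizeof(parts[i].x));
    gparts[Ndm + i].mass = parts[i].mass;
    gparts[Ndm + i].link = -(long long)(i + 1); /* back-link to parts[i] */
  }

  printf("gparts: %zu DM followed by %zu gas duplicates (total %zu)\n", Ndm,
         Ngas, Ntot);
  free(parts);
  free(gparts);
  return 0;
}
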
diff --git a/src/single_io.c b/src/single_io.c
index ab105175e111c74f5886e41d2f8964d18789ee7d..801428433ef5170082b68dec425e52f845bb41ae 100644
--- a/src/single_io.c
+++ b/src/single_io.c
@@ -39,9 +39,6 @@
 #include "common_io.h"
 #include "error.h"
 
-#define FILENAME_BUFFER_SIZE 150
-#define PARTICLE_GROUP_BUFFER_SIZE 20
-
 /*-----------------------------------------------------------------------------
  * Routines reading an IC file
  *-----------------------------------------------------------------------------*/
@@ -56,24 +53,23 @@
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
  *interest in the parts array
+ * @param partSize The size in bytes of the particle structure.
  * @param importance If COMPULSORY, the data must be present in the IC file. If
  *OPTIONAL, the array will be zeroed when the data is not present.
  *
  * @todo A better version using HDF5 hyper-slabs to read the file directly into
  *the part array
  * will be written once the structures have been stabilized.
- *
- * Calls #error() if an error occurs.
  */
 void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
-                      int dim, char* part_c, enum DATA_IMPORTANCE importance) {
+                      int dim, char* part_c, size_t partSize,
+                      enum DATA_IMPORTANCE importance) {
   hid_t h_data = 0, h_err = 0, h_type = 0;
   htri_t exist = 0;
   void* temp;
   int i = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
-  const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
 
   /* Check whether the dataspace exists or not */
@@ -141,23 +137,25 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
  * @param grp The group in which to write.
  * @param fileName The name of the file in which the data is written
  * @param xmfFile The FILE used to write the XMF description
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param name The name of the array to write.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
- *interest in the parts array
+ *interest in the parts array.
+ * @param partSize The size in bytes of the particle structure.
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
  *
  * @todo A better version using HDF5 hyper-slabs to write the file directly from
  *the part array
  * will be written once the structures have been stabilized.
- *
- * Calls #error() if an error occurs.
  */
-void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                       enum DATA_TYPE type, int N, int dim, char* part_c,
+void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
+                       char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                       int N, int dim, char* part_c, size_t partSize,
                        struct UnitSystem* us,
                        enum UnitConversionFactor convFactor) {
   hid_t h_data = 0, h_err = 0, h_space = 0, h_prop = 0;
@@ -165,7 +163,6 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
-  const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
   hsize_t shape[2];
   hsize_t chunk_shape[2];
@@ -202,6 +199,9 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
     chunk_shape[1] = 0;
   }
 
+  /* Make sure the chunks are not larger than the dataset */
+  if (chunk_shape[0] > N) chunk_shape[0] = N;
+
   /* Change shape of data space */
   h_err = H5Sset_extent_simple(h_space, rank, shape, NULL);
   if (h_err < 0) {
@@ -238,7 +238,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   }
 
   /* Write XMF description for this data set */
-  writeXMFline(xmfFile, fileName, name, N, dim, type);
+  writeXMFline(xmfFile, fileName, partTypeGroupName, name, N, dim, type);
 
   /* Write unit conversion factors for this data set */
   conversionString(buffer, us, convFactor);
@@ -273,7 +273,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
 #define readArray(grp, name, type, N, dim, part, N_total, offset, field, \
                   importance)                                            \
   readArrayBackEnd(grp, name, type, N, dim, (char*)(&(part[0]).field),   \
-                   importance)
+                   sizeof(part[0]), importance)
 
 /**
  * @brief A helper macro to call the readArrayBackEnd function more easily.
@@ -282,6 +282,8 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param fileName The name of the file in which the data is written
  * @param xmfFile The FILE used to write the XMF description
  * @param name The name of the array to write.
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
@@ -295,10 +297,12 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, \
-                   mpi_rank, offset, field, us, convFactor)                   \
-  writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim,               \
-                    (char*)(&(part[0]).field), us, convFactor)
+#define writeArray(grp, fileName, xmfFile, partTypeGroupName, name, type, N,  \
+                   dim, part, N_total, mpi_rank, offset, field, us,           \
+                   convFactor)                                                \
+  writeArrayBackEnd(grp, fileName, xmfFile, partTypeGroupName, name, type, N, \
+                    dim, (char*)(&(part[0]).field), sizeof(part[0]), us,      \
+                    convFactor)
 
 /* Import the right hydro definition */
 #include "hydro_io.h"
@@ -311,9 +315,9 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param fileName The file to read.
  * @param dim (output) The dimension of the volume.
  * @param parts (output) Array of Gas particles.
- * @param gparts (output) Array of DM particles.
+ * @param gparts (output) Array of #gpart particles.
  * @param Ngas (output) number of Gas particles read.
- * @param Ngparts (output) The number of DM particles read.
+ * @param Ngparts (output) The number of #gpart read.
  * @param periodic (output) 1 if the volume is periodic, 0 if not.
  *
  * Opens the HDF5 file fileName and reads the particles contained
@@ -334,6 +338,8 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
   double boxSize[3] = {0.0, -1.0, -1.0};
   /* GADGET has 6 particle types. We only keep the type 0 & 1 for now...*/
   int numParticles[NUM_PARTICLE_TYPES] = {0};
+  int numParticles_highWord[NUM_PARTICLE_TYPES] = {0};
+  size_t N[NUM_PARTICLE_TYPES] = {0};
   size_t Ndm;
 
   /* Open file */
@@ -362,9 +368,12 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
   /* Read the relevant information and print status */
   readAttribute(h_grp, "BoxSize", DOUBLE, boxSize);
   readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
+  readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
+
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+    N[ptype] = ((long long)numParticles[ptype]) +
+               ((long long)numParticles_highWord[ptype] << 32);
 
-  *Ngas = numParticles[0];
-  Ndm = numParticles[1];
   dim[0] = boxSize[0];
   dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1];
   dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2];
@@ -375,16 +384,16 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
   /* Close header */
   H5Gclose(h_grp);
 
-  /* Total number of particles */
-  *Ngparts = *Ngas + Ndm;
-
   /* Allocate memory to store SPH particles */
+  *Ngas = N[0];
   if (posix_memalign((void*)parts, part_align, *Ngas * sizeof(struct part)) !=
       0)
     error("Error while allocating memory for SPH particles");
   bzero(*parts, *Ngas * sizeof(struct part));
 
   /* Allocate memory to store all particles */
+  Ndm = N[1];
+  *Ngparts = N[1] + N[0];
   if (posix_memalign((void*)gparts, gpart_align,
                      *Ngparts * sizeof(struct gpart)) != 0)
     error("Error while allocating memory for gravity particles");
@@ -393,20 +402,19 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
   /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) /
    * (1024.*1024.)); */
 
-  /* Open SPH particles group */
-  /* message("Reading particle arrays..."); */
-  message("BoxSize = %lf", dim[0]);
-  message("NumPart = [%zd, %zd] Total = %zd", *Ngas, Ndm, *Ngparts);
+  /* message("BoxSize = %lf", dim[0]); */
+  /* message("NumPart = [%zd, %zd] Total = %zd", *Ngas, Ndm, *Ngparts); */
 
   /* Loop over all particle types */
   for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
 
     /* Don't do anything if no particle of this kind */
-    if (numParticles[ptype] == 0) continue;
+    if (N[ptype] == 0) continue;
 
     /* Open the particle group in the file */
     char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
-    sprintf(partTypeGroupName, "/PartType%d", ptype);
+    snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+             ptype);
     h_grp = H5Gopen(h_file, partTypeGroupName, H5P_DEFAULT);
     if (h_grp < 0) {
       error("Error while opening particle group %s.", partTypeGroupName);
@@ -472,14 +480,17 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
   static int outputCount = 0;
 
   /* Number of particles of each type */
-  const size_t Ndm = Ntot - Ngas;
-  int numParticles[NUM_PARTICLE_TYPES] = /* Gadget-2 convention here */
-      {Ngas, Ndm, 0};                    /* Could use size_t instead */
-  int numParticlesHighWord[NUM_PARTICLE_TYPES] = {0};
+  // const size_t Ndm = Ntot - Ngas;
+
+  /* MATTHIEU: Temporary fix to preserve master */
+  const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
+  /* MATTHIEU: End temporary fix */
+
+  long long N_total[NUM_PARTICLE_TYPES] = {Ngas, Ndm, 0};
 
   /* File name */
   char fileName[FILENAME_BUFFER_SIZE];
-  sprintf(fileName, "output_%03i.hdf5", outputCount);
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "output_%03i.hdf5", outputCount);
 
   /* First time, we need to create the XMF file */
   if (outputCount == 0) createXMFfile();
@@ -489,7 +500,7 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
   xmfFile = prepareXMFfile();
 
   /* Write the part corresponding to this specific output */
-  writeXMFheader(xmfFile, Ngas, fileName, e->time);
+  writeXMFoutputheader(xmfFile, fileName, e->time);
 
   /* Open file */
   /* message("Opening file '%s'.", fileName); */
@@ -517,19 +528,27 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
 
   /* Print the relevant information and print status */
   writeAttribute(h_grp, "BoxSize", DOUBLE, e->s->dim, 3);
-  writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles,
-                 NUM_PARTICLE_TYPES);
   double dblTime = e->time;
   writeAttribute(h_grp, "Time", DOUBLE, &dblTime, 1);
 
   /* GADGET-2 legacy values */
+  /* Number of particles of each type */
+  unsigned int numParticles[NUM_PARTICLE_TYPES] = {0};
+  unsigned int numParticlesHighWord[NUM_PARTICLE_TYPES] = {0};
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+    numParticles[ptype] = (unsigned int)N_total[ptype];
+    numParticlesHighWord[ptype] = (unsigned int)(N_total[ptype] >> 32);
+  }
+  writeAttribute(h_grp, "NumPart_ThisFile", LONGLONG, N_total,
+                 NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumPart_Total", UINT, numParticles,
                  NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord,
                  NUM_PARTICLE_TYPES);
-  double MassTable[NUM_PARTICLE_TYPES] = {0., 0., 0., 0., 0., 0.};
+  double MassTable[NUM_PARTICLE_TYPES] = {0};
   writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES);
-  writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, numParticlesHighWord,
+  unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0};
+  writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy,
                  NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1);
 
@@ -554,9 +573,13 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
     /* Don't do anything if no particle of this kind */
     if (numParticles[ptype] == 0) continue;
 
+    /* Add the global information for that particle type to the XMF meta-file */
+    writeXMFgroupheader(xmfFile, fileName, numParticles[ptype], ptype);
+
     /* Open the particle group in the file */
     char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
-    sprintf(partTypeGroupName, "/PartType%d", ptype);
+    snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+             ptype);
     h_grp = H5Gcreate(h_file, partTypeGroupName, H5P_DEFAULT, H5P_DEFAULT,
                       H5P_DEFAULT);
     if (h_grp < 0) {
@@ -569,8 +592,8 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
     switch (ptype) {
 
       case GAS:
-        hydro_write_particles(h_grp, fileName, xmfFile, Ngas, Ngas, 0, 0, parts,
-                              us);
+        hydro_write_particles(h_grp, fileName, partTypeGroupName, xmfFile, Ngas,
+                              Ngas, 0, 0, parts, us);
         break;
 
       case DM:
@@ -584,8 +607,8 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
         collect_dm_gparts(gparts, Ntot, dmparts, Ndm);
 
         /* Write DM particles */
-        darkmatter_write_particles(h_grp, fileName, xmfFile, Ndm, Ndm, 0, 0,
-                                   dmparts, us);
+        darkmatter_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                                   Ndm, Ndm, 0, 0, dmparts, us);
 
         /* Free temporary array */
         free(dmparts);
@@ -597,10 +620,13 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
 
     /* Close particle group */
     H5Gclose(h_grp);
+
+    /* Close this particle group in the XMF file as well */
+    writeXMFgroupfooter(xmfFile, ptype);
   }
 
   /* Write LXMF file descriptor */
-  writeXMFfooter(xmfFile);
+  writeXMFoutputfooter(xmfFile, outputCount, e->time);
 
   /* message("Done writing particles..."); */
 
diff --git a/src/space.c b/src/space.c
index 3d415be930797d0e98a4b1a9cc9bdc88a6a83691..77f6c28bc7fa86fd86e2319b61c3ca966d07ac5c 100644
--- a/src/space.c
+++ b/src/space.c
@@ -97,12 +97,10 @@ const int sortlistID[27] = {
 int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
                  double *shift) {
 
-  int k, sid = 0, periodic = s->periodic;
-  struct cell *temp;
-  double dx[3];
-
   /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
+  const int periodic = s->periodic;
+  double dx[3];
+  for (int k = 0; k < 3; k++) {
     dx[k] = (*cj)->loc[k] - (*ci)->loc[k];
     if (periodic && dx[k] < -s->dim[k] / 2)
       shift[k] = s->dim[k];
@@ -114,15 +112,16 @@ int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
   }
 
   /* Get the sorting index. */
-  for (k = 0; k < 3; k++)
+  int sid = 0;
+  for (int k = 0; k < 3; k++)
     sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1));
 
   /* Switch the cells around? */
   if (runner_flip[sid]) {
-    temp = *ci;
+    struct cell *temp = *ci;
     *ci = *cj;
     *cj = temp;
-    for (k = 0; k < 3; k++) shift[k] = -shift[k];
+    for (int k = 0; k < 3; k++) shift[k] = -shift[k];
   }
   sid = sortlistID[sid];
 
@@ -137,10 +136,8 @@ int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
 
 void space_rebuild_recycle(struct space *s, struct cell *c) {
 
-  int k;
-
   if (c->split)
-    for (k = 0; k < 8; k++)
+    for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL) {
         space_rebuild_recycle(s, c->progeny[k]);
         space_recycle(s, c->progeny[k]);
@@ -158,19 +155,19 @@ void space_rebuild_recycle(struct space *s, struct cell *c) {
 
 void space_regrid(struct space *s, double cell_max, int verbose) {
 
-  float h_max = s->cell_min / kernel_gamma / space_stretch, dmin;
-  int i, j, k, cdim[3], nr_parts = s->nr_parts;
+  float h_max = s->cell_min / kernel_gamma / space_stretch;
+  const size_t nr_parts = s->nr_parts;
   struct cell *restrict c;
   ticks tic = getticks();
 
   /* Run through the parts and get the current h_max. */
   // tic = getticks();
   if (s->cells != NULL) {
-    for (k = 0; k < s->nr_cells; k++) {
+    for (int k = 0; k < s->nr_cells; k++) {
       if (s->cells[k].h_max > h_max) h_max = s->cells[k].h_max;
     }
   } else {
-    for (k = 0; k < nr_parts; k++) {
+    for (size_t k = 0; k < nr_parts; k++) {
       if (s->parts[k].h > h_max) h_max = s->parts[k].h;
     }
     s->h_max = h_max;
@@ -190,7 +187,8 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
   if (verbose) message("h_max is %.3e (cell_max=%.3e).", h_max, cell_max);
 
   /* Get the new putative cell dimensions. */
-  for (k = 0; k < 3; k++)
+  int cdim[3];
+  for (int k = 0; k < 3; k++)
     cdim[k] =
         floor(s->dim[k] / fmax(h_max * kernel_gamma * space_stretch, cell_max));
 
@@ -213,7 +211,7 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
 
     /* Free the old cells, if they were allocated. */
     if (s->cells != NULL) {
-      for (k = 0; k < s->nr_cells; k++) {
+      for (int k = 0; k < s->nr_cells; k++) {
         space_rebuild_recycle(s, &s->cells[k]);
         if (s->cells[k].sort != NULL) free(s->cells[k].sort);
       }
@@ -222,12 +220,12 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
     }
 
     /* Set the new cell dimensions only if smaller. */
-    for (k = 0; k < 3; k++) {
+    for (int k = 0; k < 3; k++) {
       s->cdim[k] = cdim[k];
       s->h[k] = s->dim[k] / cdim[k];
       s->ih[k] = 1.0 / s->h[k];
     }
-    dmin = fminf(s->h[0], fminf(s->h[1], s->h[2]));
+    const float dmin = fminf(s->h[0], fminf(s->h[1], s->h[2]));
 
     /* Allocate the highest level of cells. */
     s->tot_cells = s->nr_cells = cdim[0] * cdim[1] * cdim[2];
@@ -235,13 +233,13 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
                        s->nr_cells * sizeof(struct cell)) != 0)
       error("Failed to allocate cells.");
     bzero(s->cells, s->nr_cells * sizeof(struct cell));
-    for (k = 0; k < s->nr_cells; k++)
+    for (int k = 0; k < s->nr_cells; k++)
       if (lock_init(&s->cells[k].lock) != 0) error("Failed to init spinlock.");
 
     /* Set the cell location and sizes. */
-    for (i = 0; i < cdim[0]; i++)
-      for (j = 0; j < cdim[1]; j++)
-        for (k = 0; k < cdim[2]; k++) {
+    for (int i = 0; i < cdim[0]; i++)
+      for (int j = 0; j < cdim[1]; j++)
+        for (int k = 0; k < cdim[2]; k++) {
           c = &s->cells[cell_getid(cdim, i, j, k)];
           c->loc[0] = i * s->h[0];
           c->loc[1] = j * s->h[1];
@@ -271,7 +269,7 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
   else {
 
     /* Free the old cells, if they were allocated. */
-    for (k = 0; k < s->nr_cells; k++) {
+    for (int k = 0; k < s->nr_cells; k++) {
       space_rebuild_recycle(s, &s->cells[k]);
       s->cells[k].sorts = NULL;
       s->cells[k].nr_tasks = 0;
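
The sort index computed in space_getsid() above encodes the relative position of a cell pair: each component of the (wrapped) displacement maps to 0, 1 or 2 according to its sign, and the three digits combine into a base-3 index between 0 and 26. The sketch below reproduces only that arithmetic; the runner_flip and sortlistID tables that fold mirrored pairs onto the 13 canonical directions are SWIFT internals and are left out.

/* Sketch of the pair sort-index arithmetic (illustrative only). */
#include <stdio.h>

static int pair_sid(const double dx[3]) {
  int sid = 0;
  for (int k = 0; k < 3; k++)
    sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1));
  return sid;
}

int main(void) {
  const double face[3] = {1.0, 0.0, 0.0};   /* neighbour sharing a face   */
  const double edge[3] = {1.0, -1.0, 0.0};  /* neighbour sharing an edge  */
  const double corner[3] = {1.0, 1.0, 1.0}; /* neighbour sharing a corner */

  /* face -> digits (2,1,1) -> 22, edge -> (2,0,1) -> 19, corner -> 26 */
  printf("face sid = %d, edge sid = %d, corner sid = %d\n", pair_sid(face),
         pair_sid(edge), pair_sid(corner));
  return 0;
}
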
diff --git a/src/task.c b/src/task.c
index 91c202ad96b14bb9417f7b52f8e8d8b9c83496a8..5f1475a46e4626e1f51db673d73fd84f86e6edb6 100644
--- a/src/task.c
+++ b/src/task.c
@@ -147,7 +147,7 @@ int task_lock(struct task *t) {
 
 #ifdef WITH_MPI
     /* Check the status of the MPI request. */
-    int res, err;
+    int res = 0, err = 0;
     MPI_Status stat;
     if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) {
       char buff[MPI_MAX_ERROR_STRING];
diff --git a/theory/paper_pasc/Figures/Hierarchy3.pdf b/theory/paper_pasc/Figures/Hierarchy3.pdf
index 235cd289b1dbda5ac509e46f5432bdaa7d636724..589e5bbb445d5b141aa400dd3490b30e2cd3fdd3 100644
Binary files a/theory/paper_pasc/Figures/Hierarchy3.pdf and b/theory/paper_pasc/Figures/Hierarchy3.pdf differ
diff --git a/theory/paper_pasc/Figures/Hierarchy3.svg b/theory/paper_pasc/Figures/Hierarchy3.svg
index 5731bcbf2a2e64891b9ddd01c75a1e1c7babfe37..5954d8f0197901895890c639c9c757cd067e235e 100644
--- a/theory/paper_pasc/Figures/Hierarchy3.svg
+++ b/theory/paper_pasc/Figures/Hierarchy3.svg
@@ -23,14 +23,14 @@
      borderopacity="1.0"
      inkscape:pageopacity="0.0"
      inkscape:pageshadow="2"
-     inkscape:zoom="2"
+     inkscape:zoom="0.93"
      inkscape:cx="335.14507"
-     inkscape:cy="476.92781"
+     inkscape:cy="494.74535"
      inkscape:document-units="px"
      inkscape:current-layer="layer1"
      showgrid="true"
-     inkscape:window-width="1920"
-     inkscape:window-height="1033"
+     inkscape:window-width="1366"
+     inkscape:window-height="721"
      inkscape:window-x="0"
      inkscape:window-y="0"
      inkscape:window-maximized="1">
@@ -1302,5 +1302,27 @@
          x="415.34085"
          id="tspan7331"
          sodipodi:role="line">x</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:12px;font-style:normal;font-weight:bold;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans;-inkscape-font-specification:Sans Bold"
+       x="239.18359"
+       y="364.36218"
+       id="text3183"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan3185"
+         x="239.18359"
+         y="364.36218">Node 1</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text3187"
+       y="364.36218"
+       x="499.18359"
+       style="font-size:12px;font-style:normal;font-weight:bold;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans;-inkscape-font-specification:Sans Bold"
+       xml:space="preserve"><tspan
+         y="364.36218"
+         x="499.18359"
+         id="tspan3189"
+         sodipodi:role="line">Node 2</tspan></text>
   </g>
 </svg>
diff --git a/theory/paper_pasc/biblio.bib b/theory/paper_pasc/biblio.bib
index c0eead89fdf29d3503b5c2a11582f8b59ec45b8f..c779f129fa8d0384ea346cc72d57859e4afa261f 100644
--- a/theory/paper_pasc/biblio.bib
+++ b/theory/paper_pasc/biblio.bib
@@ -221,16 +221,6 @@ archivePrefix = "arXiv",
   organization={University of Edinburgh}
 }
 
-@article{gonnet2015efficient,
-  title={Efficient and Scalable Algorithms for Smoothed Particle Hydrodynamics on Hybrid Shared/Distributed-Memory Architectures},
-  author={Gonnet, Pedro},
-  journal={SIAM Journal on Scientific Computing},
-  volume={37},
-  number={1},
-  pages={C95--C121},
-  year={2015},
-  publisher={SIAM}
-}
    
 @article{ref:Dagum1998,
     title={{OpenMP}: an industry standard {API} for shared-memory programming},
@@ -390,3 +380,16 @@ archivePrefix = "arXiv",
   bibsource = {dblp computer science bibliography, http://dblp.org}
 }
 
+@inproceedings{ref:Kale1993,
+  author        = "{Kal\'{e}}, L.V. and Krishnan, S.",
+  title         = "{CHARM++: A Portable Concurrent Object Oriented System
+                   Based on C++}",
+  editor        = "Paepcke, A.",
+  fulleditor    = "Paepcke, Andreas",
+  pages         = "91--108",
+  month         = "September",
+  year          = "1993",
+  booktitle     = "{Proceedings of OOPSLA'93}",
+  publisher     = "{ACM Press}",
+}
+
diff --git a/theory/paper_pasc/pasc_paper.tex b/theory/paper_pasc/pasc_paper.tex
index 629ff7bc1b01b629dc1b909b7675dfadc8c1c14d..87868a74d69023ee48d793cb2d0e29e71af777d6 100644
--- a/theory/paper_pasc/pasc_paper.tex
+++ b/theory/paper_pasc/pasc_paper.tex
@@ -49,35 +49,42 @@ strong scaling on more than 100\,000 cores.}
   
 \author{
 \alignauthor
-       Matthieu~Schaller\\
-       \affaddr{Institute for Computational Cosmology (ICC)}\\
-       \affaddr{Department of Physics}\\
-       \affaddr{Durham University}\\
-       \affaddr{Durham DH1 3LE, UK}\\
-       \email{\footnotesize \url{matthieu.schaller@durham.ac.uk}}
-\alignauthor
-       Pedro~Gonnet\\
-       \affaddr{School of Engineering and Computing Sciences}\\
-       \affaddr{Durham University}\\
-       \affaddr{Durham DH1 3LE, UK}\\
-\alignauthor
-       Aidan~B.~G.~Chalk\\
-       \affaddr{School of Engineering and Computing Sciences}\\
-       \affaddr{Durham University}\\
-       \affaddr{Durham DH1 3LE, UK}\\
-\and
-\alignauthor
-       Peter~W.~Draper\\
-       \affaddr{Institute for Computational Cosmology (ICC)}\\
-       \affaddr{Department of Physics}\\
-       \affaddr{Durham University}\\
-       \affaddr{Durham DH1 3LE, UK}\\
-       %% \alignauthor
-       %% Tom Theuns\\
-       %% \affaddr{Institute for Computational Cosmology}\\
-       %% \affaddr{Department of Physics}\\
-       %% \affaddr{Durham University}\\
-       %% \affaddr{Durham DH1 3LE, UK}      
+       Main~Author\\
+       \affaddr{Institute}\\
+       \affaddr{Department}\\
+       \affaddr{University}\\
+       \affaddr{City Postal code, Country}\\
+       \email{\footnotesize \url{main.author@university.country}}
+% \alignauthor
+%        Matthieu~Schaller\\
+%        \affaddr{Institute for Computational Cosmology (ICC)}\\
+%        \affaddr{Department of Physics}\\
+%        \affaddr{Durham University}\\
+%        \affaddr{Durham DH1 3LE, UK}\\
+%        \email{\footnotesize \url{matthieu.schaller@durham.ac.uk}}
+% \alignauthor
+%        Pedro~Gonnet\\
+%        \affaddr{School of Engineering and Computing Sciences}\\
+%        \affaddr{Durham University}\\
+%        \affaddr{Durham DH1 3LE, UK}\\
+% \alignauthor
+%        Aidan~B.~G.~Chalk\\
+%        \affaddr{School of Engineering and Computing Sciences}\\
+%        \affaddr{Durham University}\\
+%        \affaddr{Durham DH1 3LE, UK}\\
+% \and
+% \alignauthor
+%        Peter~W.~Draper\\
+%        \affaddr{Institute for Computational Cosmology (ICC)}\\
+%        \affaddr{Department of Physics}\\
+%        \affaddr{Durham University}\\
+%        \affaddr{Durham DH1 3LE, UK}\\
+%        %% \alignauthor
+%        %% Tom Theuns\\
+%        %% \affaddr{Institute for Computational Cosmology}\\
+%        %% \affaddr{Department of Physics}\\
+%        %% \affaddr{Durham University}\\
+%        %% \affaddr{Durham DH1 3LE, UK}      
 }
 
 
@@ -180,7 +187,7 @@ The design and implementation of \swift\footnote{
 \swift is an open-source software project and the latest version of
 the source code, along with all the data needed to run the test cased
 presented in this paper, can be downloaded at \web.}
-\cite{gonnet2013swift,theuns2015swift,gonnet2015efficient}, a large-scale
+\cite{gonnet2013swift,theuns2015swift,ref:Gonnet2015}, a large-scale
 cosmological simulation code built from scratch, provided the perfect
 opportunity to test some newer
 approaches, i.e.~task-based parallelism, fully asynchronous communication, and
@@ -189,6 +196,9 @@ graph partition-based domain decompositions.
 This paper describes these techniques, which are not exclusive to
 cosmological simulations or any specific architecture, as well as
 the results obtained with them.
+While \cite{gonnet2013swift,ref:Gonnet2015} already describe the underlying algorithms
+in more detail, in this paper we focus on the parallelization strategy
+and the results obtained on larger Tier-0 systems.
 
 
 %#####################################################################################################
@@ -261,24 +271,30 @@ separately:
         within range of each other and evaluate \eqn{dvdt} and \eqn{dudt}.
 \end{enumerate}
 
-Finding the interacting neighbours for each particle constitutes
-the bulk of the computation.
-Many codes, e.g. in Astrophysics simulations \cite{Gingold1977},
-rely on spatial {\em trees}
-for neighbour finding \cite{Gingold1977,Hernquist1989,Springel2005,Wadsley2004},
-i.e.~$k$-d trees \cite{Bentley1975} or octrees \cite{Meagher1982}
-are used to decompose the simulation space.
-In Astrophysics in particular, spatial trees are also a somewhat natural
-choice as they are used to compute long-range gravitational interactions
-via a Barnes-Hut \cite{Barnes1986} or Fast Multipole
-\cite{Carrier1988} method. 
-The particle interactions are then computed by traversing the list of
-particles and searching for their neighbours in the tree.
+Finding the interacting neighbours for each particle constitutes the
+bulk of the computation.  Many codes, e.g. in Astrophysics simulations
+\cite{Gingold1977}, rely on spatial {\em trees} for neighbour finding
+\cite{Gingold1977,Hernquist1989,Springel2005,Wadsley2004}, i.e.~$k$-d
+trees \cite{Bentley1975} or octrees \cite{Meagher1982} are used to
+decompose the simulation space.  In Astrophysics in particular,
+spatial trees are also a somewhat natural choice as they are used to
+compute long-range gravitational interactions via a Barnes-Hut
+\cite{Barnes1986} or Fast Multipole \cite{Carrier1988} method.  The
+particle interactions are then computed by traversing the list of
+particles and searching for their neighbours in the tree.  In current
+state-of-the-art simulations (e.g. \cite{Schaye2015}), the gravity
+calculation corresponds to roughly one quarter of the calculation time
+whilst the hydrodynamics scheme takes approximately half of the
+total time. The remainder is spent in the astrophysics modules, which
+contain interactions of the same kind as the SPH sums presented
+here. Gravity could, however, be vastly sped up compared to commonly
+used software by employing more modern algorithms such as the Fast
+Multipole Method \cite{Carrier1988}.
 
 Although such tree traversals are trivial to parallelize, they
 have several disadvantages, e.g.~with regards to computational
 efficiency, cache efficiency, and exploiting symmetries in the
-computation (see \cite{gonnet2015efficient} for a more detailed
+computation (see \cite{ref:Gonnet2015} for a more detailed
 analysis).
 
 
@@ -337,7 +353,8 @@ Task-based parallelism is not a particularly new concept and therefore
 several implementations thereof exist, e.g.~Cilk \cite{ref:Blumofe1995},
 QUARK \cite{ref:QUARK}, StarPU \cite{ref:Augonnet2011},
 SMP~Superscalar \cite{ref:SMPSuperscalar}, OpenMP~3.0 \cite{ref:Duran2009},
-and Intel's TBB \cite{ref:Reinders2007}.
+Intel's TBB \cite{ref:Reinders2007}, and, to some extent,
+Charm++ \cite{ref:Kale1993}.
 
 For convenience, and to make experimenting with different scheduling
 techniques easier, we chose to implement our own task scheduler
@@ -357,7 +374,7 @@ which is usually not an option for large and complex codebases.
 
 Since we were re-implementing \swift from scratch, this was not an issue.
 The tree-based neighbour-finding described above was replaced with a more
-task-friendly approach as described in \cite{gonnet2015efficient}.
+task-friendly approach as described in \cite{ref:Gonnet2015}.
 Particle interactions are computed within, and between pairs, of
 hierarchical {\em cells} containing one or more particles.
 The dependencies between the tasks are set following
@@ -375,24 +392,30 @@ on a cell of particles have completed before the force evaluation tasks
 Once all the force tasks on a cell of particles have completed,
 the integrator tasks (inverted triangles) update the particle positions 
 and velocities.
+The decomposition is computed such that each cell contains $\sim 100$ particles,
+which leads to tasks of up to a few milliseconds each.
 
 Due to the cache-friendly nature of the task-based computations, 
 and their ability to exploit symmetries in the particle interactions,
 the task-based approach is already more efficient than the tree-based
 neighbour search on a single core, and scales efficiently to all
-cores of a shared-memory machine \cite{gonnet2015efficient}.
+cores of a shared-memory machine \cite{ref:Gonnet2015}.
 
 \begin{figure}
 \centering
 \includegraphics[width=\columnwidth]{Figures/Hierarchy3}
 \caption{Task hierarchy for the SPH computations in \swift,
-  including communication tasks. Arrows indicate dependencies,
+  including communication tasks.
+  Arrows indicate dependencies,
   i.e.~an arrow from task $A$ to task $B$ indicates that $A$
-  depends on $B$. The task color corresponds to the cell or
+  depends on $B$. 
+  The task color corresponds to the cell or
   cells it operates on, e.g.~the density and force tasks work
   on individual cells or pairs of cells.
-  The blue cell data is on a separate rank as the yellow and
-  purple cells, and thus its data must be sent across during
+  The task hierarchy is shown for three cells: blue, yellow,
+  and purple. Although the data for the yellow cell resides on
+  Node~2, it is required for some tasks on Node~1, and thus needs
+  to be copied over during
   the computation using {\tt send}/{\tt recv} tasks (diamond-shaped).}
 \label{tasks}
 \end{figure}  
@@ -432,8 +455,13 @@ corresponds to minimizing the time spent on the slowest rank.
 Computing such a partition is a standard graph problem and several
 software libraries which provide good solutions\footnote{Computing
 the optimal partition for more than two nodes is considered NP-hard.},
-e.g.~METIS \cite{ref:Karypis1998} and Zoltan \cite{devine2002zoltan},
-exist.
+e.g.~METIS \cite{ref:Karypis1998} and Zoltan \cite{devine2002zoltan}, exist.
+
+In \swift, the graph partitioning is computed using the METIS library.
+The cost of each task is initially approximated via the
+asymptotic cost of the task type and the number of particles involved.
+Once a task has been executed, its measured computational cost
+is used instead.
 
 Note that this approach does not explicitly consider any geometric
 constraints, or strive to partition the {\em amount} of data equitably.
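
The cost-based weighting described in the paragraph above can be pictured with a small example: a self task adds its measured cost to the vertex of the cell it acts on, while a pair task adds to both vertices and to the edge joining them; the weighted cell graph is then handed to the partitioner. The even split of pair costs and all names below are illustrative assumptions, not the actual weighting used in \swift.

/* Toy accumulation of task costs into cell-graph weights (illustrative). */
#include <stdio.h>

#define NCELLS 4

struct toy_task {
  int ci, cj;  /* cells the task acts on; cj < 0 marks a self task */
  double cost; /* measured cost, e.g. in ticks */
};

int main(void) {
  const struct toy_task tasks[] = {
      {0, -1, 4.0}, {1, -1, 3.0}, {0, 1, 2.0}, {1, 2, 1.5}, {2, 3, 2.5}};
  const int ntasks = (int)(sizeof(tasks) / sizeof(tasks[0]));

  double vweight[NCELLS] = {0.0};           /* per-cell compute weight  */
  double eweight[NCELLS][NCELLS] = {{0.0}}; /* per-pair coupling weight */

  for (int t = 0; t < ntasks; t++) {
    if (tasks[t].cj < 0) {
      vweight[tasks[t].ci] += tasks[t].cost;
    } else {
      /* Split a pair task's cost between its two cells and record the
       * coupling between them (an edge weight in the cell graph). */
      vweight[tasks[t].ci] += 0.5 * tasks[t].cost;
      vweight[tasks[t].cj] += 0.5 * tasks[t].cost;
      eweight[tasks[t].ci][tasks[t].cj] += tasks[t].cost;
      eweight[tasks[t].cj][tasks[t].ci] += tasks[t].cost;
    }
  }

  for (int c = 0; c < NCELLS; c++)
    printf("cell %d: vertex weight %.1f\n", c, vweight[c]);
  printf("edge 0-1: weight %.1f\n", eweight[0][1]);
  return 0;
}
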
@@ -567,6 +595,11 @@ conditions are provided with the source code. We then run the \swift code for
 100 time-steps and average the wall clock time of these time-steps after having
 removed the first and last ones, where disk I/O occurs.
 
+The initial load balancing between nodes is computed in the first steps and
+re-computed every 100 time-steps, and is therefore not included in the timings.
+Particles are exchanged between nodes whenever they stray too far beyond
+the cells in which they originally resided.
+
 \begin{figure}
 \centering
 \includegraphics[width=\columnwidth]{Figures/cosmoVolume}
@@ -630,8 +663,9 @@ the 16 node run.
   is achieved when increasing the thread count from 1 (1 node) to 128 (8 nodes)
   even on this relatively small test case. The dashed line indicates the
   efficiency when running on one single node but using all the physical and
-  virtual cores (hyper-threading). As these CPUs only have one FPU per core, we
-  see no benefit from hyper-threading.
+  virtual cores (hyper-threading). As these CPUs only have one FPU per core, the
+  benefit from hyper-threading is limited to a 20\% improvement when going
+  from 16 cores to 32 hyper-threads.
   \label{fig:cosma}}
 \end{figure*}
 
@@ -641,7 +675,8 @@ the 16 node run.
 
 For our next test, we ran \swift on the SuperMUC x86 phase~1 thin
 nodes \footnote{\url{https://www.lrz.de/services/compute/supermuc/systemdescription/}}
-located at the Leibniz Supercomputing Center in Garching near Munich. This
+located at the Leibniz Supercomputing Center in Garching near Munich,
+currently ranked 23rd in the Top500 list\footnote{\url{http://www.top500.org/list/2015/11/}}. This
 system consists of 9\,216 nodes with 2 Intel Sandy Bridge-EP Xeon E5-2680
 8C\footnote{\url{http://ark.intel.com/products/64583/Intel-Xeon-Processor-E5-2680-(20M-Cache-2_70-GHz-8_00-GTs-Intel-QPI)}}
 at $2.7~\rm{GHz}$ CPUS. Each 16-core node has $32~\rm{GByte}$ of RAM.
@@ -681,7 +716,9 @@ threads per node, i.e.~one thread per physical core.
 
 For our last set of tests, we ran \swift on the JUQUEEN IBM Blue Gene/Q
 system\footnote{\url{http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/JUQUEEN/Configuration/Configuration_node.html}}
-located at the J\"ulich Supercomputing Center. This system consists of
+located at the J\"ulich Supercomputing Center,
+currently ranked 11th in the Top500 list.
+This system consists of
 28\,672 nodes with an IBM PowerPC A2 processor running at
 $1.6~\rm{GHz}$ and $16~\rm{GByte}$ of RAM each. Of notable interest
 is the presence of two floating units per compute core. The system is
@@ -764,8 +801,9 @@ course of a time-step. For example, on the SuperMUC machine with 32 nodes (512
 cores), each MPI rank contains approximately $1.6\times10^7$ particles in
 $2.5\times10^5$ cells. \swift will generate around $58\,000$ point-to-point
 asynchronous MPI communications (a pair of \texttt{send} and \texttt{recv} tasks)
-{\em per node} and {\em per timestep}. Such an insane number of messages is
-discouraged by most practitioners.
+{\em per node} and {\em per timestep}. Each one of these communications involves,
+on average, no more than 6\,kB of data. Such an insane number of small messages is
+discouraged by most practitioners, but seems to work well in practice.
 Dispatching communications
 over the course of the calculation and not in short bursts, as is commonly done,
 may also help lower the load on the network.
@@ -791,6 +829,22 @@ ability combined with future work on vectorization of the calculations within
 each task will hopefully make \swift an important tool for future simulations in
 cosmology and help push the entire field to a new level.
 
+These results, which are in no way restricted to astrophysical simulations,
+provide a compelling argument for moving away from the traditional
+branch-and-bound paradigm of both shared and distributed memory programming
+using synchronous MPI and OpenMP.
+Although fully asynchronous methods, due to their somewhat anarchic nature,
+may seem more difficult to control, they are conceptually simple and easy
+to implement\footnote{The \swift source code consists of fewer than 21\,K lines
+of code, of which only about one tenth is needed to implement the task-based
+parallelism and asynchronous communication.}.
+The real cost of using a task-based approach comes from having
+to rethink the entire computation to fit the task-based setting. This may,
+as in our specific case, lead to completely rethinking the underlying
+algorithms and reimplementing a code from scratch.
+Given our results to date, however, we do not believe we will regret this
+investment.
+
 \swift, its documentation, and the test cases presented in this paper are all
 available at the address \web.