diff --git a/.gitignore b/.gitignore
index 8137ea759b24b3f4ec9909a460da4bcb47b0a1ac..9bae25ebff81d077253fd8f1227aad98545d28a0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,15 +25,23 @@ examples/swift_mindt
 examples/swift_mindt_mpi
 examples/swift_mpi
 
-tests/testVectorize
-tests/brute_force.dat
-tests/swift_dopair.dat
+tests/testPair
+tests/brute_force_standard.dat
+tests/swift_dopair_standard.dat
+tests/brute_force_perturbed.dat
+tests/swift_dopair_perturbed.dat
+tests/test27cells
+tests/brute_force_27_standard.dat
+tests/swift_dopair_27_standard.dat
+tests/brute_force_27_perturbed.dat
+tests/swift_dopair_27_perturbed.dat
 tests/testGreetings
 tests/testReading
 tests/input.hdf5
 tests/testSingle
 tests/testTimeIntegration
 tests/testSPHStep
+tests/testParser
 
 theory/latex/swift.pdf
 
diff --git a/examples/main.c b/examples/main.c
index c88f92a07a747c327692b5e0fbbc7dc07b93ac0c..9523af49ed30c54d256d287ea2846a854650bc05 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -55,7 +55,6 @@
  * @brief Main routine that loads a few particles and generates some output.
  *
  */
-
 int main(int argc, char *argv[]) {
 
   int c, icount, periodic = 1;
@@ -79,7 +78,10 @@ int main(int argc, char *argv[]) {
   int nr_nodes = 1, myrank = 0;
   FILE *file_thread;
   int with_outputs = 1;
-  int verbose = 0, talking;
+  int with_external_gravity = 0;
+  int with_self_gravity = 0;
+  int engine_policies = 0;
+  int verbose = 0, talking = 0;
   unsigned long long cpufreq = 0;
 
 #ifdef WITH_MPI
@@ -97,12 +99,15 @@ int main(int argc, char *argv[]) {
 #endif
 #endif
 
-/* Choke on FP-exceptions. */
-// feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );
+  /* Choke on FP-exceptions. */
+  // feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );
+
+  /* Initialize CPU frequency, this also starts time. */
+  clocks_set_cpufreq(cpufreq);
 
 #ifdef WITH_MPI
   /* Start by initializing MPI. */
-  int res, prov;
+  int res = 0, prov = 0;
   if ((res = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &prov)) !=
       MPI_SUCCESS)
     error("Call to MPI_Init failed with error %i.", res);
@@ -128,9 +133,6 @@ int main(int argc, char *argv[]) {
          &initial_partition.grid[1], &initial_partition.grid[0]);
 #endif
 
-  /* Initialize CPU frequency, this also starts time. */
-  clocks_set_cpufreq(cpufreq);
-
   /* Greeting message */
   if (myrank == 0) greetings();
 
@@ -156,7 +158,7 @@ int main(int argc, char *argv[]) {
   bzero(&s, sizeof(struct space));
 
   /* Parse the options */
-  while ((c = getopt(argc, argv, "a:c:d:e:f:h:m:oP:q:R:s:t:v:w:y:z:")) != -1)
+  while ((c = getopt(argc, argv, "a:c:d:e:f:gGh:m:oP:q:R:s:t:v:w:y:z:")) != -1)
     switch (c) {
       case 'a':
         if (sscanf(optarg, "%lf", &scaling) != 1)
@@ -185,6 +187,12 @@ int main(int argc, char *argv[]) {
       case 'f':
         if (!strcpy(ICfileName, optarg)) error("Error parsing IC file name.");
         break;
+      case 'g':
+        with_external_gravity = 1;
+        break;
+      case 'G':
+        with_self_gravity = 1;
+        break;
       case 'h':
         if (sscanf(optarg, "%llu", &cpufreq) != 1)
           error("Error parsing CPU frequency.");
@@ -343,10 +351,6 @@ int main(int argc, char *argv[]) {
     message("CPU frequency used for tick conversion: %llu Hz", cpufreq);
   }
 
-  /* Check we have sensible time step bounds */
-  if (dt_min > dt_max)
-    error("Minimal time step size must be large than maximal time step size ");
-
   /* Check whether an IC file has been provided */
   if (strcmp(ICfileName, "") == 0)
     error("An IC file name must be provided via the option -f");
@@ -356,11 +360,11 @@ int main(int argc, char *argv[]) {
   if (myrank == 0) clocks_gettime(&tic);
 #if defined(WITH_MPI)
 #if defined(HAVE_PARALLEL_HDF5)
-  read_ic_parallel(ICfileName, dim, &parts, &Ngas, &periodic, myrank, nr_nodes,
-                   MPI_COMM_WORLD, MPI_INFO_NULL);
+  read_ic_parallel(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic,
+                   myrank, nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL);
 #else
-  read_ic_serial(ICfileName, dim, &parts, &Ngas, &periodic, myrank, nr_nodes,
-                 MPI_COMM_WORLD, MPI_INFO_NULL);
+  read_ic_serial(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic,
+                 myrank, nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL);
 #endif
 #else
   read_ic_single(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic);
@@ -376,6 +380,7 @@ int main(int argc, char *argv[]) {
 #if defined(WITH_MPI)
   long long N_long[2] = {Ngas, Ngpart};
   MPI_Reduce(&N_long, &N_total, 2, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+  N_total[1] -= N_total[0];
   if (myrank == 0)
     message("Read %lld gas particles and %lld DM particles from the ICs",
             N_total[0], N_total[1]);
@@ -383,8 +388,33 @@ int main(int argc, char *argv[]) {
   N_total[0] = Ngas;
   N_total[1] = Ngpart - Ngas;
   message("Read %lld gas particles and %lld DM particles from the ICs",
-	  N_total[0], N_total[1]);
+          N_total[0], N_total[1]);
+#endif
+
+  /* MATTHIEU: Temporary fix to preserve master */
+  if (!with_external_gravity && !with_self_gravity) {
+    free(gparts);
+    gparts = NULL;
+    for (size_t k = 0; k < Ngas; ++k) parts[k].gpart = NULL;
+    Ngpart = 0;
+#if defined(WITH_MPI)
+    N_long[0] = Ngas;
+    N_long[1] = Ngpart;
+    MPI_Reduce(&N_long, &N_total, 2, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+    if (myrank == 0)
+      message(
+          "AFTER FIX: Read %lld gas particles and %lld DM particles from the "
+          "ICs",
+          N_total[0], N_total[1]);
+#else
+    N_total[0] = Ngas;
+    N_total[1] = Ngpart;
+    message(
+        "AFTER FIX: Read %lld gas particles and %lld DM particles from the ICs",
+        N_total[0], N_total[1]);
 #endif
+  }
+  /* MATTHIEU: End temporary fix */
 
   /* Apply h scaling */
   if (scaling != 1.0)
@@ -448,12 +478,16 @@ int main(int argc, char *argv[]) {
     message("nr of cells at depth %i is %i.", data[0], data[1]);
   }
 
+  /* Construct the engine policy */
+  engine_policies = ENGINE_POLICY | engine_policy_steal | engine_policy_hydro;
+  if (with_external_gravity) engine_policies |= engine_policy_external_gravity;
+  if (with_self_gravity) engine_policies |= engine_policy_self_gravity;
+
   /* Initialize the engine with this space. */
   if (myrank == 0) clocks_gettime(&tic);
   if (myrank == 0) message("nr_nodes is %i.", nr_nodes);
   engine_init(&e, &s, dt_max, nr_threads, nr_queues, nr_nodes, myrank,
-              ENGINE_POLICY | engine_policy_steal | engine_policy_hydro, 0,
-              time_end, dt_min, dt_max, talking);
+              engine_policies, 0, time_end, dt_min, dt_max, talking);
   if (myrank == 0 && verbose) {
     clocks_gettime(&toc);
     message("engine_init took %.3f %s.", clocks_diff(&tic, &toc),
@@ -510,8 +544,8 @@ int main(int argc, char *argv[]) {
   /* Legend */
   if (myrank == 0)
     printf(
-        "# Step  Time  time-step  Number of updates    CPU Wall-clock time "
-        "[%s]\n",
+        "# Step  Time  time-step  Number of updates  Number of updates "
+        "CPU Wall-clock time [%s]\n",
         clocks_getunit());
 
   /* Let loose a runner on the space. */
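
Note: the -g/-G handling above folds the new options into a single bitmask that
is handed to engine_init() further down. A minimal, self-contained sketch of
this flag-combination pattern (the enum values here are illustrative, not
SWIFT's actual engine_policy constants):

#include <stdio.h>

/* Hypothetical policy bits mirroring the pattern used in main.c above. */
enum policy {
  policy_steal = (1 << 0),
  policy_hydro = (1 << 1),
  policy_external_gravity = (1 << 2),
  policy_self_gravity = (1 << 3)
};

int main(void) {
  int with_external_gravity = 1, with_self_gravity = 0;

  /* Start from the base policies, then OR-in the optional ones. */
  int policies = policy_steal | policy_hydro;
  if (with_external_gravity) policies |= policy_external_gravity;
  if (with_self_gravity) policies |= policy_self_gravity;

  /* Individual bits can later be tested with a mask. */
  printf("external gravity on: %d\n",
         (policies & policy_external_gravity) != 0);
  return 0;
}
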
diff --git a/src/Makefile.am b/src/Makefile.am
index f44d47819672d10445fd969fe2ff20dbcb49463b..15c05a2a00d33ad86e7144b4a8e377252a2eedce 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -35,13 +35,13 @@ endif
 # List required headers
 include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
     engine.h swift.h serial_io.h timers.h debug.h scheduler.h proxy.h parallel_io.h \
-    common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h
+    common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h
 
 # Common source files
 AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
     serial_io.c timers.c debug.c scheduler.c proxy.c parallel_io.c \
     units.c common_io.c single_io.c multipole.c version.c map.c \
-    kernel.c tools.c part.c partition.c clocks.c
+    kernel.c tools.c part.c partition.c clocks.c parser.c
 
 # Include files for distribution, not installation.
 nobase_noinst_HEADERS = approx_math.h atomic.h cycle.h error.h inline.h kernel.h vector.h \
diff --git a/src/cell.c b/src/cell.c
index 696f53069b9974c94f8b25e10f7dcba81fae8069..31a632a5b40a7706eeef6accc385d57e27f0f247 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -45,6 +45,7 @@
 /* Local headers. */
 #include "atomic.h"
 #include "error.h"
+#include "gravity.h"
 #include "hydro.h"
 #include "space.h"
 #include "timers.h"
@@ -89,14 +90,18 @@ int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) {
   c->ti_end_min = pc->ti_end_min;
   c->ti_end_max = pc->ti_end_max;
   c->count = pc->count;
+  c->gcount = pc->gcount;
   c->tag = pc->tag;
 
-  /* Fill the progeny recursively, depth-first. */
+  /* Number of new cells created. */
   int count = 1;
+
+  /* Fill the progeny recursively, depth-first. */
   for (int k = 0; k < 8; k++)
     if (pc->progeny[k] >= 0) {
       struct cell *temp = space_getcell(s);
       temp->count = 0;
+      temp->gcount = 0;
       temp->loc[0] = c->loc[0];
       temp->loc[1] = c->loc[1];
       temp->loc[2] = c->loc[2];
@@ -122,7 +127,7 @@ int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) {
 }
 
 /**
- * @brief Link the cells recursively to the given part array.
+ * @brief Link the cells recursively to the given #part array.
  *
  * @param c The #cell.
  * @param parts The #part array.
@@ -130,7 +135,7 @@ int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) {
  * @return The number of particles linked.
  */
 
-int cell_link(struct cell *c, struct part *parts) {
+int cell_link_parts(struct cell *c, struct part *parts) {
 
   c->parts = parts;
 
@@ -139,14 +144,40 @@ int cell_link(struct cell *c, struct part *parts) {
     int offset = 0;
     for (int k = 0; k < 8; k++) {
       if (c->progeny[k] != NULL)
-        offset += cell_link(c->progeny[k], &parts[offset]);
+        offset += cell_link_parts(c->progeny[k], &parts[offset]);
     }
   }
 
-  /* Return the total number of unpacked cells. */
+  /* Return the total number of linked particles. */
   return c->count;
 }
 
+/**
+ * @brief Link the cells recursively to the given #gpart array.
+ *
+ * @param c The #cell.
+ * @param gparts The #gpart array.
+ *
+ * @return The number of particles linked.
+ */
+
+int cell_link_gparts(struct cell *c, struct gpart *gparts) {
+
+  c->gparts = gparts;
+
+  /* Fill the progeny recursively, depth-first. */
+  if (c->split) {
+    int offset = 0;
+    for (int k = 0; k < 8; k++) {
+      if (c->progeny[k] != NULL)
+        offset += cell_link_gparts(c->progeny[k], &gparts[offset]);
+    }
+  }
+
+  /* Return the total number of linked particles. */
+  return c->gcount;
+}
+
 /**
  * @brief Pack the data of the given cell and all it's sub-cells.
  *
@@ -164,6 +195,7 @@ int cell_pack(struct cell *c, struct pcell *pc) {
   pc->ti_end_min = c->ti_end_min;
   pc->ti_end_max = c->ti_end_max;
   pc->count = c->count;
+  pc->gcount = c->gcount;
   c->tag = pc->tag = atomic_inc(&cell_next_tag) % cell_max_tag;
 
   /* Fill in the progeny, depth-first recursion. */
@@ -574,6 +606,27 @@ void cell_init_parts(struct cell *c, void *data) {
   c->ti_end_max = 0;
 }
 
+/**
+ * @brief Initialises all g-particles to a valid state even if the ICs were
+ * stupid
+ *
+ * @param c Cell to act upon
+ * @param data Unused parameter
+ */
+void cell_init_gparts(struct cell *c, void *data) {
+
+  struct gpart *gp = c->gparts;
+  const int gcount = c->gcount;
+
+  for (int i = 0; i < gcount; ++i) {
+    gp[i].ti_begin = 0;
+    gp[i].ti_end = 0;
+    gravity_first_init_gpart(&gp[i]);
+  }
+  c->ti_end_min = 0;
+  c->ti_end_max = 0;
+}
+
 /**
  * @brief Converts hydro quantities to a valid state after the initial density
  *calculation
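
Note: cell_link_parts() and cell_link_gparts() above both rely on the
particles of a cell's progeny being stored contiguously, so each child is
handed a pointer offset by the counts of its earlier siblings. A
self-contained sketch of that depth-first prefix-offset recursion (simplified
node type, not SWIFT's struct cell):

/* Simplified octree node; count is the number of particles the node owns. */
struct node {
  int split, count;
  struct node *progeny[8];
  double *parts; /* start of this node's slice of the global array */
};

/* Hand each child the sub-array that starts after its earlier siblings. */
static int link_parts(struct node *c, double *parts) {
  c->parts = parts;
  if (c->split) {
    int offset = 0;
    for (int k = 0; k < 8; k++)
      if (c->progeny[k] != NULL)
        offset += link_parts(c->progeny[k], &parts[offset]);
  }
  return c->count; /* total number of particles linked below this node */
}

The same walk is duplicated for gparts because the part and gpart arrays are
laid out independently of each other.
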
diff --git a/src/cell.h b/src/cell.h
index 857aa9282930fea330df03992ae140f97ae0f630..8b65fa1904a4aa407a15bc30954651dc5c4e29e5 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -47,7 +47,7 @@ struct pcell {
   int ti_end_min, ti_end_max;
 
   /* Number of particles in this cell. */
-  int count;
+  int count, gcount;
 
   /* tag used for MPI communication. */
   int tag;
@@ -144,7 +144,7 @@ struct cell {
   double mass, e_pot, e_int, e_kin;
 
   /* Number of particles updated in this cell. */
-  int updated;
+  int updated, g_updated;
 
   /* Linking pointer for "memory management". */
   struct cell *next;
@@ -178,8 +178,10 @@ void cell_gunlocktree(struct cell *c);
 int cell_pack(struct cell *c, struct pcell *pc);
 int cell_unpack(struct pcell *pc, struct cell *c, struct space *s);
 int cell_getsize(struct cell *c);
-int cell_link(struct cell *c, struct part *parts);
+int cell_link_parts(struct cell *c, struct part *parts);
+int cell_link_gparts(struct cell *c, struct gpart *gparts);
 void cell_init_parts(struct cell *c, void *data);
+void cell_init_gparts(struct cell *c, void *data);
 void cell_convert_hydro(struct cell *c, void *data);
 void cell_clean_links(struct cell *c, void *data);
 
diff --git a/src/common_io.c b/src/common_io.c
index b3d24aec402fc1cc38255239c60e3e630f33b051..9e162bc350f13b543a471927d3a4720a43a295d2 100644
--- a/src/common_io.c
+++ b/src/common_io.c
@@ -45,6 +45,9 @@
 #include "kernel.h"
 #include "version.h"
 
+const char* particle_type_names[NUM_PARTICLE_TYPES] = {
+    "Gas", "DM", "Boundary", "Dummy", "Star", "BH"};
+
 /**
  * @brief Converts a C data type to the HDF5 equivalent.
  *
@@ -402,52 +405,68 @@ void createXMFfile() {
  *snapshot
  *
  * @param xmfFile The file to write in.
- * @param Nparts The number of particles.
  * @param hdfFileName The name of the HDF5 file corresponding to this output.
  * @param time The current simulation time.
  */
-void writeXMFheader(FILE* xmfFile, long long Nparts, char* hdfFileName,
-                    float time) {
+void writeXMFoutputheader(FILE* xmfFile, char* hdfFileName, float time) {
   /* Write end of file */
 
+  fprintf(xmfFile, "<!-- XMF description for file: %s -->\n", hdfFileName);
   fprintf(xmfFile,
           "<Grid GridType=\"Collection\" CollectionType=\"Spatial\">\n");
   fprintf(xmfFile, "<Time Type=\"Single\" Value=\"%f\"/>\n", time);
-  fprintf(xmfFile, "<Grid Name=\"Gas\" GridType=\"Uniform\">\n");
-  fprintf(xmfFile,
-          "<Topology TopologyType=\"Polyvertex\" Dimensions=\"%lld\"/>\n",
-          Nparts);
-  fprintf(xmfFile, "<Geometry GeometryType=\"XYZ\">\n");
-  fprintf(xmfFile,
-          "<DataItem Dimensions=\"%lld 3\" NumberType=\"Double\" "
-          "Precision=\"8\" "
-          "Format=\"HDF\">%s:/PartType0/Coordinates</DataItem>\n",
-          Nparts, hdfFileName);
-  fprintf(xmfFile, "</Geometry>");
 }
 
 /**
  * @brief Writes the end of the XMF file (closes all open markups)
  *
  * @param xmfFile The file to write in.
+ * @param output The number of this output.
+ * @param time The current simulation time.
  */
-void writeXMFfooter(FILE* xmfFile) {
+void writeXMFoutputfooter(FILE* xmfFile, int output, float time) {
   /* Write end of the section of this time step */
 
-  fprintf(xmfFile, "\n</Grid>\n");
-  fprintf(xmfFile, "</Grid>\n");
-  fprintf(xmfFile, "\n</Grid>\n");
+  fprintf(xmfFile,
+          "\n</Grid> <!-- End of meta-data for output=%03i, time=%f -->\n",
+          output, time);
+  fprintf(xmfFile, "\n</Grid> <!-- timeSeries -->\n");
   fprintf(xmfFile, "</Domain>\n");
   fprintf(xmfFile, "</Xdmf>\n");
 
   fclose(xmfFile);
 }
 
+void writeXMFgroupheader(FILE* xmfFile, char* hdfFileName, size_t N,
+                         enum PARTICLE_TYPE ptype) {
+  fprintf(xmfFile, "\n<Grid Name=\"%s\" GridType=\"Uniform\">\n",
+          particle_type_names[ptype]);
+  fprintf(xmfFile,
+          "<Topology TopologyType=\"Polyvertex\" Dimensions=\"%zi\"/>\n", N);
+  fprintf(xmfFile, "<Geometry GeometryType=\"XYZ\">\n");
+  fprintf(xmfFile,
+          "<DataItem Dimensions=\"%zi 3\" NumberType=\"Double\" "
+          "Precision=\"8\" "
+          "Format=\"HDF\">%s:/PartType%d/Coordinates</DataItem>\n",
+          N, hdfFileName, ptype);
+  fprintf(xmfFile,
+          "</Geometry>\n <!-- Done geometry for %s, start of particle fields "
+          "list -->\n",
+          particle_type_names[ptype]);
+}
+
+void writeXMFgroupfooter(FILE* xmfFile, enum PARTICLE_TYPE ptype) {
+  fprintf(xmfFile, "</Grid> <!-- End of meta-data for parttype=%s -->\n",
+          particle_type_names[ptype]);
+}
+
 /**
  * @brief Writes the lines corresponding to an array of the HDF5 output
  *
  * @param xmfFile The file in which to write
  * @param fileName The name of the HDF5 file associated to this XMF descriptor.
+ * @param partTypeGroupName The name of the group containing the particles in
+ *        the HDF5 file.
  * @param name The name of the array in the HDF5 file.
  * @param N The number of particles.
  * @param dim The dimension of the quantity (1 for scalars, 3 for vectors).
@@ -455,21 +474,21 @@ void writeXMFfooter(FILE* xmfFile) {
  *
  * @todo Treat the types in a better way.
  */
-void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N,
-                  int dim, enum DATA_TYPE type) {
+void writeXMFline(FILE* xmfFile, char* fileName, char* partTypeGroupName,
+                  char* name, size_t N, int dim, enum DATA_TYPE type) {
   fprintf(xmfFile,
           "<Attribute Name=\"%s\" AttributeType=\"%s\" Center=\"Node\">\n",
           name, dim == 1 ? "Scalar" : "Vector");
   if (dim == 1)
     fprintf(xmfFile,
-            "<DataItem Dimensions=\"%lld\" NumberType=\"Double\" "
-            "Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n",
-            N, type == FLOAT ? 4 : 8, fileName, name);
+            "<DataItem Dimensions=\"%zi\" NumberType=\"Double\" "
+            "Precision=\"%d\" Format=\"HDF\">%s:%s/%s</DataItem>\n",
+            N, type == FLOAT ? 4 : 8, fileName, partTypeGroupName, name);
   else
     fprintf(xmfFile,
-            "<DataItem Dimensions=\"%lld %d\" NumberType=\"Double\" "
-            "Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n",
-            N, dim, type == FLOAT ? 4 : 8, fileName, name);
+            "<DataItem Dimensions=\"%zi %d\" NumberType=\"Double\" "
+            "Precision=\"%d\" Format=\"HDF\">%s:%s/%s</DataItem>\n",
+            N, dim, type == FLOAT ? 4 : 8, fileName, partTypeGroupName, name);
   fprintf(xmfFile, "</Attribute>\n");
 }
 
@@ -483,14 +502,13 @@ void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N,
  * @param gparts The array of #gpart freshly read in.
  * @param Ndm The number of DM particles read in.
  */
-void prepare_dm_gparts(struct gpart* gparts, size_t Ndm) {
+void prepare_dm_gparts(struct gpart* const gparts, size_t Ndm) {
 
   /* Let's give all these gparts a negative id */
   for (size_t i = 0; i < Ndm; ++i) {
-
     /* 0 or negative ids are not allowed */
     if (gparts[i].id_or_neg_offset <= 0)
-      error("0 or negative ID for DM particle");
+      error("0 or negative ID for DM particle %zd: ID=%lld", i, gparts[i].id_or_neg_offset);
   }
 }
 
@@ -506,8 +524,9 @@ void prepare_dm_gparts(struct gpart* gparts, size_t Ndm) {
  * @param Ngas The number of gas particles read in.
  * @param Ndm The number of DM particles read in.
  */
-void duplicate_hydro_gparts(struct part* parts, struct gpart* gparts,
-                            size_t Ngas, size_t Ndm) {
+void duplicate_hydro_gparts(struct part* const parts,
+                            struct gpart* const gparts, size_t Ngas,
+                            size_t Ndm) {
 
   for (size_t i = 0; i < Ngas; ++i) {
 
@@ -536,14 +555,17 @@ void duplicate_hydro_gparts(struct part* parts, struct gpart* gparts,
  * @param dmparts The array of #gpart containg DM particles to be filled.
  * @param Ndm The number of DM particles.
  */
-void collect_dm_gparts(struct gpart* gparts, size_t Ntot, struct gpart* dmparts,
-                       size_t Ndm) {
+void collect_dm_gparts(const struct gpart* const gparts, size_t Ntot,
+                       struct gpart* const dmparts, size_t Ndm) {
 
   size_t count = 0;
 
   /* Loop over all gparts */
   for (size_t i = 0; i < Ntot; ++i) {
 
+    /* message("i=%zd count=%zd id=%lld part=%p", i, count, gparts[i].id,
+     * gparts[i].part); */
+
     /* And collect the DM ones */
     if (gparts[i].id_or_neg_offset > 0) {
       dmparts[count] = gparts[i];
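
Note: the id_or_neg_offset field used throughout common_io.c does double duty:
a DM gpart keeps its (strictly positive) particle ID, while a gpart that
mirrors a gas particle stores the negated index of its partner in the parts
array. A minimal sketch of the duplication step under that convention (struct
fields abbreviated; the real duplicate_hydro_gparts copies more fields than
shown):

#include <stddef.h>

struct gpart { long long id_or_neg_offset; double x[3]; };
struct part { long long id; double x[3]; struct gpart *gpart; };

/* Append one gpart per gas particle after the Ndm DM gparts, linking both
   directions: a pointer one way and a negated array index the other. */
static void link_gas_gparts(struct part *parts, struct gpart *gparts,
                            size_t Ngas, size_t Ndm) {
  for (size_t i = 0; i < Ngas; ++i) {
    gparts[Ndm + i].x[0] = parts[i].x[0];
    gparts[Ndm + i].x[1] = parts[i].x[1];
    gparts[Ndm + i].x[2] = parts[i].x[2];
    gparts[Ndm + i].id_or_neg_offset = -(long long)i; /* <= 0 means linked */
    parts[i].gpart = &gparts[Ndm + i];
  }
}
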
diff --git a/src/common_io.h b/src/common_io.h
index 2623a03f9a25ce0e650dde4f698da6eb49177e26..961f40e63d771e5e06ade525301caf59aae0bceb 100644
--- a/src/common_io.h
+++ b/src/common_io.h
@@ -70,14 +70,20 @@ enum PARTICLE_TYPE {
   NUM_PARTICLE_TYPES
 };
 
+extern const char* particle_type_names[];
+
+#define FILENAME_BUFFER_SIZE 150
+#define PARTICLE_GROUP_BUFFER_SIZE 20
+
 hid_t hdf5Type(enum DATA_TYPE type);
 size_t sizeOfType(enum DATA_TYPE type);
 
-void collect_dm_gparts(struct gpart* gparts, size_t Ntot, struct gpart* dmparts,
-                       size_t Ndm);
-void prepare_dm_gparts(struct gpart* gparts, size_t Ndm);
-void duplicate_hydro_gparts(struct part* parts, struct gpart* gparts,
-                            size_t Ngas, size_t Ndm);
+void collect_dm_gparts(const struct gpart* const gparts, size_t Ntot,
+                       struct gpart* const dmparts, size_t Ndm);
+void prepare_dm_gparts(struct gpart* const gparts, size_t Ndm);
+void duplicate_hydro_gparts(struct part* const parts,
+                            struct gpart* const gparts, size_t Ngas,
+                            size_t Ndm);
 
 void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data);
 
@@ -92,10 +98,13 @@ void writeAttribute_s(hid_t grp, char* name, const char* str);
 
 void createXMFfile();
 FILE* prepareXMFfile();
-void writeXMFfooter(FILE* xmfFile);
-void writeXMFheader(FILE* xmfFile, long long N, char* hdfFileName, float time);
-void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N,
-                  int dim, enum DATA_TYPE type);
+void writeXMFoutputheader(FILE* xmfFile, char* hdfFileName, float time);
+void writeXMFoutputfooter(FILE* xmfFile, int outputCount, float time);
+void writeXMFgroupheader(FILE* xmfFile, char* hdfFileName, size_t N,
+                         enum PARTICLE_TYPE ptype);
+void writeXMFgroupfooter(FILE* xmfFile, enum PARTICLE_TYPE ptype);
+void writeXMFline(FILE* xmfFile, char* fileName, char* partTypeGroupName,
+                  char* name, size_t N, int dim, enum DATA_TYPE type);
 
 void writeCodeDescription(hid_t h_file);
 void writeSPHflavour(hid_t h_file);
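
Note: the split of the old writeXMFheader/writeXMFfooter pair into
output-level and per-particle-type group-level functions implies a fixed
nesting order when a snapshot's meta-data is written. A hypothetical caller,
assuming only the declarations above (error handling and the real field list
omitted; "Masses" is an illustrative array name):

/* Sketch of the expected call sequence: one group per non-empty type. */
void write_xmf_for_snapshot(FILE* xmfFile, char* hdfFileName, float time,
                            int output, size_t N[NUM_PARTICLE_TYPES]) {
  writeXMFoutputheader(xmfFile, hdfFileName, time);
  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
    if (N[ptype] == 0) continue; /* skip empty particle types */
    char groupName[PARTICLE_GROUP_BUFFER_SIZE];
    snprintf(groupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d", ptype);
    writeXMFgroupheader(xmfFile, hdfFileName, N[ptype], ptype);
    /* ...one writeXMFline() call per particle array, e.g.: */
    writeXMFline(xmfFile, hdfFileName, groupName, "Masses", N[ptype], 1,
                 FLOAT);
    writeXMFgroupfooter(xmfFile, ptype);
  }
  writeXMFoutputfooter(xmfFile, output, time);
}
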
diff --git a/src/engine.c b/src/engine.c
index b7658535335bd02d309c9cf69da61ffcc2f6c160..4cd33ef20f5aa040d075eb76ac07db5413062f7b 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -139,39 +139,56 @@ void engine_make_ghost_tasks(struct engine *e, struct cell *c,
  * @brief Redistribute the particles amongst the nodes according
  *      to their cell's node IDs.
  *
+ * The strategy here is as follows:
+ * 1) Each node counts the number of particles it has to send to each other
+ * node.
+ * 2) The number of particles of each type is then exchanged.
+ * 3) The particles to send are placed in a temporary buffer in which the
+ * part-gpart links are preserved.
+ * 4) Each node allocates enough space for the new particles.
+ * 5) (Asynchronous) communications are issued to transfer the data.
+ *
+ *
  * @param e The #engine.
  */
-
 void engine_redistribute(struct engine *e) {
 
 #ifdef WITH_MPI
 
-  int nr_nodes = e->nr_nodes, nodeID = e->nodeID;
+  const int nr_nodes = e->nr_nodes;
+  const int nodeID = e->nodeID;
   struct space *s = e->s;
-  int my_cells = 0;
-  int *cdim = s->cdim;
   struct cell *cells = s->cells;
-  int nr_cells = s->nr_cells;
+  const int nr_cells = s->nr_cells;
+  const int *cdim = s->cdim;
+  const double ih[3] = {s->ih[0], s->ih[1], s->ih[2]};
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
+  struct part *parts = s->parts;
+  struct xpart *xparts = s->xparts;
+  struct gpart *gparts = s->gparts;
   ticks tic = getticks();
 
-  /* Start by sorting the particles according to their nodes and
-     getting the counts. The counts array is indexed as
-     count[from * nr_nodes + to]. */
-  int *counts;
-  size_t *dest;
-  double ih[3], dim[3];
-  ih[0] = s->ih[0];
-  ih[1] = s->ih[1];
-  ih[2] = s->ih[2];
-  dim[0] = s->dim[0];
-  dim[1] = s->dim[1];
-  dim[2] = s->dim[2];
-  if ((counts = (int *)malloc(sizeof(int) *nr_nodes *nr_nodes)) == NULL ||
-      (dest = (size_t *)malloc(sizeof(size_t) * s->nr_parts)) == NULL)
-    error("Failed to allocate count and dest buffers.");
+  /* Allocate temporary arrays to store the counts of particles to be sent
+     and the destination of each particle */
+  int *counts, *g_counts;
+  if ((counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
+    error("Failed to allocate count temporary buffer.");
+  if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
+    error("Failed to allocate gcount temporary buffer.");
   bzero(counts, sizeof(int) * nr_nodes * nr_nodes);
-  struct part *parts = s->parts;
+  bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);
+
+  // Allocate the destination index arrays.
+  int *dest, *g_dest;
+  if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
+    error("Failed to allocate dest temporary buffer.");
+  if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
+    error("Failed to allocate g_dest temporary buffer.");
+
+  /* Get destination of each particle */
   for (size_t k = 0; k < s->nr_parts; k++) {
+
+    /* Periodic boundary conditions */
     for (int j = 0; j < 3; j++) {
       if (parts[k].x[j] < 0.0)
         parts[k].x[j] += dim[j];
@@ -184,36 +201,121 @@ void engine_redistribute(struct engine *e) {
        error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].",
              cid, k, parts[k].x[0], parts[k].x[1], parts[k].x[2]); */
     dest[k] = cells[cid].nodeID;
+
+    /* The counts array is indexed as count[from * nr_nodes + to]. */
     counts[nodeID * nr_nodes + dest[k]] += 1;
   }
+
+  /* Sort the particles according to their cell index. */
   space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1, e->verbose);
 
+  /* We need to re-link the gpart partners of parts. */
+  int current_dest = dest[0];
+  size_t count_this_dest = 0;
+  for (size_t k = 0; k < s->nr_parts; ++k) {
+    if (s->parts[k].gpart != NULL) {
+
+      /* As the addresses will be invalidated by the communications, we will */
+      /* instead store the absolute index from the start of the sub-array */
+      /* of particles to be sent to a given node. */
+      /* Recall that gparts without partners keep a strictly positive id. */
+      /* We will restore the pointers on the receiving node later on. */
+      if (dest[k] != current_dest) {
+        current_dest = dest[k];
+        count_this_dest = 0;
+      }
+
+      /* Debug */
+      /* if (s->parts[k].gpart->id_or_neg_offset > 0) */
+      /*   error("Trying to link a partnerless gpart !"); */
+
+      s->parts[k].gpart->id_or_neg_offset = -count_this_dest;
+      count_this_dest++;
+    }
+  }
+
+  /* Get destination of each g-particle */
+  for (size_t k = 0; k < s->nr_gparts; k++) {
+
+    /* Periodic boundary conditions */
+    for (int j = 0; j < 3; j++) {
+      if (gparts[k].x[j] < 0.0)
+        gparts[k].x[j] += dim[j];
+      else if (gparts[k].x[j] >= dim[j])
+        gparts[k].x[j] -= dim[j];
+    }
+    const int cid = cell_getid(cdim, gparts[k].x[0] * ih[0],
+                               gparts[k].x[1] * ih[1], gparts[k].x[2] * ih[2]);
+    /* if (cid < 0 || cid >= s->nr_cells)
+       error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].",
+             cid, k, gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]); */
+    g_dest[k] = cells[cid].nodeID;
+
+    /* The counts array is indexed as count[from * nr_nodes + to]. */
+    g_counts[nodeID * nr_nodes + g_dest[k]] += 1;
+  }
+
+  /* Sort the gparticles according to their cell index. */
+  space_gparts_sort(gparts, g_dest, s->nr_gparts, 0, nr_nodes - 1);
+
   /* Get all the counts from all the nodes. */
   if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
                     MPI_COMM_WORLD) != MPI_SUCCESS)
     error("Failed to allreduce particle transfer counts.");
 
-  /* Get the new number of parts for this node, be generous in allocating. */
-  size_t nr_parts = 0;
+  /* Get all the g_counts from all the nodes. */
+  if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
+                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
+    error("Failed to allreduce gparticle transfer counts.");
+
+  /* Each node knows how many parts and gparts will be transferred to every
+     other node. We can start preparing to receive data */
+
+  /* Get the new number of parts and gparts for this node */
+  size_t nr_parts = 0, nr_gparts = 0;
   for (int k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID];
+  for (int k = 0; k < nr_nodes; k++)
+    nr_gparts += g_counts[k * nr_nodes + nodeID];
+
+  /* Allocate the new arrays with some extra margin */
   struct part *parts_new = NULL;
-  struct xpart *xparts_new = NULL, *xparts = s->xparts;
+  struct xpart *xparts_new = NULL;
+  struct gpart *gparts_new = NULL;
   if (posix_memalign((void **)&parts_new, part_align,
-                     sizeof(struct part) * nr_parts * 1.2) != 0 ||
-      posix_memalign((void **)&xparts_new, part_align,
-                     sizeof(struct xpart) * nr_parts * 1.2) != 0)
+                     sizeof(struct part) * nr_parts *
+                         engine_redistribute_alloc_margin) != 0)
     error("Failed to allocate new part data.");
-
-  /* Emit the sends and recvs for the particle data. */
+  if (posix_memalign((void **)&xparts_new, xpart_align,
+                     sizeof(struct xpart) * nr_parts *
+                         engine_redistribute_alloc_margin) != 0)
+    error("Failed to allocate new xpart data.");
+  if (posix_memalign((void **)&gparts_new, gpart_align,
+                     sizeof(struct gpart) * nr_gparts *
+                         engine_redistribute_alloc_margin) != 0)
+    error("Failed to allocate new gpart data.");
+
+  /* Prepare MPI requests for the asynchronous communications */
   MPI_Request *reqs;
-  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 4 * nr_nodes)) ==
+  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 6 * nr_nodes)) ==
       NULL)
     error("Failed to allocate MPI request list.");
-  for (int k = 0; k < 4 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;
-  for (size_t offset_send = 0, offset_recv = 0, k = 0; k < nr_nodes; k++) {
-    int ind_send = nodeID * nr_nodes + k;
-    int ind_recv = k * nr_nodes + nodeID;
+  for (int k = 0; k < 6 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;
+
+  /* Emit the sends and recvs for the particle and gparticle data. */
+  size_t offset_send = 0, offset_recv = 0;
+  size_t g_offset_send = 0, g_offset_recv = 0;
+  for (int k = 0; k < nr_nodes; k++) {
+
+    /* Indices in the count arrays of the node of interest */
+    const int ind_send = nodeID * nr_nodes + k;
+    const int ind_recv = k * nr_nodes + nodeID;
+
+    /* Are we sending any part/xpart ? */
     if (counts[ind_send] > 0) {
+
+      /* message("Sending %d part to node %d", counts[ind_send], k); */
+
+      /* If the send is to the same node, just copy */
       if (k == nodeID) {
         memcpy(&parts_new[offset_recv], &s->parts[offset_send],
                sizeof(struct part) * counts[ind_recv]);
@@ -221,36 +323,73 @@ void engine_redistribute(struct engine *e) {
                sizeof(struct xpart) * counts[ind_recv]);
         offset_send += counts[ind_send];
         offset_recv += counts[ind_recv];
+
+        /* Else, emit some communications */
       } else {
-        if (MPI_Isend(&s->parts[offset_send], counts[ind_send],
-                      e->part_mpi_type, k, 2 * ind_send + 0, MPI_COMM_WORLD,
-                      &reqs[4 * k]) != MPI_SUCCESS)
-          error("Failed to isend parts to node %zi.", k);
-        if (MPI_Isend(&s->xparts[offset_send], counts[ind_send],
-                      e->xpart_mpi_type, k, 2 * ind_send + 1, MPI_COMM_WORLD,
-                      &reqs[4 * k + 1]) != MPI_SUCCESS)
-          error("Failed to isend xparts to node %zi.", k);
+        if (MPI_Isend(&s->parts[offset_send], counts[ind_send], part_mpi_type,
+                      k, 3 * ind_send + 0, MPI_COMM_WORLD,
+                      &reqs[6 * k]) != MPI_SUCCESS)
+          error("Failed to isend parts to node %i.", k);
+        if (MPI_Isend(&s->xparts[offset_send], counts[ind_send], xpart_mpi_type,
+                      k, 3 * ind_send + 1, MPI_COMM_WORLD,
+                      &reqs[6 * k + 1]) != MPI_SUCCESS)
+          error("Failed to isend xparts to node %i.", k);
         offset_send += counts[ind_send];
       }
     }
+
+    /* Are we sending any gpart ? */
+    if (g_counts[ind_send] > 0) {
+
+      /* message("Sending %d gpart to node %d", g_counts[ind_send], k); */
+
+      /* If the send is to the same node, just copy */
+      if (k == nodeID) {
+        memcpy(&gparts_new[g_offset_recv], &s->gparts[g_offset_send],
+               sizeof(struct gpart) * g_counts[ind_recv]);
+        g_offset_send += g_counts[ind_send];
+        g_offset_recv += g_counts[ind_recv];
+
+        /* Else, emit some communications */
+      } else {
+        if (MPI_Isend(&s->gparts[g_offset_send], g_counts[ind_send],
+                      gpart_mpi_type, k, 3 * ind_send + 2, MPI_COMM_WORLD,
+                      &reqs[6 * k + 2]) != MPI_SUCCESS)
+          error("Failed to isend gparts to node %i.", k);
+        g_offset_send += g_counts[ind_send];
+      }
+    }
+
+    /* Now emit the corresponding Irecv() */
+
+    /* Are we receiving any part/xpart from this node ? */
     if (k != nodeID && counts[ind_recv] > 0) {
-      if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], e->part_mpi_type,
-                    k, 2 * ind_recv + 0, MPI_COMM_WORLD,
-                    &reqs[4 * k + 2]) != MPI_SUCCESS)
-        error("Failed to emit irecv of parts from node %zi.", k);
-      if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv],
-                    e->xpart_mpi_type, k, 2 * ind_recv + 1, MPI_COMM_WORLD,
-                    &reqs[4 * k + 3]) != MPI_SUCCESS)
-        error("Failed to emit irecv of parts from node %zi.", k);
+      if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], part_mpi_type, k,
+                    3 * ind_recv + 0, MPI_COMM_WORLD,
+                    &reqs[6 * k + 3]) != MPI_SUCCESS)
+        error("Failed to emit irecv of parts from node %i.", k);
+      if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv], xpart_mpi_type,
+                    k, 3 * ind_recv + 1, MPI_COMM_WORLD,
+                    &reqs[6 * k + 4]) != MPI_SUCCESS)
+        error("Failed to emit irecv of xparts from node %i.", k);
       offset_recv += counts[ind_recv];
     }
+
+    /* Are we receiving any gpart from this node ? */
+    if (k != nodeID && g_counts[ind_recv] > 0) {
+      if (MPI_Irecv(&gparts_new[g_offset_recv], g_counts[ind_recv],
+                    gpart_mpi_type, k, 3 * ind_recv + 2, MPI_COMM_WORLD,
+                    &reqs[6 * k + 5]) != MPI_SUCCESS)
+        error("Failed to emit irecv of gparts from node %i.", k);
+      g_offset_recv += g_counts[ind_recv];
+    }
   }
 
   /* Wait for all the sends and recvs to tumble in. */
-  MPI_Status stats[4 * nr_nodes];
+  MPI_Status stats[6 * nr_nodes];
   int res;
-  if ((res = MPI_Waitall(4 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
-    for (int k = 0; k < 4 * nr_nodes; k++) {
+  if ((res = MPI_Waitall(6 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
+    for (int k = 0; k < 6 * nr_nodes; k++) {
       char buff[MPI_MAX_ERROR_STRING];
       int res;
       MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
@@ -259,35 +398,92 @@ void engine_redistribute(struct engine *e) {
     error("Failed during waitall for part data.");
   }
 
+  /* We now need to restore the part<->gpart links */
+  size_t offset_parts = 0, offset_gparts = 0;
+  for (int node = 0; node < nr_nodes; ++node) {
+
+    const int ind_recv = node * nr_nodes + nodeID;
+    const size_t count_parts = counts[ind_recv];
+    const size_t count_gparts = g_counts[ind_recv];
+
+    /* Loop over the gparts received from that node */
+    for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) {
+
+      /* Does this gpart have a partner ? */
+      if (gparts_new[k].id_or_neg_offset <= 0) {
+
+        const size_t partner_index =
+            offset_parts - gparts_new[k].id_or_neg_offset;
+
+        /* Re-link */
+        gparts_new[k].id_or_neg_offset = -partner_index;
+        parts_new[partner_index].gpart = &gparts_new[k];
+      }
+    }
+
+    offset_parts += count_parts;
+    offset_gparts += count_gparts;
+  }
+
   /* Verify that all parts are in the right place. */
-  /* for ( k = 0 ; k < nr_parts ; k++ ) {
-      cid = cell_getid( cdim , parts_new[k].x[0]*ih[0] , parts_new[k].x[1]*ih[1]
-     , parts_new[k].x[2]*ih[2] );
+  /* for ( int k = 0 ; k < nr_parts ; k++ ) {
+      int cid = cell_getid( cdim , parts_new[k].x[0]*ih[0],
+    parts_new[k].x[1]*ih[1], parts_new[k].x[2]*ih[2] );
       if ( cells[ cid ].nodeID != nodeID )
-          error( "Received particle (%i) that does not belong here (nodeID=%i)."
-     , k , cells[ cid ].nodeID );
-      } */
+          error( "Received particle (%i) that does not belong here
+    (nodeID=%i).", k , cells[ cid ].nodeID );
+    } */
+
+  /* Verify that the links are correct */
+  /* MATTHIEU: To be commented out once we are happy */
+  for (size_t k = 0; k < nr_gparts; ++k) {
+
+    if (gparts_new[k].id_or_neg_offset <= 0) {
+
+      struct part *part = &parts_new[-gparts_new[k].id_or_neg_offset];
+
+      if (part->gpart != &gparts_new[k])
+        error("Linking problem !");
+
+      if (gparts_new[k].x[0] != part->x[0] ||
+          gparts_new[k].x[1] != part->x[1] ||
+          gparts_new[k].x[2] != part->x[2])
+        error("Linked particles are not at the same position !");
+    }
+  }
+  for (size_t k = 0; k < nr_parts; ++k) {
+
+    if (parts_new[k].gpart != NULL &&
+        parts_new[k].gpart->id_or_neg_offset != -k) {
+        error("Linking problem !");
+    }
+  }
 
   /* Set the new part data, free the old. */
   free(parts);
   free(xparts);
+  free(gparts);
   s->parts = parts_new;
   s->xparts = xparts_new;
+  s->gparts = gparts_new;
   s->nr_parts = nr_parts;
-  s->size_parts = 1.2 * nr_parts;
+  s->nr_gparts = nr_gparts;
+  s->size_parts = engine_redistribute_alloc_margin * nr_parts;
+  s->size_gparts = engine_redistribute_alloc_margin * nr_gparts;
 
-  /* Be verbose about what just happened. */
-  for (int k = 0; k < nr_cells; k++)
-    if (cells[k].nodeID == nodeID) my_cells += 1;
-  if (e->verbose)
-    message("node %i now has %zi parts in %i cells.", nodeID, nr_parts,
-            my_cells);
-
-  /* Clean up other stuff. */
+  /* Clean up the temporary stuff. */
   free(reqs);
   free(counts);
+  free(g_counts);
   free(dest);
+  free(g_dest);
 
+  /* Be verbose about what just happened. */
+  if (e->verbose) {
+    int my_cells = 0;
+    for (int k = 0; k < nr_cells; k++)
+      if (cells[k].nodeID == nodeID) my_cells += 1;
+    message("node %i now has %zi parts and %zi gparts in %i cells.", nodeID,
+            nr_parts, nr_gparts, my_cells);
+  }
+
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
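
Note: the counts array above is a flattened nr_nodes x nr_nodes matrix indexed
as count[from * nr_nodes + to]. After the in-place MPI_Allreduce, every rank
holds the complete matrix and can read off both what it sends (its row) and
what it will receive (its column). A small sketch of that bookkeeping, MPI
aside:

/* Row = sending node, column = receiving node, flattened row-major. */
static int count_get(const int *counts, int nr_nodes, int from, int to) {
  return counts[from * nr_nodes + to];
}

/* Number of particles this node will receive: sum over its column. */
static size_t count_incoming(const int *counts, int nr_nodes, int nodeID) {
  size_t nr = 0;
  for (int k = 0; k < nr_nodes; k++)
    nr += count_get(counts, nr_nodes, k, nodeID);
  return nr;
}
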
@@ -513,7 +709,7 @@ void engine_exchange_cells(struct engine *e) {
 
   /* Wait for each count to come in and start the recv. */
   for (int k = 0; k < nr_proxies; k++) {
-    int pid;
+    int pid = MPI_UNDEFINED;
     if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
         pid == MPI_UNDEFINED)
       error("MPI_Waitany failed.");
@@ -533,7 +729,7 @@ void engine_exchange_cells(struct engine *e) {
 
   /* Wait for each pcell array to come in from the proxies. */
   for (int k = 0; k < nr_proxies; k++) {
-    int pid;
+    int pid = MPI_UNDEFINED;
     if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
         pid == MPI_UNDEFINED)
       error("MPI_Waitany failed.");
@@ -549,31 +745,40 @@ void engine_exchange_cells(struct engine *e) {
 
   /* Count the number of particles we need to import and re-allocate
      the buffer if needed. */
-  int count_in = 0;
+  int count_parts_in = 0, count_gparts_in = 0;
   for (int k = 0; k < nr_proxies; k++)
-    for (int j = 0; j < e->proxies[k].nr_cells_in; j++)
-      count_in += e->proxies[k].cells_in[j]->count;
-  if (count_in > s->size_parts_foreign) {
+    for (int j = 0; j < e->proxies[k].nr_cells_in; j++) {
+      count_parts_in += e->proxies[k].cells_in[j]->count;
+      count_gparts_in += e->proxies[k].cells_in[j]->gcount;
+    }
+  if (count_parts_in > s->size_parts_foreign) {
     if (s->parts_foreign != NULL) free(s->parts_foreign);
-    s->size_parts_foreign = 1.1 * count_in;
+    s->size_parts_foreign = 1.1 * count_parts_in;
     if (posix_memalign((void **)&s->parts_foreign, part_align,
                        sizeof(struct part) * s->size_parts_foreign) != 0)
       error("Failed to allocate foreign part data.");
   }
+  if (count_gparts_in > s->size_gparts_foreign) {
+    if (s->gparts_foreign != NULL) free(s->gparts_foreign);
+    s->size_gparts_foreign = 1.1 * count_gparts_in;
+    if (posix_memalign((void **)&s->gparts_foreign, gpart_align,
+                       sizeof(struct gpart) * s->size_gparts_foreign) != 0)
+      error("Failed to allocate foreign gpart data.");
+  }
 
   /* Unpack the cells and link to the particle data. */
   struct part *parts = s->parts_foreign;
+  struct gpart *gparts = s->gparts_foreign;
   for (int k = 0; k < nr_proxies; k++) {
     for (int j = 0; j < e->proxies[k].nr_cells_in; j++) {
-      cell_link(e->proxies[k].cells_in[j], parts);
+      cell_link_parts(e->proxies[k].cells_in[j], parts);
+      cell_link_gparts(e->proxies[k].cells_in[j], gparts);
       parts = &parts[e->proxies[k].cells_in[j]->count];
+      gparts = &gparts[e->proxies[k].cells_in[j]->gcount];
     }
   }
   s->nr_parts_foreign = parts - s->parts_foreign;
-
-  /* Is the parts buffer large enough? */
-  if (s->nr_parts_foreign > s->size_parts_foreign)
-    error("Foreign parts buffer too small.");
+  s->nr_gparts_foreign = gparts - s->gparts_foreign;
 
   /* Free the pcell buffer. */
   free(pcells);
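
Note: the foreign buffers above are resized with 10% headroom and simply freed
rather than copied, since their contents are re-linked from scratch on every
exchange. A sketch of that grow-on-demand pattern (plain malloc instead of
posix_memalign for brevity; ensure_capacity is an illustrative name):

#include <stdlib.h>

/* Grow a throw-away buffer to hold at least `needed` elements. */
static int ensure_capacity(void **buf, size_t *size, size_t needed,
                           size_t elem_size) {
  if (needed <= *size) return 0;  /* already large enough */
  free(*buf);                     /* old contents are rebuilt anyway */
  *size = (size_t)(1.1 * needed); /* 10% slack to damp future growth */
  *buf = malloc(elem_size * *size);
  return (*buf == NULL) ? -1 : 0;
}
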
@@ -591,16 +796,24 @@ void engine_exchange_cells(struct engine *e) {
  * @brief Exchange straying parts with other nodes.
  *
  * @param e The #engine.
- * @param offset The index in the parts array as of which the foreign parts
- *reside.
- * @param ind The ID of the foreign #cell.
- * @param N The number of stray parts.
+ * @param offset_parts The index in the parts array as of which the foreign
+ *        parts reside.
+ * @param ind_part The foreign #cell ID of each part.
+ * @param Npart The number of stray parts, contains the number of parts received
+ *        on return.
+ * @param offset_gparts The index in the gparts array as of which the foreign
+ *        parts reside.
+ * @param ind_gpart The foreign #cell ID of each gpart.
+ * @param Ngpart The number of stray gparts, contains the number of gparts
+ *        received on return.
  *
- * @return The number of arrived parts copied to parts and xparts.
+ * Note that this function does not mess up the linkage between parts and
+ * gparts, i.e. the received particles have correct linkage.
  */
 
-int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
-                           size_t N) {
+void engine_exchange_strays(struct engine *e, size_t offset_parts,
+                            int *ind_part, size_t *Npart, size_t offset_gparts,
+                            int *ind_gpart, size_t *Ngpart) {
 
 #ifdef WITH_MPI
 
@@ -610,25 +823,49 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
   /* Re-set the proxies. */
   for (int k = 0; k < e->nr_proxies; k++) e->proxies[k].nr_parts_out = 0;
 
-  /* Put the parts into the corresponding proxies. */
-  for (size_t k = 0; k < N; k++) {
-    const int node_id = e->s->cells[ind[k]].nodeID;
+  /* Put the parts and gparts into the corresponding proxies. */
+  for (size_t k = 0; k < *Npart; k++) {
+    /* Get the target node and proxy ID. */
+    const int node_id = e->s->cells[ind_part[k]].nodeID;
     if (node_id < 0 || node_id >= e->nr_nodes)
       error("Bad node ID %i.", node_id);
     const int pid = e->proxy_ind[node_id];
-    if (pid < 0)
+    if (pid < 0) {
       error(
           "Do not have a proxy for the requested nodeID %i for part with "
           "id=%llu, x=[%e,%e,%e].",
-          node_id, s->parts[offset + k].id, s->parts[offset + k].x[0],
-          s->parts[offset + k].x[1], s->parts[offset + k].x[2]);
-    proxy_parts_load(&e->proxies[pid], &s->parts[offset + k],
-                     &s->xparts[offset + k], 1);
+          node_id, s->parts[offset_parts + k].id,
+          s->parts[offset_parts + k].x[0], s->parts[offset_parts + k].x[1],
+          s->parts[offset_parts + k].x[2]);
+    }
+
+    /* Re-link the associated gpart with the buffer offset of the part. */
+    if (s->parts[offset_parts + k].gpart != NULL) {
+      s->parts[offset_parts + k].gpart->id_or_neg_offset =
+          -e->proxies[pid].nr_parts_in;
+    }
+
+    /* Load the part and xpart into the proxy. */
+    proxy_parts_load(&e->proxies[pid], &s->parts[offset_parts + k],
+                     &s->xparts[offset_parts + k], 1);
+  }
+  for (size_t k = 0; k < *Ngpart; k++) {
+    const int node_id = e->s->cells[ind_gpart[k]].nodeID;
+    if (node_id < 0 || node_id >= e->nr_nodes)
+      error("Bad node ID %i.", node_id);
+    const int pid = e->proxy_ind[node_id];
+    if (pid < 0)
+      error(
+          "Do not have a proxy for the requested nodeID %i for part with "
+          "id=%lli, x=[%e,%e,%e].",
+          node_id, s->gparts[offset_parts + k].id_or_neg_offset,
+          s->gparts[offset_gparts + k].x[0], s->gparts[offset_parts + k].x[1],
+          s->gparts[offset_gparts + k].x[2]);
+    proxy_gparts_load(&e->proxies[pid], &s->gparts[offset_gparts + k], 1);
   }
 
   /* Launch the proxies. */
-  MPI_Request reqs_in[2 * engine_maxproxies];
-  MPI_Request reqs_out[2 * engine_maxproxies];
+  MPI_Request reqs_in[3 * engine_maxproxies];
+  MPI_Request reqs_out[3 * engine_maxproxies];
   for (int k = 0; k < e->nr_proxies; k++) {
     proxy_parts_exch1(&e->proxies[k]);
     reqs_in[k] = e->proxies[k].req_parts_count_in;
@@ -637,7 +874,7 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
 
   /* Wait for each count to come in and start the recv. */
   for (int k = 0; k < e->nr_proxies; k++) {
-    int pid;
+    int pid = MPI_UNDEFINED;
     if (MPI_Waitany(e->nr_proxies, reqs_in, &pid, MPI_STATUS_IGNORE) !=
             MPI_SUCCESS ||
         pid == MPI_UNDEFINED)
@@ -652,11 +889,18 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
 
   /* Count the total number of incoming particles and make sure we have
      enough space to accommodate them. */
-  size_t count_in = 0;
-  for (int k = 0; k < e->nr_proxies; k++) count_in += e->proxies[k].nr_parts_in;
-  if (e->verbose) message("sent out %zi particles, got %zi back.", N, count_in);
-  if (offset + count_in > s->size_parts) {
-    s->size_parts = (offset + count_in) * 1.05;
+  int count_parts_in = 0;
+  int count_gparts_in = 0;
+  for (int k = 0; k < e->nr_proxies; k++) {
+    count_parts_in += e->proxies[k].nr_parts_in;
+    count_gparts_in += e->proxies[k].nr_gparts_in;
+  }
+  if (e->verbose) {
+    message("sent out %zi/%zi parts/gparts, got %i/%i back.", *Npart, *Ngpart,
+            count_parts_in, count_gparts_in);
+  }
+  if (offset_parts + count_parts_in > s->size_parts) {
+    s->size_parts = (offset_parts + count_parts_in) * engine_parts_size_grow;
     struct part *parts_new = NULL;
     struct xpart *xparts_new = NULL;
     if (posix_memalign((void **)&parts_new, part_align,
@@ -664,37 +908,61 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
         posix_memalign((void **)&xparts_new, part_align,
                        sizeof(struct xpart) * s->size_parts) != 0)
       error("Failed to allocate new part data.");
-    memcpy(parts_new, s->parts, sizeof(struct part) * offset);
-    memcpy(xparts_new, s->xparts, sizeof(struct xpart) * offset);
+    memcpy(parts_new, s->parts, sizeof(struct part) * offset_parts);
+    memcpy(xparts_new, s->xparts, sizeof(struct xpart) * offset_parts);
     free(s->parts);
     free(s->xparts);
     s->parts = parts_new;
     s->xparts = xparts_new;
   }
+  if (offset_gparts + count_gparts_in > s->size_gparts) {
+    s->size_gparts = (offset_gparts + count_gparts_in) * engine_parts_size_grow;
+    struct gpart *gparts_new = NULL;
+    if (posix_memalign((void **)&gparts_new, gpart_align,
+                       sizeof(struct gpart) * s->size_gparts) != 0)
+      error("Failed to allocate new gpart data.");
+    memcpy(gparts_new, s->gparts, sizeof(struct gpart) * offset_gparts);
+    free(s->gparts);
+    s->gparts = gparts_new;
+  }
 
   /* Collect the requests for the particle data from the proxies. */
   int nr_in = 0, nr_out = 0;
   for (int k = 0; k < e->nr_proxies; k++) {
     if (e->proxies[k].nr_parts_in > 0) {
-      reqs_in[2 * k] = e->proxies[k].req_parts_in;
-      reqs_in[2 * k + 1] = e->proxies[k].req_xparts_in;
+      reqs_in[3 * k] = e->proxies[k].req_parts_in;
+      reqs_in[3 * k + 1] = e->proxies[k].req_xparts_in;
+      nr_in += 2;
+    } else {
+      reqs_in[3 * k] = reqs_in[3 * k + 1] = MPI_REQUEST_NULL;
+    }
+    if (e->proxies[k].nr_gparts_in > 0) {
+      reqs_in[3 * k + 2] = e->proxies[k].req_gparts_in;
       nr_in += 1;
-    } else
-      reqs_in[2 * k] = reqs_in[2 * k + 1] = MPI_REQUEST_NULL;
+    } else {
+      reqs_in[3 * k + 2] = MPI_REQUEST_NULL;
+    }
     if (e->proxies[k].nr_parts_out > 0) {
-      reqs_out[2 * k] = e->proxies[k].req_parts_out;
-      reqs_out[2 * k + 1] = e->proxies[k].req_xparts_out;
+      reqs_out[3 * k] = e->proxies[k].req_parts_out;
+      reqs_out[3 * k + 1] = e->proxies[k].req_xparts_out;
+      nr_out += 2;
+    } else {
+      reqs_out[3 * k] = reqs_out[3 * k + 1] = MPI_REQUEST_NULL;
+    }
+    if (e->proxies[k].nr_gparts_out > 0) {
+      reqs_out[3 * k + 2] = e->proxies[k].req_gparts_out;
       nr_out += 1;
-    } else
-      reqs_out[2 * k] = reqs_out[2 * k + 1] = MPI_REQUEST_NULL;
+    } else {
+      reqs_out[3 * k + 2] = MPI_REQUEST_NULL;
+    }
   }
 
   /* Wait for each part array to come in and collect the new
      parts from the proxies. */
-  size_t count = 0;
-  for (int k = 0; k < 2 * (nr_in + nr_out); k++) {
+  int count_parts = 0, count_gparts = 0;
+  for (int k = 0; k < nr_in; k++) {
     int err, pid;
-    if ((err = MPI_Waitany(2 * e->nr_proxies, reqs_in, &pid,
+    if ((err = MPI_Waitany(3 * e->nr_proxies, reqs_in, &pid,
                            MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
       char buff[MPI_MAX_ERROR_STRING];
       int res;
@@ -702,26 +970,45 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
       error("MPI_Waitany failed (%s).", buff);
     }
     if (pid == MPI_UNDEFINED) break;
-    // message( "request from proxy %i has arrived." , pid );
-    if (reqs_in[pid & ~1] == MPI_REQUEST_NULL &&
-        reqs_in[pid | 1] == MPI_REQUEST_NULL) {
+    // message( "request from proxy %i has arrived." , pid / 3 );
+    pid = 3 * (pid / 3);
+
+    /* If all the requests for a given proxy have arrived... */
+    if (reqs_in[pid + 0] == MPI_REQUEST_NULL &&
+        reqs_in[pid + 1] == MPI_REQUEST_NULL &&
+        reqs_in[pid + 2] == MPI_REQUEST_NULL) {
+      /* Copy the particle data to the part/xpart/gpart arrays. */
-      struct proxy *p = &e->proxies[pid >> 1];
+      struct proxy *p = &e->proxies[pid / 3];
-      memcpy(&s->parts[offset + count], p->parts_in,
+      memcpy(&s->parts[offset_parts + count_parts], p->parts_in,
              sizeof(struct part) * p->nr_parts_in);
-      memcpy(&s->xparts[offset + count], p->xparts_in,
+      memcpy(&s->xparts[offset_parts + count_parts], p->xparts_in,
              sizeof(struct xpart) * p->nr_parts_in);
+      memcpy(&s->gparts[offset_gparts + count_gparts], p->gparts_in,
+             sizeof(struct gpart) * p->nr_gparts_in);
       /* for (int k = offset; k < offset + count; k++)
          message(
             "received particle %lli, x=[%.3e %.3e %.3e], h=%.3e, from node %i.",
             s->parts[k].id, s->parts[k].x[0], s->parts[k].x[1],
             s->parts[k].x[2], s->parts[k].h, p->nodeID); */
-      count += p->nr_parts_in;
+
+      /* Re-link the gparts. */
+      for (int k = 0; k < p->nr_gparts_in; k++) {
+        struct gpart *gp = &s->gparts[offset_gparts + count_gparts + k];
+        if (gp->id_or_neg_offset <= 0) {
+          const size_t partner_index =
+              offset_parts + count_parts - gp->id_or_neg_offset;
+          s->parts[partner_index].gpart = gp;
+          gp->id_or_neg_offset = -partner_index;
+        }
+      }
+
+      /* Advance the counters. */
+      count_parts += p->nr_parts_in;
+      count_gparts += p->nr_gparts_in;
     }
   }
 
   /* Wait for all the sends to have finished too. */
   if (nr_out > 0)
-    if (MPI_Waitall(2 * e->nr_proxies, reqs_out, MPI_STATUSES_IGNORE) !=
+    if (MPI_Waitall(3 * e->nr_proxies, reqs_out, MPI_STATUSES_IGNORE) !=
         MPI_SUCCESS)
       error("MPI_Waitall on sends failed.");
 
@@ -730,11 +1017,11 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
             clocks_getunit());
 
   /* Return the number of harvested parts. */
-  return count;
+  *Npart = count_parts;
+  *Ngpart = count_gparts;
 
 #else
   error("SWIFT was not compiled with MPI support.");
-  return 0;
 #endif
 }
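
Note: since engine_exchange_strays() now reports the arrival counts through
its Npart/Ngpart pointers instead of a return value, a caller passes the
stray counts in and reads the received counts back from the same variables. A
hypothetical call site (variable names illustrative):

size_t nr_stray_parts = /* number of parts that left their cells */ 0;
size_t nr_stray_gparts = /* same for gparts */ 0;
engine_exchange_strays(e, offset_parts, ind_part, &nr_stray_parts,
                       offset_gparts, ind_gpart, &nr_stray_gparts);
/* On return, the strays have been replaced by the arrivals. */
s->nr_parts = offset_parts + nr_stray_parts;
s->nr_gparts = offset_gparts + nr_stray_gparts;
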
 
@@ -743,7 +1030,7 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
  *neighbours
  *
  * Here we construct all the tasks for all possible neighbouring non-empty
- * local cells in the hierarchy. No dependencies are being added thus far. 
+ * local cells in the hierarchy. No dependencies are being added thus far.
  * Additional loop over neighbours can later be added by simply duplicating
  * all the tasks created by this function.
  *
@@ -761,12 +1048,14 @@ void engine_make_hydroloop_tasks(struct engine *e) {
   for (int i = 0; i < cdim[0]; i++) {
     for (int j = 0; j < cdim[1]; j++) {
       for (int k = 0; k < cdim[2]; k++) {
-        int cid = cell_getid(cdim, i, j, k);
 
-        /* Skip cells without hydro particles */
-        if (cells[cid].count == 0) continue;
+        /* Get the cell */
+        const int cid = cell_getid(cdim, i, j, k);
         struct cell *ci = &cells[cid];
 
+        /* Skip cells without hydro particles */
+        if (ci->count == 0) continue;
+
         /* If the cells is local build a self-interaction */
         if (ci->nodeID == nodeID)
           scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0,
@@ -785,14 +1074,19 @@ void engine_make_hydroloop_tasks(struct engine *e) {
               int kkk = k + kk;
               if (!s->periodic && (kkk < 0 || kkk >= cdim[2])) continue;
               kkk = (kkk + cdim[2]) % cdim[2];
-              int cjd = cell_getid(cdim, iii, jjj, kkk);
+
+              /* Get the neighbouring cell */
+              const int cjd = cell_getid(cdim, iii, jjj, kkk);
               struct cell *cj = &cells[cjd];
 
               /* Is that neighbour local and does it have particles ? */
               if (cid >= cjd || cj->count == 0 ||
                   (ci->nodeID != nodeID && cj->nodeID != nodeID))
                 continue;
-              int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))];
+
+              /* Construct the pair task */
+              const int sid =
+                  sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))];
               scheduler_addtask(sched, task_type_pair, task_subtype_density,
                                 sid, 0, ci, cj, 1);
             }
@@ -874,10 +1168,16 @@ void engine_count_and_link_tasks(struct engine *e) {
 }
 
 /**
- * @brief Duplicates the first hydro loop and creates the corresponding
- *dependencies using the ghost tasks.
+ * @brief Duplicates the first hydro loop and construct all the
+ * dependencies for the hydro part
+ *
+ * This is done by looping over all the previously constructed tasks
+ * and adding another task involving the same cells but this time
+ * corresponding to the second hydro loop over neighbours.
+ * With all the relevant tasks for a given cell available, we construct
+ * all the dependencies for that cell.
  *
- * @parma e The #engine.
+ * @param e The #engine.
  */
 void engine_make_extra_hydroloop_tasks(struct engine *e) {
 
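
The dependency chains built in this function can be summarised per
super-cell as (schematic only):

    init --> density loop (t) --> ghost --> force loop (t2) --> kick

i.e. the ghost task closes the density sums before any force task may read
them, and the kick waits for all the forces. For the pair and sub tasks
below, the links are only added for cells that are local to this node.
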
@@ -895,20 +1195,39 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
 
     /* Self-interaction? */
     if (t->type == task_type_self && t->subtype == task_subtype_density) {
-      scheduler_addunlock(sched, t->ci->super->init, t);
-      scheduler_addunlock(sched, t, t->ci->super->ghost);
+
+      /* Start by constructing the task for the second hydro loop */
       struct task *t2 = scheduler_addtask(
           sched, task_type_self, task_subtype_force, 0, 0, t->ci, NULL, 0);
-      scheduler_addunlock(sched, t->ci->super->ghost, t2);
-      scheduler_addunlock(sched, t2, t->ci->super->kick);
+
+      /* Add the link between the new loop and the cell */
       t->ci->force = engine_addlink(e, t->ci->force, t2);
       atomic_inc(&t->ci->nr_force);
+
+      /* Now, build all the dependencies for the hydro */
+      /* init --> t (density loop) --> ghost --> t2 (force loop) --> kick */
+      scheduler_addunlock(sched, t->ci->super->init, t);
+      scheduler_addunlock(sched, t, t->ci->super->ghost);
+      scheduler_addunlock(sched, t->ci->super->ghost, t2);
+      scheduler_addunlock(sched, t2, t->ci->super->kick);
     }
 
     /* Otherwise, pair interaction? */
     else if (t->type == task_type_pair && t->subtype == task_subtype_density) {
+
+      /* Start by constructing the task for the second hydro loop */
       struct task *t2 = scheduler_addtask(
           sched, task_type_pair, task_subtype_force, 0, 0, t->ci, t->cj, 0);
+
+      /* Add the link between the new loop and both cells */
+      t->ci->force = engine_addlink(e, t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
+      t->cj->force = engine_addlink(e, t->cj->force, t2);
+      atomic_inc(&t->cj->nr_force);
+
+      /* Now, build all the dependencies for the hydro for the cells */
+      /* that are local and are not descendant of the same super-cells */
+      /* init --> t (density loop) --> ghost --> t2 (force loop) --> kick */
       if (t->ci->nodeID == nodeID) {
         scheduler_addunlock(sched, t->ci->super->init, t);
         scheduler_addunlock(sched, t, t->ci->super->ghost);
@@ -921,17 +1240,27 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
         scheduler_addunlock(sched, t->cj->super->ghost, t2);
         scheduler_addunlock(sched, t2, t->cj->super->kick);
       }
-      t->ci->force = engine_addlink(e, t->ci->force, t2);
-      atomic_inc(&t->ci->nr_force);
-      t->cj->force = engine_addlink(e, t->cj->force, t2);
-      atomic_inc(&t->cj->nr_force);
     }
 
     /* Otherwise, sub interaction? */
     else if (t->type == task_type_sub && t->subtype == task_subtype_density) {
+
+      /* Start by constructing the task for the second hydro loop */
       struct task *t2 =
           scheduler_addtask(sched, task_type_sub, task_subtype_force, t->flags,
                             0, t->ci, t->cj, 0);
+
+      /* Add the link between the new loop and both cells */
+      t->ci->force = engine_addlink(e, t->ci->force, t2);
+      atomic_inc(&t->ci->nr_force);
+      if (t->cj != NULL) {
+        t->cj->force = engine_addlink(e, t->cj->force, t2);
+        atomic_inc(&t->cj->nr_force);
+      }
+
+      /* Now, build all the dependencies for the hydro for the cells */
+      /* that are local and are not descendant of the same super-cells */
+      /* init --> t (density loop) --> ghost --> t2 (force loop) --> kick */
       if (t->ci->nodeID == nodeID) {
         scheduler_addunlock(sched, t, t->ci->super->ghost);
         scheduler_addunlock(sched, t->ci->super->ghost, t2);
@@ -943,12 +1272,6 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
         scheduler_addunlock(sched, t->cj->super->ghost, t2);
         scheduler_addunlock(sched, t2, t->cj->super->kick);
       }
-      t->ci->force = engine_addlink(e, t->ci->force, t2);
-      atomic_inc(&t->ci->nr_force);
-      if (t->cj != NULL) {
-        t->cj->force = engine_addlink(e, t->cj->force, t2);
-        atomic_inc(&t->cj->nr_force);
-      }
     }
 
     /* /\* Kick tasks should rely on the grav_down tasks of their cell. *\/ */
@@ -960,6 +1283,8 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) {
 /**
  * @brief Constructs the top-level pair tasks for the gravity M-M interactions
  *
+ * A correct implementation is still missing here.
+ *
  * @param e The #engine.
  */
 void engine_make_gravityinteraction_tasks(struct engine *e) {
@@ -994,6 +1319,8 @@ void engine_make_gravityinteraction_tasks(struct engine *e) {
  * @brief Constructs the gravity tasks building the multipoles and propagating
  * them to the children
  *
+ * A correct implementation is still missing here.
+ *
  * @param e The #engine.
  */
 void engine_make_gravityrecursive_tasks(struct engine *e) {
@@ -1028,7 +1355,6 @@ void engine_make_gravityrecursive_tasks(struct engine *e) {
  *
  * @param e The #engine we are working with.
  */
-
 void engine_maketasks(struct engine *e) {
 
   struct space *s = e->s;
@@ -1130,9 +1456,10 @@ void engine_maketasks(struct engine *e) {
 int engine_marktasks(struct engine *e) {
 
   struct scheduler *s = &e->sched;
-  const int nr_tasks = s->nr_tasks, *ind = s->tasks_ind;
+  const int ti_end = e->ti_current;
+  const int nr_tasks = s->nr_tasks;
+  const int *const ind = s->tasks_ind;
   struct task *tasks = s->tasks;
-  const float ti_end = e->ti_current;
   const ticks tic = getticks();
 
   /* Much less to do here if we're on a fixed time-step. */
@@ -1232,6 +1559,7 @@ int engine_marktasks(struct engine *e) {
       else if (t->type == task_type_kick) {
         t->skip = (t->ci->ti_end_min > ti_end);
         t->ci->updated = 0;
+        t->ci->g_updated = 0;
       }
 
       /* Drift? */
@@ -1288,6 +1616,7 @@ void engine_print_task_counts(struct engine *e) {
   printf(" skipped=%i ]\n", counts[task_type_count]);
   fflush(stdout);
   message("nr_parts = %zi.", e->s->nr_parts);
+  message("nr_gparts = %zi.", e->s->nr_gparts);
 }
 
 /**
@@ -1298,7 +1627,7 @@ void engine_print_task_counts(struct engine *e) {
 
 void engine_rebuild(struct engine *e) {
 
-  ticks tic = getticks();
+  const ticks tic = getticks();
 
   /* Clear the forcerebuild flag, whatever it was. */
   e->forcerebuild = 0;
@@ -1341,7 +1670,7 @@ void engine_prepare(struct engine *e) {
 
 /* Collect the values of rebuild from all nodes. */
 #ifdef WITH_MPI
-  int buff;
+  int buff = 0;
   if (MPI_Allreduce(&rebuild, &buff, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD) !=
       MPI_SUCCESS)
     error("Failed to aggregate the rebuild flag across nodes.");
@@ -1417,7 +1746,7 @@ void engine_collect_kick(struct cell *c) {
   if (c->kick != NULL) return;
 
   /* Counters for the different quantities. */
-  int updated = 0;
+  int updated = 0, g_updated = 0;
   double e_kin = 0.0, e_int = 0.0, e_pot = 0.0;
   float mom[3] = {0.0f, 0.0f, 0.0f}, ang[3] = {0.0f, 0.0f, 0.0f};
   int ti_end_min = max_nr_timesteps, ti_end_max = 0;
@@ -1440,6 +1769,7 @@ void engine_collect_kick(struct cell *c) {
         ti_end_min = min(ti_end_min, cp->ti_end_min);
         ti_end_max = max(ti_end_max, cp->ti_end_max);
         updated += cp->updated;
+        g_updated += cp->g_updated;
         e_kin += cp->e_kin;
         e_int += cp->e_int;
         e_pot += cp->e_pot;
@@ -1457,6 +1787,7 @@ void engine_collect_kick(struct cell *c) {
   c->ti_end_min = ti_end_min;
   c->ti_end_max = ti_end_max;
   c->updated = updated;
+  c->g_updated = g_updated;
   c->e_kin = e_kin;
   c->e_int = e_int;
   c->e_pot = e_pot;
@@ -1520,7 +1851,15 @@ void engine_init_particles(struct engine *e) {
 
   /* Make sure all particles are ready to go */
   /* i.e. clean-up any stupid state in the ICs */
-  space_map_cells_pre(s, 1, cell_init_parts, NULL);
+  if ((e->policy & engine_policy_hydro) == engine_policy_hydro) {
+    space_map_cells_pre(s, 1, cell_init_parts, NULL);
+  }
+  if (((e->policy & engine_policy_self_gravity) ==
+       engine_policy_self_gravity) ||
+      ((e->policy & engine_policy_external_gravity) ==
+       engine_policy_external_gravity)) {
+    space_map_cells_pre(s, 1, cell_init_gparts, NULL);
+  }
 
   engine_prepare(e);
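
The policy tests above rely on the engine policies being single-bit flags,
so ((e->policy & flag) == flag) is equivalent to ((e->policy & flag) != 0)
whenever flag has exactly one bit set. A minimal sketch with hypothetical
values (the real enum lives elsewhere in the tree):

    /* Hypothetical flag values, for illustration only. */
    enum example_policy {
      example_policy_hydro = 1 << 0,
      example_policy_self_gravity = 1 << 1,
      example_policy_external_gravity = 1 << 2,
    };
    /* Then: if (e->policy & example_policy_hydro) { ... } */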
 
@@ -1594,7 +1933,7 @@ void engine_init_particles(struct engine *e) {
  */
 void engine_step(struct engine *e) {
 
-  int updates = 0;
+  int updates = 0, g_updates = 0;
   int ti_end_min = max_nr_timesteps, ti_end_max = 0;
   double e_pot = 0.0, e_int = 0.0, e_kin = 0.0;
   float mom[3] = {0.0, 0.0, 0.0};
@@ -1621,6 +1960,7 @@ void engine_step(struct engine *e) {
       e_int += c->e_int;
       e_pot += c->e_pot;
       updates += c->updated;
+      g_updates += c->g_updated;
       mom[0] += c->mom[0];
       mom[1] += c->mom[1];
       mom[2] += c->mom[2];
@@ -1632,7 +1972,8 @@ void engine_step(struct engine *e) {
 /* Aggregate the data from the different nodes. */
 #ifdef WITH_MPI
   {
-    int in_i[4], out_i[4];
+    int in_i[1], out_i[1];
+    in_i[0] = 0;
     out_i[0] = ti_end_min;
     if (MPI_Allreduce(out_i, in_i, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD) !=
         MPI_SUCCESS)
@@ -1645,18 +1986,20 @@ void engine_step(struct engine *e) {
     ti_end_max = in_i[0];
   }
   {
-    double in_d[4], out_d[4];
+    double in_d[5], out_d[5];
     out_d[0] = updates;
-    out_d[1] = e_kin;
-    out_d[2] = e_int;
-    out_d[3] = e_pot;
-    if (MPI_Allreduce(out_d, in_d, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) !=
+    out_d[1] = g_updates;
+    out_d[2] = e_kin;
+    out_d[3] = e_int;
+    out_d[4] = e_pot;
+    if (MPI_Allreduce(out_d, in_d, 5, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) !=
         MPI_SUCCESS)
       error("Failed to aggregate energies.");
     updates = in_d[0];
-    e_kin = in_d[1];
-    e_int = in_d[2];
-    e_pot = in_d[3];
+    g_updates = in_d[1];
+    e_kin = in_d[2];
+    e_int = in_d[3];
+    e_pot = in_d[4];
   }
 #endif
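
Carrying the integer counters in the same double buffer as the energies
lets a single MPI_Allreduce cover all five quantities; the int-to-double
round trip is exact as long as the global counts stay below 2^53. A sketch
of the same pattern for one counter:

    double out = (double)updates, in = 0.;
    MPI_Allreduce(&out, &in, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    updates = (int)in; /* exact for counts below 2^53 */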
 
@@ -1681,8 +2024,8 @@ void engine_step(struct engine *e) {
   if (e->nodeID == 0) {
 
     /* Print some information to the screen */
-    printf("%d %e %e %d %.3f\n", e->step, e->time, e->timeStep, updates,
-           e->wallclock_time);
+    printf("%d %e %e %d %d %.3f\n", e->step, e->time, e->timeStep, updates,
+           g_updates, e->wallclock_time);
     fflush(stdout);
 
     /* Write some energy statistics */
@@ -1885,7 +2228,7 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   engine_makeproxies(e);
 
   /* Re-allocate the local parts. */
-  if (e->nodeID == 0)
+  if (e->verbose)
     message("Re-allocating parts array from %zi to %zi.", s->size_parts,
             (size_t)(s->nr_parts * 1.2));
   s->size_parts = s->nr_parts * 1.2;
@@ -1893,7 +2236,7 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   struct xpart *xparts_new = NULL;
   if (posix_memalign((void **)&parts_new, part_align,
                      sizeof(struct part) * s->size_parts) != 0 ||
-      posix_memalign((void **)&xparts_new, part_align,
+      posix_memalign((void **)&xparts_new, xpart_align,
                      sizeof(struct xpart) * s->size_parts) != 0)
     error("Failed to allocate new part data.");
   memcpy(parts_new, s->parts, sizeof(struct part) * s->nr_parts);
@@ -1902,6 +2245,47 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   free(s->xparts);
   s->parts = parts_new;
   s->xparts = xparts_new;
+
+  /* Re-link the gparts. */
+  part_relink_gparts(s->parts, s->nr_parts, 0);
+
+  /* Re-allocate the local gparts. */
+  if (e->verbose)
+    message("Re-allocating gparts array from %zi to %zi.", s->size_gparts,
+            (size_t)(s->nr_gparts * 1.2));
+  s->size_gparts = s->nr_gparts * 1.2;
+  struct gpart *gparts_new = NULL;
+  if (posix_memalign((void **)&gparts_new, gpart_align,
+                     sizeof(struct gpart) * s->size_gparts) != 0)
+    error("Failed to allocate new gpart data.");
+  memcpy(gparts_new, s->gparts, sizeof(struct gpart) * s->nr_gparts);
+  free(s->gparts);
+  s->gparts = gparts_new;
+
+  /* Re-link the parts. */
+  part_relink_parts(s->gparts, s->nr_gparts, s->parts);
+
+  /* Verify that the links are correct */
+  /* MATTHIEU: To be commented out once we are happy */
+  for (size_t k = 0; k < s->nr_gparts; ++k) {
+
+    if (s->gparts[k].id_or_neg_offset <= 0) {
+
+      struct part *part = &s->parts[-s->gparts[k].id_or_neg_offset];
+
+      if (part->gpart != &s->gparts[k]) error("Linking problem !");
+
+      if (s->gparts[k].x[0] != part->x[0] ||
+          s->gparts[k].x[1] != part->x[1] ||
+          s->gparts[k].x[2] != part->x[2])
+        error("Linked particles are not at the same position !");
+    }
+  }
+  for (size_t k = 0; k < s->nr_parts; ++k) {
+
+    if (s->parts[k].gpart != NULL &&
+        s->parts[k].gpart->id_or_neg_offset != -(long long)k)
+      error("Linking problem !");
+  }
+
 #else
   error("SWIFT was not compiled with MPI support.");
 #endif
@@ -1974,6 +2358,7 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
   e->dt_max = dt_max;
   e->file_stats = NULL;
   e->verbose = verbose;
+  e->count_step = 0;
   e->wallclock_time = 0.f;
   engine_rank = nodeID;
 
@@ -2084,10 +2469,12 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
         "(t_beg = %e)",
         timeEnd, timeBegin);
 
-  /* Check we have sensible time step bounds */
+  /* Check we have sensible time-step values */
   if (e->dt_min > e->dt_max)
     error(
-        "Minimal time step size must be smaller than maximal time step size ");
+        "Minimal time-step size (%e) must be smaller than maximal time-step "
+        "size (%e)",
+        e->dt_min, e->dt_max);
 
   /* Deal with timestep */
   e->timeBase = (timeEnd - timeBegin) / max_nr_timesteps;
@@ -2133,8 +2520,7 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
 
 /* Construct types for MPI communications */
 #ifdef WITH_MPI
-  part_create_mpi_type(&e->part_mpi_type);
-  xpart_create_mpi_type(&e->xpart_mpi_type);
+  part_create_mpi_types();
 #endif
 
   /* First of all, init the barrier and lock it. */
diff --git a/src/engine.h b/src/engine.h
index 741ae1f553494e435394f529606b4cb794b0e3d2..4d1860b9eed0203bf9bf75711ec6e6549d837fe7 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -62,6 +62,8 @@ extern const char *engine_policy_names[];
 #define engine_maxtaskspercell 96
 #define engine_maxproxies 64
 #define engine_tasksreweight 10
+#define engine_parts_size_grow 1.05
+#define engine_redistribute_alloc_margin 1.2
 
 /* The rank of the engine as a global variable (for messages). */
 extern int engine_rank;
@@ -160,12 +162,6 @@ struct engine {
 
   /* Are we talkative ? */
   int verbose;
-
-#ifdef WITH_MPI
-  /* MPI data type for the particle transfers */
-  MPI_Datatype part_mpi_type;
-  MPI_Datatype xpart_mpi_type;
-#endif
 };
 
 /* Function prototypes. */
@@ -182,7 +178,9 @@ void engine_init_particles(struct engine *e);
 void engine_step(struct engine *e);
 void engine_maketasks(struct engine *e);
 void engine_split(struct engine *e, struct partition *initial_partition);
-int engine_exchange_strays(struct engine *e, int offset, size_t *ind, size_t N);
+void engine_exchange_strays(struct engine *e, size_t offset_parts,
+                            int *ind_part, size_t *Npart, size_t offset_gparts,
+                            int *ind_gpart, size_t *Ngpart);
 void engine_rebuild(struct engine *e);
 void engine_repartition(struct engine *e);
 void engine_makeproxies(struct engine *e);
diff --git a/src/gravity/Default/gravity.h b/src/gravity/Default/gravity.h
index 82bc52ad3e05794c8c05896075edc463a69197ff..92a9f64c1f84a9e949f4c0e9485f892b5c808cdc 100644
--- a/src/gravity/Default/gravity.h
+++ b/src/gravity/Default/gravity.h
@@ -22,14 +22,61 @@
 /**
  * @brief Computes the gravity time-step of a given particle
  *
- * @param p Pointer to the particle data
- * @param xp Pointer to the extended particle data
+ * @param gp Pointer to the g-particle data
  *
  */
 
-__attribute__((always_inline)) INLINE static float gravity_compute_timestep(
-    struct part* p, struct xpart* xp) {
+__attribute__((always_inline))
+    INLINE static float gravity_compute_timestep(struct gpart* gp) {
 
   /* Currently no limit is imposed */
   return FLT_MAX;
 }
+
+/**
+ * @brief Initialises the g-particles for the first time
+ *
+ * This function is called only once just after the ICs have been
+ * read in to do some conversions.
+ *
+ * @param gp The particle to act upon
+ */
+__attribute__((always_inline))
+    INLINE static void gravity_first_init_gpart(struct gpart* gp) {}
+
+/**
+ * @brief Prepares a g-particle for the gravity calculation
+ *
+ * Zeroes all the relevant arrays in preparation for the sums taking place in
+ * the various tasks.
+ *
+ * @param gp The particle to act upon
+ */
+__attribute__((always_inline))
+    INLINE static void gravity_init_part(struct gpart* gp) {
+
+  /* Zero the acceleration */
+  gp->a_grav[0] = 0.f;
+  gp->a_grav[1] = 0.f;
+  gp->a_grav[2] = 0.f;
+}
+
+/**
+ * @brief Finishes the gravity calculation.
+ *
+ * Multiplies the forces and accelerations by the appropriate constants
+ *
+ * @param gp The particle to act upon
+ */
+__attribute__((always_inline))
+    INLINE static void gravity_end_force(struct gpart* gp) {}
+
+/**
+ * @brief Kick the additional variables
+ *
+ * @param gp The particle to act upon
+ * @param dt The time-step for this kick
+ * @param half_dt The half time-step for this kick
+ */
+__attribute__((always_inline)) INLINE static void gravity_kick_extra(
+    struct gpart* gp, float dt, float half_dt) {}
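
Taken together, these hooks mirror the hydro particle life-cycle. A hedged
sketch of where each one is expected to be called (the placement is
inferred from the doc-comments above, not from this patch):

    gravity_first_init_gpart(gp);        /* once, just after reading the ICs */
    gravity_init_part(gp);               /* each step, before the gravity sums */
    /* ... gravity interactions accumulate into gp->a_grav[] ... */
    gravity_end_force(gp);               /* after the force loop (no-op here) */
    gravity_kick_extra(gp, dt, half_dt); /* during the kick */
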
diff --git a/src/gravity/Default/gravity_debug.h b/src/gravity/Default/gravity_debug.h
index 62f3cfd43edde2564e231ec272965bfda8ab59da..531afffa5c2958eea49fe49171cde81fa8350fcf 100644
--- a/src/gravity/Default/gravity_debug.h
+++ b/src/gravity/Default/gravity_debug.h
@@ -24,5 +24,5 @@ __attribute__((always_inline))
       "v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e],\n "
       "mass=%.3e t_begin=%d, t_end=%d\n",
       p->x[0], p->x[1], p->x[2], p->v_full[0], p->v_full[1], p->v_full[2],
-      p->a[0], p->a[1], p->a[2], p->mass, p->ti_begin, p->ti_end);
+      p->a_grav[0], p->a_grav[1], p->a_grav[2], p->mass, p->ti_begin,
+      p->ti_end);
 }
diff --git a/src/gravity/Default/gravity_iact.h b/src/gravity/Default/gravity_iact.h
index e62be446e8263bf02e3fd73f902b28cb1c3b16cf..d0391aa7819475b46a44ab816c5e15c7bf74a440 100644
--- a/src/gravity/Default/gravity_iact.h
+++ b/src/gravity/Default/gravity_iact.h
@@ -25,16 +25,9 @@
 #include "kernel.h"
 #include "vector.h"
 
-/**
- * @file  runner_iact_grav.h
- * @brief Gravity interaction functions.
- *
- */
-
 /**
  * @brief Gravity potential
  */
-
 __attribute__((always_inline)) INLINE static void runner_iact_grav(
     float r2, float *dx, struct gpart *pi, struct gpart *pj) {
 
@@ -56,8 +49,8 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav(
   /* Aggregate the accelerations. */
   for (k = 0; k < 3; k++) {
     w = acc * dx[k];
-    pi->a[k] -= w * mj;
-    pj->a[k] += w * mi;
+    pi->a_grav[k] -= w * mj;
+    pj->a_grav[k] += w * mi;
   }
 }
 
@@ -107,8 +100,8 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_grav(
     ai.v = w.v * mj.v;
     aj.v = w.v * mi.v;
     for (j = 0; j < VEC_SIZE; j++) {
-      pi[j]->a[k] -= ai.f[j];
-      pj[j]->a[k] += aj.f[j];
+      pi[j]->a_grav[k] -= ai.f[j];
+      pj[j]->a_grav[k] += aj.f[j];
     }
   }
 
diff --git a/src/gravity/Default/gravity_io.h b/src/gravity/Default/gravity_io.h
index bcda40c21935cc68a45af69688b7162aebd8ccc9..74f364dd97361f0513755bedec83fe7cb277c36b 100644
--- a/src/gravity/Default/gravity_io.h
+++ b/src/gravity/Default/gravity_io.h
@@ -48,6 +48,8 @@ __attribute__((always_inline)) INLINE static void darkmatter_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unused in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ * the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param Ndm The number of DM particles on that MPI rank.
  * @param Ndm_total The total number of g-particles (only used in MPI mode)
@@ -59,18 +61,20 @@ __attribute__((always_inline)) INLINE static void darkmatter_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void darkmatter_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int Ndm, long long Ndm_total,
-    int mpi_rank, long long offset, struct gpart* gparts,
-    struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile,
+    int Ndm, long long Ndm_total, int mpi_rank, long long offset,
+    struct gpart* gparts, struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, Ndm, 3, gparts,
-             Ndm_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, Ndm, 1, gparts,
-             Ndm_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, Ndm, 3, gparts,
-             Ndm_total, mpi_rank, offset, v_full, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, Ndm, 1, gparts,
-             Ndm_total, mpi_rank, offset, id_or_neg_offset, us,
-             UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             Ndm, 3, gparts, Ndm_total, mpi_rank, offset, x, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, Ndm,
+             1, gparts, Ndm_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             Ndm, 3, gparts, Ndm_total, mpi_rank, offset, v_full, us,
+             UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, Ndm, 1, gparts, Ndm_total, mpi_rank, offset,
+             id_or_neg_offset, us, UNIT_CONV_NO_UNITS);
 }
diff --git a/src/gravity/Default/gravity_part.h b/src/gravity/Default/gravity_part.h
index 634ee4ae8453292e272eb1b62720e8c74fca4497..d36ceea650a54e1fdd0ff1fcf162a830dc5ed7cb 100644
--- a/src/gravity/Default/gravity_part.h
+++ b/src/gravity/Default/gravity_part.h
@@ -29,7 +29,7 @@ struct gpart {
   float v_full[3];
 
   /* Particle acceleration. */
-  float a[3];
+  float a_grav[3];
 
   /* Particle mass. */
   float mass;
@@ -44,4 +44,4 @@ struct gpart {
      which this gpart is linked. */
   long long id_or_neg_offset;
 
-} __attribute__((aligned(part_align)));
+} __attribute__((aligned(gpart_align)));
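
A short sketch of how id_or_neg_offset is meant to be decoded, consistent
with the checks added to engine_split above (variable names illustrative):

    if (gp->id_or_neg_offset > 0) {
      /* Stand-alone g-particle (e.g. dark matter): the value is its ID. */
      long long id = gp->id_or_neg_offset;
    } else {
      /* Linked g-particle: minus the offset of its hydro partner. */
      struct part *p = &parts[-gp->id_or_neg_offset];
    }
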
diff --git a/src/hydro/Default/hydro_io.h b/src/hydro/Default/hydro_io.h
index 958bf5a1869718b57678246ff3b1985e54145824..0e9ad46ddc1d4e8c8d3ffdbf3e81262ec49a7092 100644
--- a/src/hydro/Default/hydro_io.h
+++ b/src/hydro/Default/hydro_io.h
@@ -56,6 +56,8 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unused in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ * the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param N The number of particles on that MPI rank.
  * @param N_total The total number of particles (only used in MPI mode)
@@ -67,26 +69,31 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void hydro_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int N, long long N_total,
-    int mpi_rank, long long offset, struct part* parts, struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile, int N,
+    long long N_total, int mpi_rank, long long offset, struct part* parts,
+    struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts,
-             N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts,
-             N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
-  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, a_hydro, us, UNIT_CONV_ACCELERATION);
-  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             N, 3, parts, N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, N, 1,
+             parts, N_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, u, us,
+             UNIT_CONV_ENERGY_PER_UNIT_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us,
+             UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Acceleration", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, a_hydro, us,
+             UNIT_CONV_ACCELERATION);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Density", FLOAT, N,
+             1, parts, N_total, mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
 }
 
 /**
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index d31b6be383b80a2698b63d27308f6fee9b23518f..09f796a8f37a9c015135f4aab3f821c2e862bdc9 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -93,8 +93,8 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
   dv[2] = pi->v[2] - pj->v[2];
   const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
 
-  pi->div_v += faci * dvdr;
-  pj->div_v += facj * dvdr;
+  pi->div_v -= faci * dvdr;
+  pj->div_v -= facj * dvdr;
 
   /* Compute dv cross r */
   curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
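
The sign flip above matches the standard SPH divergence estimator: with
dx = pi->x - pj->x, the estimate is

    <div v>_i ~ -(1/rho_i) * sum_j m_j (v_i - v_j) . grad_i W_ij

so each pairwise contribution faci * dvdr must enter with a minus sign
(the 1/rho normalisation is presumably applied when the density loop is
finalised).
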
@@ -211,10 +211,10 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
   /* Balsara term */
   const float balsara_i =
       fabsf(pi->div_v) /
-      (fabsf(pi->div_v) + pi->force.curl_v + 0.0001 * ci / fac_mu / hi);
+      (fabsf(pi->div_v) + pi->force.curl_v + 0.0001f * ci / fac_mu / hi);
   const float balsara_j =
       fabsf(pj->div_v) /
-      (fabsf(pj->div_v) + pj->force.curl_v + 0.0001 * cj / fac_mu / hj);
+      (fabsf(pj->div_v) + pj->force.curl_v + 0.0001f * cj / fac_mu / hj);
 
   /* Are the particles moving towards each others ? */
   const float omega_ij = fminf(dvdr, 0.f);
@@ -309,10 +309,10 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
   /* Balsara term */
   const float balsara_i =
       fabsf(pi->div_v) /
-      (fabsf(pi->div_v) + pi->force.curl_v + 0.0001 * ci / fac_mu / hi);
+      (fabsf(pi->div_v) + pi->force.curl_v + 0.0001f * ci / fac_mu / hi);
   const float balsara_j =
       fabsf(pj->div_v) /
-      (fabsf(pj->div_v) + pj->force.curl_v + 0.0001 * cj / fac_mu / hj);
+      (fabsf(pj->div_v) + pj->force.curl_v + 0.0001f * cj / fac_mu / hj);
 
   /* Are the particles moving towards each others ? */
   const float omega_ij = fminf(dvdr, 0.f);
diff --git a/src/hydro/Gadget2/hydro_io.h b/src/hydro/Gadget2/hydro_io.h
index 17c3d3013644c3572f3c26fc3e270b1c1bc465ed..c1c59dfa4980a2843e7e13bee4c964c9b254cae6 100644
--- a/src/hydro/Gadget2/hydro_io.h
+++ b/src/hydro/Gadget2/hydro_io.h
@@ -56,6 +56,8 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unused in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ * the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param N The number of particles on that MPI rank.
  * @param N_total The total number of particles (only used in MPI mode)
@@ -67,27 +69,31 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void hydro_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int N, long long N_total,
-    int mpi_rank, long long offset, struct part* parts, struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile, int N,
+    long long N_total, int mpi_rank, long long offset, struct part* parts,
+    struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts,
-             N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, entropy, us,
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             N, 3, parts, N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, N, 1,
+             parts, N_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, entropy, us,
              UNIT_CONV_ENTROPY_PER_UNIT_MASS);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts,
-             N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
-  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, a_hydro, us, UNIT_CONV_ACCELERATION);
-  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us,
+             UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Acceleration", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, a_hydro, us,
+             UNIT_CONV_ACCELERATION);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Density", FLOAT, N,
+             1, parts, N_total, mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
 }
 
 /**
diff --git a/src/hydro/Minimal/hydro_iact.h b/src/hydro/Minimal/hydro_iact.h
index 6afb9d8d38a4fc7f1d38b7286720ddb7f3c51ab4..b3b81a9a0dfe41e7bfafe51050d6f7cf7157e31c 100644
--- a/src/hydro/Minimal/hydro_iact.h
+++ b/src/hydro/Minimal/hydro_iact.h
@@ -16,8 +16,8 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  *
  ******************************************************************************/
-#ifndef SWIFT_RUNNER_IACT_H
-#define SWIFT_RUNNER_IACT_H
+#ifndef SWIFT_RUNNER_IACT_MINIMAL_H
+#define SWIFT_RUNNER_IACT_MINIMAL_H
 
 /* Includes. */
 #include "const.h"
@@ -38,33 +38,31 @@
 __attribute__((always_inline)) INLINE static void runner_iact_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
-  float r = sqrtf(r2);
-  float xi, xj;
-  float h_inv;
   float wi, wj, wi_dx, wj_dx;
-  float mi, mj;
+
+  const float r = sqrtf(r2);
 
   /* Get the masses. */
-  mi = pi->mass;
-  mj = pj->mass;
+  const float mi = pi->mass;
+  const float mj = pj->mass;
 
   /* Compute density of pi. */
-  h_inv = 1.0 / hi;
-  xi = r * h_inv;
+  const float hi_inv = 1.f / hi;
+  const float xi = r * hi_inv;
   kernel_deval(xi, &wi, &wi_dx);
 
   pi->rho += mj * wi;
-  pi->rho_dh -= mj * (3.0 * wi + xi * wi_dx);
+  pi->rho_dh -= mj * (3.f * wi + xi * wi_dx);
   pi->density.wcount += wi;
   pi->density.wcount_dh -= xi * wi_dx;
 
   /* Compute density of pj. */
-  h_inv = 1.f / hj;
-  xj = r * h_inv;
+  const float hj_inv = 1.f / hj;
+  const float xj = r * hj_inv;
   kernel_deval(xj, &wj, &wj_dx);
 
   pj->rho += mi * wj;
-  pj->rho_dh -= mi * (3.0 * wj + xj * wj_dx);
+  pj->rho_dh -= mi * (3.f * wj + xj * wj_dx);
   pj->density.wcount += wj;
   pj->density.wcount_dh -= xj * wj_dx;
 }
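
The rho_dh terms follow from differentiating the density sum with respect
to the smoothing length. Writing W(r, h) = w(x) / h^3 with x = r / h:

    d rho_i / dh = d/dh [ sum_j m_j w(x_j) / h^3 ]
                 = -(1/h) * sum_j m_j (3 w(x_j) + x_j w'(x_j)) / h^3

so the loop accumulates -m_j (3 w + x w') per neighbour; the remaining
factors of h are presumably applied when the density loop is finalised.
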
@@ -76,24 +74,20 @@ __attribute__((always_inline)) INLINE static void runner_iact_density(
 __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
     float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {
 
-  float r;
-  float xi;
-  float h_inv;
   float wi, wi_dx;
-  float mj;
 
   /* Get the masses. */
-  mj = pj->mass;
+  const float mj = pj->mass;
 
   /* Get r and r inverse. */
-  r = sqrtf(r2);
+  const float r = sqrtf(r2);
 
-  h_inv = 1.f / hi;
-  xi = r * h_inv;
+  const float h_inv = 1.f / hi;
+  const float xi = r * h_inv;
   kernel_deval(xi, &wi, &wi_dx);
 
   pi->rho += mj * wi;
-  pi->rho_dh -= mj * (3.0 * wi + xi * wi_dx);
+  pi->rho_dh -= mj * (3.f * wi + xi * wi_dx);
   pi->density.wcount += wi;
   pi->density.wcount_dh -= xi * wi_dx;
 }
@@ -148,7 +142,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
   /* Compute sound speeds */
   const float ci = sqrtf(const_hydro_gamma * pressurei / rhoi);
   const float cj = sqrtf(const_hydro_gamma * pressurej / rhoj);
-  float v_sig = ci + cj + 3.f * omega_ij;
+  const float v_sig = ci + cj + 3.f * omega_ij;
 
   /* SPH acceleration term */
   const float sph_term = (P_over_rho_i * wi_dr + P_over_rho_j * wj_dr) * r_inv;
@@ -225,7 +219,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
   /* Compute sound speeds */
   const float ci = sqrtf(const_hydro_gamma * pressurei / rhoi);
   const float cj = sqrtf(const_hydro_gamma * pressurej / rhoj);
-  float v_sig = ci + cj + 3.f * omega_ij;
+  const float v_sig = ci + cj + 3.f * omega_ij;
 
   /* SPH acceleration term */
   const float sph_term = (P_over_rho_i * wi_dr + P_over_rho_j * wj_dr) * r_inv;
@@ -245,4 +239,4 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
   pi->force.v_sig = fmaxf(pi->force.v_sig, v_sig);
 }
 
-#endif /* SWIFT_RUNNER_IACT_H */
+#endif /* SWIFT_RUNNER_IACT_MINIMAL_H */
diff --git a/src/hydro/Minimal/hydro_io.h b/src/hydro/Minimal/hydro_io.h
index 2c56fb489ab84ca7c30426b54cf95e26e3821084..afe5de83f423e43b4d2480cca1ac3e84d6c549de 100644
--- a/src/hydro/Minimal/hydro_io.h
+++ b/src/hydro/Minimal/hydro_io.h
@@ -56,6 +56,8 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unused in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ * the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param N The number of particles on that MPI rank.
  * @param N_total The total number of particles (only used in MPI mode)
@@ -67,26 +69,31 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void hydro_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int N, long long N_total,
-    int mpi_rank, long long offset, struct part* parts, struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile, int N,
+    long long N_total, int mpi_rank, long long offset, struct part* parts,
+    struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts,
-             N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts,
-             N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
-  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, a_hydro, us, UNIT_CONV_ACCELERATION);
-  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             N, 3, parts, N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, N, 1,
+             parts, N_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, u, us,
+             UNIT_CONV_ENERGY_PER_UNIT_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us,
+             UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Acceleration", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, a_hydro, us,
+             UNIT_CONV_ACCELERATION);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Density", FLOAT, N,
+             1, parts, N_total, mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
 }
 
 /**
diff --git a/src/multipole.h b/src/multipole.h
index 91ba6df965ce9d3b088d538411b7f0a8555ba0e4..b7c20ddff5c3f1afc00af501a53b9659c8728ce8 100644
--- a/src/multipole.h
+++ b/src/multipole.h
@@ -127,7 +127,7 @@ __attribute__((always_inline)) INLINE static void multipole_iact_mp(
 
 /* Compute the forces on both multipoles. */
 #if multipole_order == 1
-  for (k = 0; k < 3; k++) p->a[k] += dx[k] * acc;
+  for (k = 0; k < 3; k++) p->a_grav[k] += dx[k] * acc;
 #else
 #error( "Multipoles of order %i not yet implemented." , multipole_order )
 #endif
diff --git a/src/parallel_io.c b/src/parallel_io.c
index cffa99a0fd75566ec3e850076d15e104504eeb40..0076c225e1c5361287280f8a567c8062aefd914e 100644
--- a/src/parallel_io.c
+++ b/src/parallel_io.c
@@ -178,9 +178,10 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
  *
  * Calls #error() if an error occurs.
  */
-void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                       enum DATA_TYPE type, int N, int dim, long long N_total,
-                       int mpi_rank, long long offset, char* part_c,
+void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
+                       char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                       int N, int dim, long long N_total, int mpi_rank,
+                       long long offset, char* part_c, size_t partSize,
                        struct UnitSystem* us,
                        enum UnitConversionFactor convFactor) {
   hid_t h_data = 0, h_err = 0, h_memspace = 0, h_filespace = 0, h_plist_id = 0;
@@ -189,7 +190,6 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
-  const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
   char buffer[150];
 
@@ -269,7 +269,9 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   }
 
   /* Write XMF description for this data set */
-  if (mpi_rank == 0) writeXMFline(xmfFile, fileName, name, N_total, dim, type);
+  if (mpi_rank == 0)
+    writeXMFline(xmfFile, fileName, partTypeGroupName, name, N_total, dim,
+                 type);
 
   /* Write unit conversion factors for this data set */
   conversionString(buffer, us, convFactor);
@@ -328,14 +330,16 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, \
-                   mpi_rank, offset, field, us, convFactor)                   \
-  writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, N_total,      \
-                    mpi_rank, offset, (char*)(&(part[0]).field), us,          \
-                    convFactor)
+#define writeArray(grp, fileName, xmfFile, pTypeGroupName, name, type, N, dim, \
+                   part, N_total, mpi_rank, offset, field, us, convFactor)     \
+  writeArrayBackEnd(grp, fileName, xmfFile, pTypeGroupName, name, type, N,     \
+                    dim, N_total, mpi_rank, offset, (char*)(&(part[0]).field), \
+                    sizeof(part[0]), us, convFactor)
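
The macro works by passing the address of the requested field in the first
array element together with the size of a whole element, so the back-end
can walk any particle array generically. A hedged sketch of the gather this
enables (variable names follow the back-end signature above):

    /* The field of element i lives at part_c + i * partSize and spans
     * copySize (= typeSize * dim) bytes. */
    for (int i = 0; i < N; i++)
      memcpy(&temp_c[i * copySize], part_c + i * partSize, copySize);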
 
 /* Import the right hydro definition */
 #include "hydro_io.h"
+/* Import the right gravity definition */
+#include "gravity_io.h"
 
 /**
  * @brief Reads an HDF5 initial condition file (GADGET-3 type) in parallel
@@ -357,16 +361,17 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  *
  */
 void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
-                      size_t* N, int* periodic, int mpi_rank, int mpi_size,
-                      MPI_Comm comm, MPI_Info info) {
+                      struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
+                      int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                      MPI_Info info) {
   hid_t h_file = 0, h_grp = 0;
-  double boxSize[3] = {
-      0.0, -1.0, -1.0}; /* GADGET has only cubic boxes (in cosmological mode) */
-  int numParticles[6] = {
-      0}; /* GADGET has 6 particle types. We only keep the type 0*/
-  int numParticles_highWord[6] = {0};
-  long long offset = 0;
-  long long N_total = 0;
+  /* GADGET has only cubic boxes (in cosmological mode) */
+  double boxSize[3] = {0.0, -1.0, -1.0};
+  int numParticles[NUM_PARTICLE_TYPES] = {0};
+  int numParticles_highWord[NUM_PARTICLE_TYPES] = {0};
+  size_t N[NUM_PARTICLE_TYPES] = {0};
+  long long N_total[NUM_PARTICLE_TYPES] = {0};
+  long long offset[NUM_PARTICLE_TYPES] = {0};
 
   /* Open file */
   /* message("Opening file '%s' as IC.", fileName); */
@@ -398,58 +403,116 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
   readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
   readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
 
-  N_total = ((long long)numParticles[0]) +
-            ((long long)numParticles_highWord[0] << 32);
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+    N_total[ptype] = ((long long)numParticles[ptype]) +
+                     ((long long)numParticles_highWord[ptype] << 32);
+
   dim[0] = boxSize[0];
   dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1];
   dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2];
 
-  /* message("Found %d particles in a %speriodic box of size [%f %f %f].",  */
-  /* 	 N_total, (periodic ? "": "non-"), dim[0], dim[1], dim[2]); */
+  /* message("Found %d particles in a %speriodic box of size
+   * [%f %f %f].",  */
+  /* 	 N_total, (periodic ? "": "non-"), dim[0],
+   * dim[1], dim[2]); */
 
   /* Divide the particles among the tasks. */
-  offset = mpi_rank * N_total / mpi_size;
-  *N = (mpi_rank + 1) * N_total / mpi_size - offset;
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+    offset[ptype] = mpi_rank * N_total[ptype] / mpi_size;
+    N[ptype] = (mpi_rank + 1) * N_total[ptype] / mpi_size - offset[ptype];
+  }
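
A worked example of the split (illustrative numbers): for N_total = 10
particles of a type over mpi_size = 4 ranks,

    rank 0: offset = 0, N = 2
    rank 1: offset = 2, N = 3
    rank 2: offset = 5, N = 2
    rank 3: offset = 7, N = 3

so the counts always sum to N_total with no gaps or overlaps.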
 
   /* Close header */
   H5Gclose(h_grp);
 
-  /* Allocate memory to store particles */
-  if (posix_memalign((void*)parts, part_align, *N * sizeof(struct part)) != 0)
+  /* Allocate memory to store SPH particles */
+  *Ngas = N[0];
+  if (posix_memalign((void*)parts, part_align, (*Ngas) * sizeof(struct part)) !=
+      0)
     error("Error while allocating memory for particles");
-  bzero(*parts, *N * sizeof(struct part));
+  bzero(*parts, *Ngas * sizeof(struct part));
 
-  /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) /
+  /* Allocate memory to store all particles */
+  const size_t Ndm = N[1];
+  *Ngparts = N[1] + N[0];
+  if (posix_memalign((void*)gparts, gpart_align,
+                     *Ngparts * sizeof(struct gpart)) != 0)
+    error("Error while allocating memory for gravity particles");
+  bzero(*gparts, *Ngparts * sizeof(struct gpart));
+
+  /* message("Allocated %8.2f MB for particles.", *N *
+   * sizeof(struct part) /
    * (1024.*1024.)); */
 
-  /* Open SPH particles group */
-  /* message("Reading particle arrays..."); */
-  h_grp = H5Gopen(h_file, "/PartType0", H5P_DEFAULT);
-  if (h_grp < 0) error("Error while opening particle group.\n");
+  /* message("BoxSize = %lf", dim[0]); */
+  /* message("NumPart = [%zd, %zd] Total = %zd", *Ngas, Ndm,
+   * *Ngparts); */
 
-  /* Read particle fields into the particle structure */
-  hydro_read_particles(h_grp, *N, N_total, offset, *parts);
+  /* Loop over all particle types */
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
 
-  /* Close particle group */
-  H5Gclose(h_grp);
+    /* Don't do anything if no particle of this kind */
+    if (N_total[ptype] == 0) continue;
+
+    /* Open the particle group in the file */
+    char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+    snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+             ptype);
+    h_grp = H5Gopen(h_file, partTypeGroupName, H5P_DEFAULT);
+    if (h_grp < 0) {
+      error("Error while opening particle group %s.", partTypeGroupName);
+    }
+
+    /* Read particle fields into the particle structure */
+    switch (ptype) {
+
+      case GAS:
+        hydro_read_particles(h_grp, N[ptype], N_total[ptype], offset[ptype],
+                             *parts);
+        break;
+
+      case DM:
+        darkmatter_read_particles(h_grp, N[ptype], N_total[ptype],
+                                  offset[ptype], *gparts);
+        break;
+
+      default:
+        error("Particle Type %d not yet supported. Aborting", ptype);
+    }
+
+    /* Close particle group */
+    H5Gclose(h_grp);
+  }
+
+  /* Prepare the DM particles */
+  prepare_dm_gparts(*gparts, Ndm);
+
+  /* Now duplicate the hydro particles into gparts */
+  duplicate_hydro_gparts(*parts, *gparts, *Ngas, Ndm);
+
+  /* message("Done Reading particles..."); */
 
   /* Close property handler */
   H5Pclose(h_plist_id);
 
   /* Close file */
   H5Fclose(h_file);
-
-  /* message("Done Reading particles..."); */
 }
 
 /**
- * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor
+ * @brief Writes an HDF5 output file (GADGET-3 type) with
+ * its XMF descriptor
  *
  * @param e The engine containing all the system.
- * @param us The UnitSystem used for the conversion of units in the output
+ * @param us The UnitSystem used for the conversion of units
+ * in the output
  *
- * Creates an HDF5 output file and writes the particles contained
- * in the engine. If such a file already exists, it is erased and replaced
+ * Creates an HDF5 output file and writes the particles
+ * contained
+ * in the engine. If such a file already exists, it is
+ * erased and replaced
  * by the new one.
  * The companion XMF file is also updated accordingly.
  *
@@ -459,23 +522,27 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
 void write_output_parallel(struct engine* e, struct UnitSystem* us,
                            int mpi_rank, int mpi_size, MPI_Comm comm,
                            MPI_Info info) {
-
   hid_t h_file = 0, h_grp = 0, h_grpsph = 0;
-  int N = e->s->nr_parts;
+  const size_t Ngas = e->s->nr_parts;
+  const size_t Ntot = e->s->nr_gparts;
   int periodic = e->s->periodic;
-  unsigned int numParticles[6] = {N, 0};
-  unsigned int numParticlesHighWord[6] = {0};
-  unsigned int flagEntropy[6] = {0};
-  long long N_total = 0, offset = 0;
-  double offset_d = 0., N_d = 0., N_total_d = 0.;
   int numFiles = 1;
   struct part* parts = e->s->parts;
-  FILE* xmfFile = 0;
+  struct gpart* gparts = e->s->gparts;
+  struct gpart* dmparts = NULL;
   static int outputCount = 0;
+  FILE* xmfFile = 0;
+
+  /* Number of particles of each type */
+  // const size_t Ndm = Ntot - Ngas;
+
+  /* MATTHIEU: Temporary fix to preserve master */
+  const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
+  /* MATTHIEU: End temporary fix */
 
   /* File name */
-  char fileName[200];
-  sprintf(fileName, "output_%03i.hdf5", outputCount);
+  char fileName[FILENAME_BUFFER_SIZE];
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "output_%03i.hdf5", outputCount);
 
   /* First time, we need to create the XMF file */
   if (outputCount == 0 && mpi_rank == 0) createXMFfile();
@@ -491,21 +558,26 @@ void write_output_parallel(struct engine* e, struct UnitSystem* us,
     error("Error while opening file '%s'.", fileName);
   }
 
-  /* Compute offset in the file and total number of particles */
-  /* Done using double to allow for up to 2^50=10^15 particles */
-  N_d = (double)N;
-  MPI_Exscan(&N_d, &offset_d, 1, MPI_DOUBLE, MPI_SUM, comm);
-  N_total_d = offset_d + N_d;
-  MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size - 1, comm);
-  if (N_total_d > 1.e15)
-    error(
-        "Error while computing the offset for parallel output: Simulation has "
-        "more than 10^15 particles.\n");
-  N_total = (long long)N_total_d;
-  offset = (long long)offset_d;
+  /* Compute offset in the file and total number of particles */
+  long long N[NUM_PARTICLE_TYPES] = {(long long)Ngas, (long long)Ndm, 0};
+  long long N_total[NUM_PARTICLE_TYPES] = {0};
+  long long offset[NUM_PARTICLE_TYPES] = {0};
+  MPI_Exscan(N, offset, NUM_PARTICLE_TYPES, MPI_LONG_LONG, MPI_SUM, comm);
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+    N_total[ptype] = offset[ptype] + N[ptype];
+
+  /* The last rank now has the correct N_total. Let's broadcast from there. */
+  MPI_Bcast(N_total, NUM_PARTICLE_TYPES, MPI_LONG_LONG, mpi_size - 1, comm);
 
-  /* Write the part of the XMF file corresponding to this specific output */
-  if (mpi_rank == 0) writeXMFheader(xmfFile, N_total, fileName, e->time);
+  /* Now everybody knows its offset and the total number of particles of
+   * each type */
+
+  /* Write the part of the XMF file corresponding to this specific output */
+  if (mpi_rank == 0) writeXMFoutputheader(xmfFile, fileName, e->time);
 
   /* Open header to write simulation properties */
   /* message("Writing runtime parameters..."); */
@@ -526,19 +598,28 @@ void write_output_parallel(struct engine* e, struct UnitSystem* us,
 
   /* Print the relevant information and print status */
   writeAttribute(h_grp, "BoxSize", DOUBLE, e->s->dim, 3);
-  writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles, 6);
   double dblTime = e->time;
   writeAttribute(h_grp, "Time", DOUBLE, &dblTime, 1);
 
   /* GADGET-2 legacy values */
-  numParticles[0] = (unsigned int)N_total;
-  writeAttribute(h_grp, "NumPart_Total", UINT, numParticles, 6);
-  numParticlesHighWord[0] = (unsigned int)(N_total >> 32);
+  /* Number of particles of each type */
+  unsigned int numParticles[NUM_PARTICLE_TYPES] = {0};
+  unsigned int numParticlesHighWord[NUM_PARTICLE_TYPES] = {0};
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+    numParticles[ptype] = (unsigned int)N_total[ptype];
+    numParticlesHighWord[ptype] = (unsigned int)(N_total[ptype] >> 32);
+  }
+  writeAttribute(h_grp, "NumPart_ThisFile", LONGLONG, N_total,
+                 NUM_PARTICLE_TYPES);
+  writeAttribute(h_grp, "NumPart_Total", UINT, numParticles,
+                 NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord,
-                 6);
+                 NUM_PARTICLE_TYPES);
   double MassTable[6] = {0., 0., 0., 0., 0., 0.};
-  writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, 6);
-  writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, 6);
+  writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES);
+  unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0};
+  writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy,
+                 NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1);
 
   /* Close header */
@@ -556,21 +637,71 @@ void write_output_parallel(struct engine* e, struct UnitSystem* us,
   /* Print the system of Units */
   writeUnitSystem(h_file, us);
 
-  /* Create SPH particles group */
-  /* message("Writing particle arrays..."); */
-  h_grp =
-      H5Gcreate(h_file, "/PartType0", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
-  if (h_grp < 0) error("Error while creating particle group.\n");
+  /* Loop over all particle types */
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
+
+    /* Don't do anything if no particle of this kind */
+    if (N_total[ptype] == 0) continue;
+
+    /* Add the global information for that particle type to
+     * the XMF meta-file */
+    if (mpi_rank == 0)
+      writeXMFgroupheader(xmfFile, fileName, N_total[ptype], ptype);
+
+    /* Open the particle group in the file */
+    char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+    snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+             ptype);
+    h_grp = H5Gcreate(h_file, partTypeGroupName, H5P_DEFAULT, H5P_DEFAULT,
+                      H5P_DEFAULT);
+    if (h_grp < 0) {
+      error("Error while opening particle group %s.", partTypeGroupName);
+    }
 
-  /* Write particle fields from the particle structure */
-  hydro_write_particles(h_grp, fileName, xmfFile, N, N_total, mpi_rank, offset,
-                        parts, us);
+    /* Read particle fields into the particle structure */
+    switch (ptype) {
 
-  /* Close particle group */
-  H5Gclose(h_grp);
+      case GAS:
+        hydro_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                              N[ptype], N_total[ptype], mpi_rank, offset[ptype],
+                              parts, us);
+
+        break;
+
+      case DM:
+        /* Allocate temporary array */
+        if (posix_memalign((void*)&dmparts, gpart_align,
+                           Ndm * sizeof(struct gpart)) != 0)
+          error("Error while allocating temporary memory for DM particles");
+        bzero(dmparts, Ndm * sizeof(struct gpart));
+
+        /* Collect the DM particles from gpart */
+        collect_dm_gparts(gparts, Ntot, dmparts, Ndm);
+
+        /* Write DM particles */
+        darkmatter_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                                   N[ptype], N_total[ptype], mpi_rank,
+                                   offset[ptype], dmparts, us);
+
+        /* Free temporary array */
+        free(dmparts);
+        break;
+
+      default:
+        error("Particle Type %d not yet supported. Aborting", ptype);
+    }
+
+    /* Close particle group */
+    H5Gclose(h_grp);
+
+    /* Close this particle group in the XMF file as well */
+    if (mpi_rank == 0) writeXMFgroupfooter(xmfFile, ptype);
+  }
 
  /* Write XMF file descriptor */
-  if (mpi_rank == 0) writeXMFfooter(xmfFile);
+  if (mpi_rank == 0) writeXMFoutputfooter(xmfFile, outputCount, e->time);
 
   /* message("Done writing particles..."); */
 
diff --git a/src/parallel_io.h b/src/parallel_io.h
index a0589944ec845c712abde1e64e305980748db0e7..663f0aabac44c08682b964512839b925673ea5c5 100644
--- a/src/parallel_io.h
+++ b/src/parallel_io.h
@@ -32,8 +32,9 @@
 #if defined(HAVE_HDF5) && defined(WITH_MPI) && defined(HAVE_PARALLEL_HDF5)
 
 void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
-                      size_t* N, int* periodic, int mpi_rank, int mpi_size,
-                      MPI_Comm comm, MPI_Info info);
+                      struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
+                      int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                      MPI_Info info);
 
 void write_output_parallel(struct engine* e, struct UnitSystem* us,
                            int mpi_rank, int mpi_size, MPI_Comm comm,
diff --git a/src/parser.c b/src/parser.c
new file mode 100644
index 0000000000000000000000000000000000000000..06dc819842d54d952704e4e0c40ebec5b561f691
--- /dev/null
+++ b/src/parser.c
@@ -0,0 +1,265 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
+/* Needs to be included so that strtok returns char * instead of an int *. */
+#include <string.h>
+#include <stdlib.h>
+
+/* This object's header. */
+#include "parser.h"
+
+/* Local headers. */
+#include "error.h"
+
+/* Private functions. */
+static int count_char(char *str, char val);
+static void parse_line(FILE *fp, struct swift_params *params);
+
+/**
+ * @brief Reads an input file and stores each parameter in a structure.
+ *
+ * @param file_name Name of file to be read
+ * @param params Structure to be populated from file
+ */
+
+void parser_read_file(const char *file_name, struct swift_params *params) {
+
+  FILE *fp;
+
+  params->count = 0;
+
+  /* Open file for reading */
+  fp = fopen(file_name, "r");
+
+  if (fp == NULL) {
+    error("Error opening parameter file: %s", file_name);
+  }
+
+  /* Read until the end of the file is reached.*/
+  while (!feof(fp)) {
+    parse_line(fp, params);
+  }
+
+  fclose(fp);
+}
+
+/**
+ * @brief Counts the number of times a specific character appears in a string.
+ *
+ * @param str String to be checked
+ * @param val Character to be counted
+ */
+
+static int count_char(char *str, char val) {
+
+  int count = 0;
+
+  /* Check if the line contains the character */
+  while (*str) {
+    if (*str++ == val) ++count;
+  }
+
+  return count;
+}
+
+/**
+ * @brief Parses a line from a file and stores any parameters in a structure.
+ *
+ * @param fp File pointer to file to be read
+ * @param params Structure to be populated from file
+ *
+ */
+
+static void parse_line(FILE *fp, struct swift_params *params) {
+
+  char line[PARSER_MAX_LINE_SIZE];
+  char trim_line[PARSER_MAX_LINE_SIZE];
+
+  /* Read a line of the file */
+  if (fgets(line, PARSER_MAX_LINE_SIZE, fp) != NULL) {
+
+    char *token;
+    /* Remove comments */
+    token = strtok(line, PARSER_COMMENT_CHAR);
+    strcpy(trim_line, token);
+
+    /* Check if the line contains a value */
+    if (strchr(trim_line, PARSER_VALUE_CHAR)) {
+      /* Check for more than one parameter on the same line. */
+      if (count_char(trim_line, PARSER_VALUE_CHAR) > 1) {
+        error("Found more than one parameter in '%s', only one allowed.", line);
+      } else {
+        /* Take first token as the parameter name. */
+        token = strtok(trim_line, PARSER_VALUE_STRING);
+        strcpy(params->data[params->count].name, token);
+
+        /* Take second token as the parameter value. */
+        token = strtok(NULL, " #\n");
+        if (token == NULL)
+          error("Missing value for parameter '%s'.", trim_line);
+        strcpy(params->data[params->count++].value, token);
+      }
+    }
+  }
+}
+
+/**
+ * @brief Retrieve integer parameter from structure.
+ *
+ * @param params Structure that holds the parameters
+ * @param name Name of the parameter to be found
+ * @param retParam Value of the parameter found
+ *
+ */
+
+void parser_get_param_int(struct swift_params *params, char *name,
+                          int *retParam) {
+
+  char str[128] = {0};
+
+  for (int i = 0; i < params->count; i++) {
+
+    /*strcmp returns 0 if both strings are the same.*/
+    if (!strcmp(name, params->data[i].name)) {
+
+      /* Check that exactly one number is parsed. */
+      if (sscanf(params->data[i].value, "%d%s", retParam, str) != 1) {
+        error(
+            "Tried parsing int '%s' but found '%s' with illegal integer "
+            "characters '%s'.",
+            params->data[i].name, params->data[i].value, str);
+      }
+
+      return;
+    }
+  }
+
+  message("Cannot find '%s' in the structure.", name);
+}
+
+/**
+ * @brief Retrieve float parameter from structure.
+ *
+ * @param params Structure that holds the parameters
+ * @param name Name of the parameter to be found
+ * @param retParam Value of the parameter found
+ *
+ */
+
+void parser_get_param_float(struct swift_params *params, char *name,
+                            float *retParam) {
+
+  char str[128] = {0};
+
+  for (int i = 0; i < params->count; i++) {
+
+    /*strcmp returns 0 if both strings are the same.*/
+    if (!strcmp(name, params->data[i].name)) {
+
+      /* Check that exactly one number is parsed. */
+      if (sscanf(params->data[i].value, "%f%s", retParam, str) != 1) {
+        error(
+            "Tried parsing float '%s' but found '%s' with illegal float "
+            "characters '%s'.",
+            params->data[i].name, params->data[i].value, str);
+      }
+
+      return;
+    }
+  }
+
+  message("Cannot find '%s' in the structure.", name);
+}
+
+/**
+ * @brief Retrieve double parameter from structure.
+ *
+ * @param params Structure that holds the parameters
+ * @param name Name of the parameter to be found
+ * @param retParam Value of the parameter found
+ *
+ */
+
+void parser_get_param_double(struct swift_params *params, char *name,
+                             double *retParam) {
+
+  char str[128] = {0};
+
+  for (int i = 0; i < params->count; i++) {
+
+    /*strcmp returns 0 if both strings are the same.*/
+    if (!strcmp(name, params->data[i].name)) {
+
+      /* Check that exactly one number is parsed. */
+      if (sscanf(params->data[i].value, "%lf", retParam) != 1) {
+        error(
+            "Tried parsing double '%s' but found '%s' with illegal double "
+            "characters '%s'.",
+            params->data[i].name, params->data[i].value, str);
+      }
+
+      return;
+    }
+  }
+
+  message("Cannot find '%s' in the structure.", name);
+}
+
+/**
+ * @brief Retrieve string parameter from structure.
+ *
+ * @param params Structure that holds the parameters
+ * @param name Name of the parameter to be found
+ * @param retParam Value of the parameter found
+ *
+ */
+
+void parser_get_param_string(struct swift_params *params, char *name,
+                             char *retParam) {
+
+  for (int i = 0; i < params->count; i++) {
+
+    /*strcmp returns 0 if both strings are the same.*/
+    if (!strcmp(name, params->data[i].name)) {
+      strcpy(retParam, params->data[i].value);
+      return;
+    }
+  }
+
+  message("Cannot find '%s' in the structure.", name);
+}
+
+/**
+ * @brief Prints the contents of the parameter structure.
+ *
+ * @param params Structure that holds the parameters
+ *
+ */
+
+void parser_print_params(struct swift_params *params) {
+
+  printf("\n--------------------------\n");
+  printf("|  SWIFT Parameter File  |\n");
+  printf("--------------------------\n");
+
+  for (int i = 0; i < params->count; i++) {
+    printf("Parameter name: %s\n", params->data[i].name);
+    printf("Parameter value: %s\n", params->data[i].value);
+  }
+}
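
For context, parse_line() above accepts files made of one "name: value" pair
per line, with '#' starting a comment. An illustrative parameter file (not
shipped with the patch; names and values are made up):

# Example SWIFT parameter file (illustrative)
dt_min: 1e-6      # minimal time-step size
dt_max: 1e-2      # maximal time-step size
ic_file: input.hdf5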
diff --git a/src/parser.h b/src/parser.h
new file mode 100644
index 0000000000000000000000000000000000000000..2fb4148944cd423da016341744cb6d58e222182e
--- /dev/null
+++ b/src/parser.h
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_PARSER_H
+#define SWIFT_PARSER_H
+
+#include <stdio.h>
+
+#define PARSER_MAX_LINE_SIZE 128
+#define PARSER_MAX_NO_OF_PARAMS 512
+
+#define PARSER_COMMENT_CHAR "#"
+#define PARSER_VALUE_CHAR ':'
+#define PARSER_VALUE_STRING ":"
+#define PARSER_END_OF_FILE "..."
+
+struct parameter {
+  char name[PARSER_MAX_LINE_SIZE];
+  char value[PARSER_MAX_LINE_SIZE];
+};
+
+struct swift_params {
+  struct parameter data[PARSER_MAX_NO_OF_PARAMS];
+  int count;
+};
+
+/* Public API. */
+void parser_read_file(const char *file_name, struct swift_params *params);
+void parser_print_params(struct swift_params *params);
+void parser_get_param_int(struct swift_params *params, char *name,
+                          int *retParam);
+void parser_get_param_float(struct swift_params *params, char *name,
+                            float *retParam);
+void parser_get_param_double(struct swift_params *params, char *name,
+                             double *retParam);
+void parser_get_param_string(struct swift_params *params, char *name,
+                             char *retParam);
+
+#endif /* SWIFT_PARSER_H */
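
A minimal usage sketch of the public API declared above, assuming a parameter
file like the illustrative one shown earlier; "params.yml" is a placeholder
file name, not something the patch mandates:

#include <stdio.h>
#include "parser.h"

int main(void) {
  struct swift_params params;
  parser_read_file("params.yml", &params); /* aborts via error() on failure */
  parser_print_params(&params);

  double dt_min = 0.;
  char ic_file[PARSER_MAX_LINE_SIZE];
  parser_get_param_double(&params, "dt_min", &dt_min);
  parser_get_param_string(&params, "ic_file", ic_file);
  printf("dt_min=%e ICs=%s\n", dt_min, ic_file);
  return 0;
}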
diff --git a/src/part.c b/src/part.c
index fa87a50d0c4407ac7e20963ca99a8419187f0eee..d5a2bc0ec82c44219509d338f9a5108b3821e11e 100644
--- a/src/part.c
+++ b/src/part.c
@@ -26,6 +26,7 @@
 #endif
 
 /* This object's header. */
+#include "error.h"
 #include "part.h"
 
 /**
@@ -57,30 +58,17 @@ void part_relink_parts(struct gpart *gparts, size_t N, struct part *parts) {
 }
 
 #ifdef WITH_MPI
-/**
- * @brief Registers and returns an MPI type for the particles
- *
- * @param part_type The type container
- */
-void part_create_mpi_type(MPI_Datatype* part_type) {
-
-  /* This is not the recommended way of doing this.
-     One should define the structure field by field
-     But as long as we don't do serialization via MPI-IO
-     we don't really care.
-     Also we would have to modify this function everytime something
-     is added to the part structure. */
-  MPI_Type_contiguous(sizeof(struct part) / sizeof(unsigned char), MPI_BYTE,
-                      part_type);
-  MPI_Type_commit(part_type);
-}
+/* MPI data type for the particle transfers */
+MPI_Datatype part_mpi_type;
+MPI_Datatype xpart_mpi_type;
+MPI_Datatype gpart_mpi_type;
+#endif
 
+#ifdef WITH_MPI
 /**
- * @brief Registers and returns an MPI type for the xparticles
- *
- * @param xpart_type The type container
+ * @brief Registers MPI particle types.
  */
-void xpart_create_mpi_type(MPI_Datatype* xpart_type) {
+void part_create_mpi_types(void) {
 
   /* This is not the recommended way of doing this.
      One should define the structure field by field
@@ -88,9 +76,20 @@ void xpart_create_mpi_type(MPI_Datatype* xpart_type) {
      we don't really care.
      Also we would have to modify this function everytime something
      is added to the part structure. */
-  MPI_Type_contiguous(sizeof(struct xpart) / sizeof(unsigned char), MPI_BYTE,
-                      xpart_type);
-  MPI_Type_commit(xpart_type);
+  if (MPI_Type_contiguous(sizeof(struct part) / sizeof(unsigned char), MPI_BYTE,
+                          &part_mpi_type) != MPI_SUCCESS ||
+      MPI_Type_commit(&part_mpi_type) != MPI_SUCCESS) {
+    error("Failed to create MPI type for parts.");
+  }
+  if (MPI_Type_contiguous(sizeof(struct xpart) / sizeof(unsigned char),
+                          MPI_BYTE, &xpart_mpi_type) != MPI_SUCCESS ||
+      MPI_Type_commit(&xpart_mpi_type) != MPI_SUCCESS) {
+    error("Failed to create MPI type for xparts.");
+  }
+  if (MPI_Type_contiguous(sizeof(struct gpart) / sizeof(unsigned char),
+                          MPI_BYTE, &gpart_mpi_type) != MPI_SUCCESS ||
+      MPI_Type_commit(&gpart_mpi_type) != MPI_SUCCESS) {
+    error("Failed to create MPI type for gparts.");
+  }
 }
-
 #endif
diff --git a/src/part.h b/src/part.h
index fa52cefc0d2561a8daa83b4c507e361f1e281f58..1fba171a46cecb7df6ea20ff28ba3bbaefecc7d1 100644
--- a/src/part.h
+++ b/src/part.h
@@ -36,8 +36,8 @@
 
 /* Some constants. */
 #define part_align 64
-#define gpart_align 32
 #define xpart_align 32
+#define gpart_align 32
 
 /* Import the right particle definition */
 #if defined(MINIMAL_SPH)
@@ -55,8 +55,12 @@
 void part_relink_gparts(struct part *parts, size_t N, ptrdiff_t offset);
 void part_relink_parts(struct gpart *gparts, size_t N, struct part *parts);
 #ifdef WITH_MPI
-void part_create_mpi_type(MPI_Datatype* part_type);
-void xpart_create_mpi_type(MPI_Datatype* xpart_type);
+/* MPI data type for the particle transfers */
+extern MPI_Datatype part_mpi_type;
+extern MPI_Datatype xpart_mpi_type;
+extern MPI_Datatype gpart_mpi_type;
+
+void part_create_mpi_types(void);
 #endif
 
 #endif /* SWIFT_PART_H */
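
With the MPI types now committed once and exposed as globals, the intended
call pattern is a single registration right after MPI_Init() and direct use
of the globals in communication calls. A sketch under that assumption (error
handling elided; the helper names are illustrative):

#ifdef WITH_MPI
#include <mpi.h>
#include "part.h"

void startup(int *argc, char ***argv) {
  MPI_Init(argc, argv);
  part_create_mpi_types(); /* register part/xpart/gpart types once */
}

void send_parts(struct part *buf, int count, int peer) {
  /* The committed global type replaces the old per-call creation. */
  MPI_Send(buf, count, part_mpi_type, peer, /* tag */ 0, MPI_COMM_WORLD);
}
#endif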
diff --git a/src/partition.c b/src/partition.c
index 0f8eb3ebe334d71228510307dd9ccc4e56e234b3..ea25bc132dacf19b7a5c12765d2a39313fc01486 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -424,7 +424,7 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID,
    * assume the same graph structure as used in the part_ calls). */
   int nr_cells = s->nr_cells;
   struct cell *cells = s->cells;
-  float wscale = 1e-3, vscale = 1e-3, wscale_buff;
+  float wscale = 1e-3, vscale = 1e-3, wscale_buff = 0.0;
   int wtot = 0;
   int wmax = 1e9 / nr_nodes;
   int wmin;
diff --git a/src/proxy.c b/src/proxy.c
index 7d2e546bf945ca18c2195ea2801d1b2058cb2f58..02263a5653bdcdd2d1bf0a86523ed1a599d4bf21 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -50,11 +50,9 @@ void proxy_cells_exch1(struct proxy *p) {
 
 #ifdef WITH_MPI
 
-  int k, ind;
-
   /* Get the number of pcells we will need to send. */
   p->size_pcells_out = 0;
-  for (k = 0; k < p->nr_cells_out; k++)
+  for (int k = 0; k < p->nr_cells_out; k++)
     p->size_pcells_out += p->cells_out[k]->pcell_size;
 
   /* Send the number of pcells. */
@@ -70,7 +68,7 @@ void proxy_cells_exch1(struct proxy *p) {
   if ((p->pcells_out = malloc(sizeof(struct pcell) * p->size_pcells_out)) ==
       NULL)
     error("Failed to allocate pcell_out buffer.");
-  for (ind = 0, k = 0; k < p->nr_cells_out; k++) {
+  for (int ind = 0, k = 0; k < p->nr_cells_out; k++) {
     memcpy(&p->pcells_out[ind], p->cells_out[k]->pcell,
            sizeof(struct pcell) * p->cells_out[k]->pcell_size);
     ind += p->cells_out[k]->pcell_size;
@@ -131,16 +129,14 @@ void proxy_cells_exch2(struct proxy *p) {
 
 void proxy_addcell_in(struct proxy *p, struct cell *c) {
 
-  int k;
-  struct cell **temp;
-
   /* Check if the cell is already registered with the proxy. */
-  for (k = 0; k < p->nr_cells_in; k++)
+  for (int k = 0; k < p->nr_cells_in; k++)
     if (p->cells_in[k] == c) return;
 
   /* Do we need to grow the number of in cells? */
   if (p->nr_cells_in == p->size_cells_in) {
     p->size_cells_in *= proxy_buffgrow;
+    struct cell **temp;
     if ((temp = malloc(sizeof(struct cell *) * p->size_cells_in)) == NULL)
       error("Failed to allocate incoming cell list.");
     memcpy(temp, p->cells_in, sizeof(struct cell *) * p->nr_cells_in);
@@ -162,16 +158,14 @@ void proxy_addcell_in(struct proxy *p, struct cell *c) {
 
 void proxy_addcell_out(struct proxy *p, struct cell *c) {
 
-  int k;
-  struct cell **temp;
-
   /* Check if the cell is already registered with the proxy. */
-  for (k = 0; k < p->nr_cells_out; k++)
+  for (int k = 0; k < p->nr_cells_out; k++)
     if (p->cells_out[k] == c) return;
 
   /* Do we need to grow the number of out cells? */
   if (p->nr_cells_out == p->size_cells_out) {
     p->size_cells_out *= proxy_buffgrow;
+    struct cell **temp;
     if ((temp = malloc(sizeof(struct cell *) * p->size_cells_out)) == NULL)
       error("Failed to allocate outgoing cell list.");
     memcpy(temp, p->cells_out, sizeof(struct cell *) * p->nr_cells_out);
@@ -195,20 +189,21 @@ void proxy_parts_exch1(struct proxy *p) {
 #ifdef WITH_MPI
 
   /* Send the number of particles. */
-  if (MPI_Isend(&p->nr_parts_out, 1, MPI_INT, p->nodeID,
+  p->buff_out[0] = p->nr_parts_out;
+  p->buff_out[1] = p->nr_gparts_out;
+  if (MPI_Isend(p->buff_out, 2, MPI_INT, p->nodeID,
                 p->mynodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD,
                 &p->req_parts_count_out) != MPI_SUCCESS)
     error("Failed to isend nr of parts.");
-  // message( "isent particle count (%i) from node %i to node %i." ,
-  // p->nr_parts_out , p->mynodeID , p->nodeID ); fflush(stdout);
+  /* message( "isent particle counts [%i, %i] from node %i to node %i." ,
+  p->buff_out[0], p->buff_out[1], p->mynodeID , p->nodeID ); fflush(stdout); */
 
   /* Send the particle buffers. */
   if (p->nr_parts_out > 0) {
-    if (MPI_Isend(p->parts_out, sizeof(struct part) * p->nr_parts_out, MPI_BYTE,
-                  p->nodeID, p->mynodeID * proxy_tag_shift + proxy_tag_parts,
+    if (MPI_Isend(p->parts_out, p->nr_parts_out, part_mpi_type, p->nodeID,
+                  p->mynodeID * proxy_tag_shift + proxy_tag_parts,
                   MPI_COMM_WORLD, &p->req_parts_out) != MPI_SUCCESS ||
-        MPI_Isend(p->xparts_out, sizeof(struct xpart) * p->nr_parts_out,
-                  MPI_BYTE, p->nodeID,
+        MPI_Isend(p->xparts_out, p->nr_parts_out, xpart_mpi_type, p->nodeID,
                   p->mynodeID * proxy_tag_shift + proxy_tag_xparts,
                   MPI_COMM_WORLD, &p->req_xparts_out) != MPI_SUCCESS)
       error("Failed to isend part data.");
@@ -219,14 +214,20 @@ void proxy_parts_exch1(struct proxy *p) {
               p->parts_out[k].id, p->parts_out[k].x[0], p->parts_out[k].x[1],
               p->parts_out[k].x[2], p->parts_out[k].h, p->nodeID);*/
   }
+  if (p->nr_gparts_out > 0) {
+    if (MPI_Isend(p->gparts_out, p->nr_gparts_out, gpart_mpi_type, p->nodeID,
+                  p->mynodeID * proxy_tag_shift + proxy_tag_gparts,
+                  MPI_COMM_WORLD, &p->req_gparts_out) != MPI_SUCCESS)
+      error("Failed to isend part data.");
+    // message( "isent gpart data (%i) to node %i." , p->nr_parts_out ,
+    // p->nodeID ); fflush(stdout);
+  }
 
   /* Receive the number of particles. */
-  if (MPI_Irecv(&p->nr_parts_in, 1, MPI_INT, p->nodeID,
+  if (MPI_Irecv(p->buff_in, 2, MPI_INT, p->nodeID,
                 p->nodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD,
                 &p->req_parts_count_in) != MPI_SUCCESS)
     error("Failed to irecv nr of parts.");
-// message( "irecv particle count on node %i from node %i." , p->mynodeID ,
-// p->nodeID ); fflush(stdout);
 
 #else
   error("SWIFT was not compiled with MPI support.");
@@ -237,6 +238,10 @@ void proxy_parts_exch2(struct proxy *p) {
 
 #ifdef WITH_MPI
 
+  /* Unpack the incoming particle counts. */
+  p->nr_parts_in = p->buff_in[0];
+  p->nr_gparts_in = p->buff_in[1];
+
   /* Is there enough space in the buffer? */
   if (p->nr_parts_in > p->size_parts_in) {
     do {
@@ -250,19 +255,36 @@ void proxy_parts_exch2(struct proxy *p) {
                                                p->size_parts_in)) == NULL)
       error("Failed to re-allocate parts_in buffers.");
   }
+  if (p->nr_gparts_in > p->size_gparts_in) {
+    do {
+      p->size_gparts_in *= proxy_buffgrow;
+    } while (p->nr_gparts_in > p->size_gparts_in);
+    free(p->gparts_in);
+    if ((p->gparts_in = (struct gpart *)malloc(sizeof(struct gpart) *
+                                               p->size_gparts_in)) == NULL)
+      error("Failed to re-allocate gparts_in buffers.");
+  }
 
   /* Receive the particle buffers. */
   if (p->nr_parts_in > 0) {
-    if (MPI_Irecv(p->parts_in, sizeof(struct part) * p->nr_parts_in, MPI_BYTE,
-                  p->nodeID, p->nodeID * proxy_tag_shift + proxy_tag_parts,
-                  MPI_COMM_WORLD, &p->req_parts_in) != MPI_SUCCESS ||
-        MPI_Irecv(p->xparts_in, sizeof(struct xpart) * p->nr_parts_in, MPI_BYTE,
-                  p->nodeID, p->nodeID * proxy_tag_shift + proxy_tag_xparts,
+    if (MPI_Irecv(p->parts_in, p->nr_parts_in, part_mpi_type, p->nodeID,
+                  p->nodeID * proxy_tag_shift + proxy_tag_parts, MPI_COMM_WORLD,
+                  &p->req_parts_in) != MPI_SUCCESS ||
+        MPI_Irecv(p->xparts_in, p->nr_parts_in, xpart_mpi_type, p->nodeID,
+                  p->nodeID * proxy_tag_shift + proxy_tag_xparts,
                   MPI_COMM_WORLD, &p->req_xparts_in) != MPI_SUCCESS)
       error("Failed to irecv part data.");
     // message( "irecv particle data (%i) from node %i." , p->nr_parts_in ,
     // p->nodeID ); fflush(stdout);
   }
+  if (p->nr_gparts_in > 0) {
+    if (MPI_Irecv(p->gparts_in, p->nr_gparts_in, gpart_mpi_type, p->nodeID,
+                  p->nodeID * proxy_tag_shift + proxy_tag_gparts,
+                  MPI_COMM_WORLD, &p->req_gparts_in) != MPI_SUCCESS)
+      error("Failed to irecv gpart data.");
+    // message( "irecv gpart data (%i) from node %i." , p->nr_gparts_in ,
+    // p->nodeID ); fflush(stdout);
+  }
 
 #else
   error("SWIFT was not compiled with MPI support.");
@@ -278,8 +300,8 @@ void proxy_parts_exch2(struct proxy *p) {
  * @param N The number of parts.
  */
 
-void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts,
-                      int N) {
+void proxy_parts_load(struct proxy *p, const struct part *parts,
+                      const struct xpart *xparts, int N) {
 
   /* Is there enough space in the buffer? */
   if (p->nr_parts_out + N > p->size_parts_out) {
@@ -309,6 +331,37 @@ void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts,
   p->nr_parts_out += N;
 }
 
+/**
+ * @brief Load gparts onto a proxy for exchange.
+ *
+ * @param p The #proxy.
+ * @param gparts Pointer to an array of #gpart to send.
+ * @param N The number of gparts.
+ */
+
+void proxy_gparts_load(struct proxy *p, const struct gpart *gparts, int N) {
+
+  /* Is there enough space in the buffer? */
+  if (p->nr_gparts_out + N > p->size_gparts_out) {
+    do {
+      p->size_gparts_out *= proxy_buffgrow;
+    } while (p->nr_gparts_out + N > p->size_gparts_out);
+    struct gpart *tp;
+    if ((tp = (struct gpart *)malloc(sizeof(struct gpart) *
+                                     p->size_gparts_out)) == NULL)
+      error("Failed to re-allocate gparts_out buffers.");
+    memcpy(tp, p->gparts_out, sizeof(struct gpart) * p->nr_gparts_out);
+    free(p->gparts_out);
+    p->gparts_out = tp;
+  }
+
+  /* Copy the gparts data to the buffer. */
+  memcpy(&p->gparts_out[p->nr_gparts_out], gparts, sizeof(struct gpart) * N);
+
+  /* Increase the counters. */
+  p->nr_gparts_out += N;
+}
+
 /**
  * @brief Initialize the given proxy.
  *
@@ -358,4 +411,20 @@ void proxy_init(struct proxy *p, int mynodeID, int nodeID) {
       error("Failed to allocate parts_out buffers.");
   }
   p->nr_parts_out = 0;
+
+  /* Allocate the gpart send and receive buffers, if needed. */
+  if (p->gparts_in == NULL) {
+    p->size_gparts_in = proxy_buffinit;
+    if ((p->gparts_in = (struct gpart *)malloc(sizeof(struct gpart) *
+                                               p->size_gparts_in)) == NULL)
+      error("Failed to allocate gparts_in buffers.");
+  }
+  p->nr_gparts_in = 0;
+  if (p->gparts_out == NULL) {
+    p->size_gparts_out = proxy_buffinit;
+    if ((p->gparts_out = (struct gpart *)malloc(sizeof(struct gpart) *
+                                                p->size_gparts_out)) == NULL)
+      error("Failed to allocate gparts_out buffers.");
+  }
+  p->nr_gparts_out = 0;
 }
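
proxy_gparts_load() reuses the grow-and-copy buffer pattern of the existing
part buffers. A self-contained sketch of that pattern (the doubling factor is
illustrative; proxy.c multiplies by proxy_buffgrow instead):

#include <stdlib.h>
#include <string.h>

/* Grow dst to hold nr_old + nr_new items of `size` bytes, preserving the
 * first nr_old items; returns NULL on allocation failure. */
static void *grow_buffer(void *dst, size_t size, int nr_old, int nr_new,
                         int *capacity) {
  if (nr_old + nr_new <= *capacity) return dst;
  do {
    *capacity *= 2;
  } while (nr_old + nr_new > *capacity);
  void *tmp = malloc(size * (size_t)*capacity);
  if (tmp == NULL) return NULL; /* caller reports the error */
  memcpy(tmp, dst, size * (size_t)nr_old);
  free(dst);
  return tmp;
}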
diff --git a/src/proxy.h b/src/proxy.h
index 3cd33e0f0819ee1ecac53213630445b39c809dea..5a747187e05a78a109ce4523ebb3c9d5fe2ad717 100644
--- a/src/proxy.h
+++ b/src/proxy.h
@@ -32,7 +32,8 @@
 #define proxy_tag_count 0
 #define proxy_tag_parts 1
 #define proxy_tag_xparts 2
-#define proxy_tag_cells 3
+#define proxy_tag_gparts 3
+#define proxy_tag_cells 4
 
 /* Data structure for the proxy. */
 struct proxy {
@@ -53,14 +54,21 @@ struct proxy {
   /* The parts and xparts buffers for input and output. */
   struct part *parts_in, *parts_out;
   struct xpart *xparts_in, *xparts_out;
+  struct gpart *gparts_in, *gparts_out;
   int size_parts_in, size_parts_out;
   int nr_parts_in, nr_parts_out;
+  int size_gparts_in, size_gparts_out;
+  int nr_gparts_in, nr_gparts_out;
+
+  /* Buffer to hold the incoming/outgoing particle counts. */
+  int buff_out[2], buff_in[2];
 
 /* MPI request handles. */
 #ifdef WITH_MPI
   MPI_Request req_parts_count_out, req_parts_count_in;
   MPI_Request req_parts_out, req_parts_in;
   MPI_Request req_xparts_out, req_xparts_in;
+  MPI_Request req_gparts_out, req_gparts_in;
   MPI_Request req_cells_count_out, req_cells_count_in;
   MPI_Request req_cells_out, req_cells_in;
 #endif
@@ -68,8 +76,9 @@ struct proxy {
 
 /* Function prototypes. */
 void proxy_init(struct proxy *p, int mynodeID, int nodeID);
-void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts,
-                      int N);
+void proxy_parts_load(struct proxy *p, const struct part *parts,
+                      const struct xpart *xparts, int N);
+void proxy_gparts_load(struct proxy *p, const struct gpart *gparts, int N);
 void proxy_parts_exch1(struct proxy *p);
 void proxy_parts_exch2(struct proxy *p);
 void proxy_addcell_in(struct proxy *p, struct cell *c);
diff --git a/src/queue.c b/src/queue.c
index a7321155100df9225526c2f19fac2b99531307e4..6b788d7376ba4bdc95f1b1d918ab52a9514e7b4a 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -136,9 +136,6 @@ struct task *queue_gettask(struct queue *q, const struct task *prev,
   lock_type *qlock = &q->lock;
   struct task *res = NULL;
 
-  /* If there are no tasks, leave immediately. */
-  if (q->count == 0) return NULL;
-
   /* Grab the task lock. */
   if (blocking) {
     if (lock_lock(qlock) != 0) error("Locking the qlock failed.\n");
@@ -146,6 +143,12 @@ struct task *queue_gettask(struct queue *q, const struct task *prev,
     if (lock_trylock(qlock) != 0) return NULL;
   }
 
+  /* If there are no tasks, leave immediately. */
+  if (q->count == 0) {
+    lock_unlock_blind(qlock);
+    return NULL;
+  }
+
   /* Set some pointers we will use often. */
   int *qtid = q->tid;
   struct task *qtasks = q->tasks;
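
The queue.c change above moves the emptiness test inside the critical
section, so the count is only trusted while qlock is held. A minimal sketch
of that check-under-lock pattern (types and names are illustrative):

#include <pthread.h>

struct tiny_queue {
  pthread_mutex_t lock;
  int count;
};

int tiny_queue_pop(struct tiny_queue *q) {
  pthread_mutex_lock(&q->lock);
  if (q->count == 0) { /* no stale read possible here */
    pthread_mutex_unlock(&q->lock);
    return -1; /* empty */
  }
  const int item = --q->count; /* stand-in for the real task removal */
  pthread_mutex_unlock(&q->lock);
  return item;
}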
diff --git a/src/runner.c b/src/runner.c
index 7eedb6adc72755ba12faed5429edad43d3849451..fefbb04a4fb0ae55628f2bc5d42f2d140226c5c5 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -469,8 +469,10 @@ void runner_dogsort(struct runner *r, struct cell *c, int flags, int clock) {
 
 void runner_doinit(struct runner *r, struct cell *c, int timer) {
 
-  struct part *p, *parts = c->parts;
+  struct part *const parts = c->parts;
+  struct gpart *const gparts = c->gparts;
   const int count = c->count;
+  const int gcount = c->gcount;
   const int ti_current = r->e->ti_current;
 
   TIMER_TIC;
@@ -486,7 +488,7 @@ void runner_doinit(struct runner *r, struct cell *c, int timer) {
     for (int i = 0; i < count; i++) {
 
       /* Get a direct pointer on the part. */
-      p = &parts[i];
+      struct part *const p = &parts[i];
 
       if (p->ti_end <= ti_current) {
 
@@ -494,6 +496,19 @@ void runner_doinit(struct runner *r, struct cell *c, int timer) {
         hydro_init_part(p);
       }
     }
+
+    /* Loop over the gparts in this cell. */
+    for (int i = 0; i < gcount; i++) {
+
+      /* Get a direct pointer on the gpart. */
+      struct gpart *const gp = &gparts[i];
+
+      if (gp->ti_end <= ti_current) {
+
+        /* Get ready for a density calculation */
+        gravity_init_part(gp);
+      }
+    }
   }
 
   if (timer) TIMER_TOC(timer_init);
@@ -649,7 +664,7 @@ void runner_doghost(struct runner *r, struct cell *c) {
 }
 
 /**
- * @brief Drift particles forward in time
+ * @brief Drift particles and g-particles forward in time
  *
  * @param r The runner thread.
  * @param c The cell.
@@ -658,26 +673,39 @@ void runner_doghost(struct runner *r, struct cell *c) {
 void runner_dodrift(struct runner *r, struct cell *c, int timer) {
 
   const int nr_parts = c->count;
+  const int nr_gparts = c->gcount;
   const double timeBase = r->e->timeBase;
   const double dt = (r->e->ti_current - r->e->ti_old) * timeBase;
-  const float ti_old = r->e->ti_old;
-  const float ti_current = r->e->ti_current;
-  struct part *restrict p, *restrict parts = c->parts;
-  struct xpart *restrict xp, *restrict xparts = c->xparts;
-  float dx_max = 0.f, h_max = 0.f;
-  float w;
+  const int ti_old = r->e->ti_old;
+  const int ti_current = r->e->ti_current;
+  struct part *const parts = c->parts;
+  struct xpart *const xparts = c->xparts;
+  struct gpart *const gparts = c->gparts;
+  float dx_max = 0.f, dx2_max = 0.f, h_max = 0.f;
 
   TIMER_TIC
 
   /* No children? */
   if (!c->split) {
 
-    /* Loop over all the particles in the cell */
+    /* Loop over all the g-particles in the cell */
+    for (int k = 0; k < nr_gparts; ++k) {
+
+      /* Get a handle on the gpart. */
+      struct gpart *const gp = &gparts[k];
+
+      /* Drift... */
+      gp->x[0] += gp->v_full[0] * dt;
+      gp->x[1] += gp->v_full[1] * dt;
+      gp->x[2] += gp->v_full[2] * dt;
+    }
+
+    /* Loop over all the particles in the cell (more work for these!) */
     for (int k = 0; k < nr_parts; k++) {
 
       /* Get a handle on the part. */
-      p = &parts[k];
-      xp = &xparts[k];
+      struct part *const p = &parts[k];
+      struct xpart *const xp = &xparts[k];
 
       /* Useful quantity */
       const float h_inv = 1.0f / p->h;
@@ -693,32 +721,34 @@ void runner_dodrift(struct runner *r, struct cell *c, int timer) {
       p->v[2] += p->a_hydro[2] * dt;
 
       /* Predict smoothing length */
-      w = p->h_dt * h_inv * dt;
-      if (fabsf(w) < 0.2f)
-        p->h *= approx_expf(w); /* 4th order expansion of exp(w) */
+      const float w1 = p->h_dt * h_inv * dt;
+      if (fabsf(w1) < 0.2f)
+        p->h *= approx_expf(w1); /* 4th order expansion of exp(w) */
       else
-        p->h *= expf(w);
+        p->h *= expf(w1);
 
       /* Predict density */
-      w = -3.0f * p->h_dt * h_inv * dt;
-      if (fabsf(w) < 0.2f)
-        p->rho *= approx_expf(w); /* 4th order expansion of exp(w) */
+      const float w2 = -3.0f * p->h_dt * h_inv * dt;
+      if (fabsf(w2) < 0.2f)
+        p->rho *= approx_expf(w2); /* 4th order expansion of exp(w) */
       else
-        p->rho *= expf(w);
+        p->rho *= expf(w2);
 
       /* Predict the values of the extra fields */
       hydro_predict_extra(p, xp, ti_old, ti_current, timeBase);
 
-      /* Compute motion since last cell construction */
-      const float dx =
-          sqrtf((p->x[0] - xp->x_old[0]) * (p->x[0] - xp->x_old[0]) +
-                (p->x[1] - xp->x_old[1]) * (p->x[1] - xp->x_old[1]) +
-                (p->x[2] - xp->x_old[2]) * (p->x[2] - xp->x_old[2]));
-      dx_max = fmaxf(dx_max, dx);
+      /* Compute (square of) motion since last cell construction */
+      const float dx2 = (p->x[0] - xp->x_old[0]) * (p->x[0] - xp->x_old[0]) +
+                        (p->x[1] - xp->x_old[1]) * (p->x[1] - xp->x_old[1]) +
+                        (p->x[2] - xp->x_old[2]) * (p->x[2] - xp->x_old[2]);
+      dx2_max = fmaxf(dx2_max, dx2);
 
       /* Maximal smoothing length */
       h_max = fmaxf(p->h, h_max);
     }
+
+    /* Now, get the maximal particle motion from its square */
+    dx_max = sqrtf(dx2_max);
   }
 
   /* Otherwise, aggregate data from children. */
@@ -758,37 +788,97 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
   const double timeBase = r->e->timeBase;
   const double timeBase_inv = 1.0 / r->e->timeBase;
   const int count = c->count;
+  const int gcount = c->gcount;
+  struct part *const parts = c->parts;
+  struct xpart *const xparts = c->xparts;
+  struct gpart *const gparts = c->gparts;
   const int is_fixdt =
       (r->e->policy & engine_policy_fixdt) == engine_policy_fixdt;
 
-  int new_dti;
-  int dti_timeline;
-
-  int updated = 0;
+  int updated = 0, g_updated = 0;
   int ti_end_min = max_nr_timesteps, ti_end_max = 0;
   double e_kin = 0.0, e_int = 0.0, e_pot = 0.0, mass = 0.0;
   float mom[3] = {0.0f, 0.0f, 0.0f};
   float ang[3] = {0.0f, 0.0f, 0.0f};
-  float x[3], v_full[3];
-  struct part *restrict p, *restrict parts = c->parts;
-  struct xpart *restrict xp, *restrict xparts = c->xparts;
 
   TIMER_TIC
 
   /* No children? */
   if (!c->split) {
 
+    /* Loop over the g-particles and kick the active ones. */
+    for (int k = 0; k < gcount; k++) {
+
+      /* Get a handle on the gpart. */
+      struct gpart *const gp = &gparts[k];
+
+      /* If the g-particle has no counterpart and needs to be kicked */
+      if (gp->id_or_neg_offset > 0 && (is_fixdt || gp->ti_end <= ti_current)) {
+
+        /* First, finish the force calculation */
+        gravity_end_force(gp);
+
+        /* Now we are ready to compute the next time-step size */
+        int new_dti;
+
+        if (is_fixdt) {
+
+          /* Now we have a time step, proceed with the kick */
+          new_dti = global_dt_max * timeBase_inv;
+
+        } else {
+
+          /* Compute the next timestep (gravity condition) */
+          float new_dt = gravity_compute_timestep(gp);
+
+          /* Limit timestep within the allowed range */
+          new_dt = fminf(new_dt, global_dt_max);
+          new_dt = fmaxf(new_dt, global_dt_min);
+
+          /* Convert to integer time */
+          new_dti = new_dt * timeBase_inv;
+
+          /* Recover the current timestep */
+          const int current_dti = gp->ti_end - gp->ti_begin;
+
+          /* Limit timestep increase */
+          if (current_dti > 0) new_dti = min(new_dti, 2 * current_dti);
+
+          /* Put this timestep on the time line */
+          int dti_timeline = max_nr_timesteps;
+          while (new_dti < dti_timeline) dti_timeline /= 2;
+
+          /* Now we have a time step, proceed with the kick */
+          new_dti = dti_timeline;
+        }
+
+        /* Compute the time step for this kick */
+        const int ti_start = (gp->ti_begin + gp->ti_end) / 2;
+        const int ti_end = gp->ti_end + new_dti / 2;
+        const double dt = (ti_end - ti_start) * timeBase;
+        const double half_dt = (ti_end - gp->ti_end) * timeBase;
+
+        /* Kick particles in momentum space */
+        gp->v_full[0] += gp->a_grav[0] * dt;
+        gp->v_full[1] += gp->a_grav[1] * dt;
+        gp->v_full[2] += gp->a_grav[2] * dt;
+
+        /* Extra kick work */
+        gravity_kick_extra(gp, dt, half_dt);
+
+        /* Number of updated g-particles */
+        g_updated++;
+      }
+    }
+
+    /* Now do the hydro ones... */
+
     /* Loop over the particles and kick the active ones. */
     for (int k = 0; k < count; k++) {
 
       /* Get a handle on the part. */
-      p = &parts[k];
-      xp = &xparts[k];
-
-      const float m = p->mass;
-      x[0] = p->x[0];
-      x[1] = p->x[1];
-      x[2] = p->x[2];
+      struct part *const p = &parts[k];
+      struct xpart *const xp = &xparts[k];
 
       /* If particle needs to be kicked */
       if (is_fixdt || p->ti_end <= ti_current) {
@@ -798,8 +888,10 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
 
         /* And do the same of the extra variable */
         hydro_end_force(p);
+        if (p->gpart != NULL) gravity_end_force(p->gpart);
 
         /* Now we are ready to compute the next time-step size */
+        int new_dti;
 
         if (is_fixdt) {
 
@@ -808,9 +900,13 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
 
         } else {
 
-          /* Compute the next timestep */
+          /* Compute the next timestep (hydro condition) */
           const float new_dt_hydro = hydro_compute_timestep(p, xp);
-          const float new_dt_grav = gravity_compute_timestep(p, xp);
+
+          /* Compute the next timestep (gravity condition) */
+          float new_dt_grav = FLT_MAX;
+          if (p->gpart != NULL)
+            new_dt_grav = gravity_compute_timestep(p->gpart);
 
           float new_dt = fminf(new_dt_hydro, new_dt_grav);
 
@@ -835,7 +931,7 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
           if (current_dti > 0) new_dti = min(new_dti, 2 * current_dti);
 
           /* Put this timestep on the time line */
-          dti_timeline = max_nr_timesteps;
+          int dti_timeline = max_nr_timesteps;
           while (new_dti < dti_timeline) dti_timeline /= 2;
 
           /* Now we have a time step, proceed with the kick */
@@ -845,34 +941,51 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
         /* Compute the time step for this kick */
         const int ti_start = (p->ti_begin + p->ti_end) / 2;
         const int ti_end = p->ti_end + new_dti / 2;
-        const float dt = (ti_end - ti_start) * timeBase;
-        const float half_dt = (ti_end - p->ti_end) * timeBase;
+        const double dt = (ti_end - ti_start) * timeBase;
+        const double half_dt = (ti_end - p->ti_end) * timeBase;
 
         /* Move particle forward in time */
         p->ti_begin = p->ti_end;
         p->ti_end = p->ti_begin + new_dti;
 
+        /* Get the acceleration */
+        float a_tot[3] = {p->a_hydro[0], p->a_hydro[1], p->a_hydro[2]};
+        if (p->gpart != NULL) {
+          a_tot[0] += p->gpart->a_grav[0];
+          a_tot[1] += p->gpart->a_grav[1];
+          a_tot[2] += p->gpart->a_grav[2];
+        }
+
         /* Kick particles in momentum space */
-        xp->v_full[0] += p->a_hydro[0] * dt;
-        xp->v_full[1] += p->a_hydro[1] * dt;
-        xp->v_full[2] += p->a_hydro[2] * dt;
+        xp->v_full[0] += a_tot[0] * dt;
+        xp->v_full[1] += a_tot[1] * dt;
+        xp->v_full[2] += a_tot[2] * dt;
+
+        if (p->gpart != NULL) {
+          p->gpart->v_full[0] = xp->v_full[0];
+          p->gpart->v_full[1] = xp->v_full[1];
+          p->gpart->v_full[2] = xp->v_full[2];
+        }
 
-        p->v[0] = xp->v_full[0] - half_dt * p->a_hydro[0];
-        p->v[1] = xp->v_full[1] - half_dt * p->a_hydro[1];
-        p->v[2] = xp->v_full[2] - half_dt * p->a_hydro[2];
+        /* Go back by half-step for the hydro velocity */
+        p->v[0] = xp->v_full[0] - half_dt * a_tot[0];
+        p->v[1] = xp->v_full[1] - half_dt * a_tot[1];
+        p->v[2] = xp->v_full[2] - half_dt * a_tot[2];
 
         /* Extra kick work */
         hydro_kick_extra(p, xp, dt, half_dt);
+        if (p->gpart != NULL) gravity_kick_extra(p->gpart, dt, half_dt);
 
         /* Number of updated particles */
         updated++;
+        if (p->gpart != NULL) g_updated++;
       }
 
       /* Now collect quantities for statistics */
 
-      v_full[0] = xp->v_full[0];
-      v_full[1] = xp->v_full[1];
-      v_full[2] = xp->v_full[2];
+      const double x[3] = {p->x[0], p->x[1], p->x[2]};
+      const float v_full[3] = {xp->v_full[0], xp->v_full[1], xp->v_full[2]};
+      const float m = p->mass;
 
       /* Collect mass */
       mass += m;
@@ -906,13 +1019,14 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
     /* Loop over the progeny. */
     for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL) {
-        struct cell *cp = c->progeny[k];
+        struct cell *const cp = c->progeny[k];
 
         /* Recurse */
         runner_dokick(r, cp, 0);
 
         /* And aggregate */
         updated += cp->updated;
+        g_updated += cp->g_updated;
         e_kin += cp->e_kin;
         e_int += cp->e_int;
         e_pot += cp->e_pot;
@@ -930,6 +1044,7 @@ void runner_dokick(struct runner *r, struct cell *c, int timer) {
 
   /* Store the values. */
   c->updated = updated;
+  c->g_updated = g_updated;
   c->e_kin = e_kin;
   c->e_int = e_int;
   c->e_pot = e_pot;
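
Both the gpart and part kicks snap the candidate time-step onto the binary
time-line via the same halving loop. A standalone sketch of that quantisation
(the function name is illustrative):

/* Snap new_dti down to the largest power-of-two fraction of
 * max_nr_timesteps that fits, as in runner_dokick(). */
static int timeline_quantize(int new_dti, int max_nr_timesteps) {
  int dti_timeline = max_nr_timesteps;
  while (new_dti < dti_timeline) dti_timeline /= 2;
  return dti_timeline; /* e.g. (1000, 1 << 28) -> 512 */
}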
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index cf5d56e94169b44e6cd2974a3422a0bc5e4610ac..de339db6133fcc829bdc6ee0ce9e537b68982422 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -1235,7 +1235,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 #else
 
           /* Does pi need to be updated too? */
-          if (pi->dt <= dt_step) {
+          if (pi->ti_end <= ti_current) {
 
             /* Add this interaction to the symmetric queue. */
             r2q2[icount2] = r2;
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index f374339da75e31b39a5295fcd8bbc23c34d8d67d..02626295a49f314fef840bc044a476f5c9cf332d 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -267,9 +267,9 @@ void runner_dograv_down(struct runner *r, struct cell *c) {
     /* Apply the multipole acceleration to all gparts. */
     for (int k = 0; k < c->gcount; k++) {
       struct gpart *p = &c->gparts[k];
-      p->a[0] += m->a[0];
-      p->a[1] += m->a[1];
-      p->a[2] += m->a[2];
+      p->a_grav[0] += m->a[0];
+      p->a_grav[1] += m->a[1];
+      p->a_grav[2] += m->a[2];
     }
   }
 }
@@ -594,5 +594,4 @@ void runner_dosub_grav(struct runner *r, struct cell *ci, struct cell *cj,
 
   if (gettimer) TIMER_TOC(timer_dosub_grav);
 }
-
 #endif /* SWIFT_RUNNER_DOIACT_GRAV_H */
diff --git a/src/scheduler.c b/src/scheduler.c
index 722e344b5a86b5fbdc42c7038fd3cb00e44b2ee8..38a1cd8c663307e0c0378d8bec2e0cd3d8f37fa8 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -95,32 +95,29 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta,
 
 void scheduler_splittasks(struct scheduler *s) {
 
-  int j, k, ind, sid, tid = 0, redo;
-  struct cell *ci, *cj;
-  double hi, hj, shift[3];
-  struct task *t, *t_old;
-  // float dt_step = s->dt_step;
-  int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0},
-                   {-1, -1, 11, 10, 5, 4, 2, 1},
-                   {-1, -1, -1, 12, 7, 6, 4, 3},
-                   {-1, -1, -1, -1, 8, 7, 5, 4},
-                   {-1, -1, -1, -1, -1, 12, 10, 9},
-                   {-1, -1, -1, -1, -1, -1, 11, 10},
-                   {-1, -1, -1, -1, -1, -1, -1, 12}};
-  float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.1897,
-                         0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.5788};
+  const int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0},
+                         {-1, -1, 11, 10, 5, 4, 2, 1},
+                         {-1, -1, -1, 12, 7, 6, 4, 3},
+                         {-1, -1, -1, -1, 8, 7, 5, 4},
+                         {-1, -1, -1, -1, -1, 12, 10, 9},
+                         {-1, -1, -1, -1, -1, -1, 11, 10},
+                         {-1, -1, -1, -1, -1, -1, -1, 12}};
+  const float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788,
+                               0.4025, 0.1897, 0.4025, 0.1897, 0.4025,
+                               0.5788, 0.4025, 0.5788};
 
   /* Loop through the tasks... */
-  redo = 0;
-  t_old = t = NULL;
+  int tid = 0, redo = 0;
+  struct task *t_old = NULL;
   while (1) {
 
     /* Get a pointer on the task. */
+    struct task *t = t_old;
     if (redo) {
       redo = 0;
-      t = t_old;
     } else {
-      if ((ind = atomic_inc(&tid)) < s->nr_tasks)
+      const int ind = atomic_inc(&tid);
+      if (ind < s->nr_tasks)
         t_old = t = &s->tasks[s->tasks_ind[ind]];
       else
         break;
@@ -161,7 +158,7 @@ void scheduler_splittasks(struct scheduler *s) {
     if (t->type == task_type_self) {
 
       /* Get a handle on the cell involved. */
-      ci = t->ci;
+      struct cell *ci = t->ci;
 
       /* Foreign task? */
       if (ci->nodeID != s->nodeID) {
@@ -187,18 +184,18 @@ void scheduler_splittasks(struct scheduler *s) {
           redo = 1;
 
           /* Add the self task. */
-          for (k = 0; ci->progeny[k] == NULL; k++)
-            ;
-          t->ci = ci->progeny[k];
-          for (k += 1; k < 8; k++)
+          int first_child = 0;
+          while (ci->progeny[first_child] == NULL) first_child++;
+          t->ci = ci->progeny[first_child];
+          for (int k = first_child + 1; k < 8; k++)
             if (ci->progeny[k] != NULL)
               scheduler_addtask(s, task_type_self, t->subtype, 0, 0,
                                 ci->progeny[k], NULL, 0);
 
           /* Make a task for each pair of progeny. */
-          for (j = 0; j < 8; j++)
+          for (int j = 0; j < 8; j++)
             if (ci->progeny[j] != NULL)
-              for (k = j + 1; k < 8; k++)
+              for (int k = j + 1; k < 8; k++)
                 if (ci->progeny[k] != NULL)
                   scheduler_addtask(s, task_type_pair, t->subtype, pts[j][k], 0,
                                     ci->progeny[j], ci->progeny[k], 0);
@@ -211,10 +208,10 @@ void scheduler_splittasks(struct scheduler *s) {
     else if (t->type == task_type_pair) {
 
       /* Get a handle on the cells involved. */
-      ci = t->ci;
-      cj = t->cj;
-      hi = ci->dmin;
-      hj = cj->dmin;
+      struct cell *ci = t->ci;
+      struct cell *cj = t->cj;
+      const double hi = ci->dmin;
+      const double hj = cj->dmin;
 
       /* Foreign task? */
       if (ci->nodeID != s->nodeID && cj->nodeID != s->nodeID) {
@@ -224,7 +221,8 @@ void scheduler_splittasks(struct scheduler *s) {
 
       /* Get the sort ID, use space_getsid and not t->flags
          to make sure we get ci and cj swapped if needed. */
-      sid = space_getsid(s->space, &ci, &cj, shift);
+      double shift[3];
+      int sid = space_getsid(s->space, &ci, &cj, shift);
 
       /* Should this task be split-up? */
       if (ci->split && cj->split &&
@@ -480,9 +478,9 @@ void scheduler_splittasks(struct scheduler *s) {
         /* Replace the current task. */
         t->type = task_type_none;
 
-        for (j = 0; j < 8; j++)
+        for (int j = 0; j < 8; j++)
           if (ci->progeny[j] != NULL)
-            for (k = 0; k < 8; k++)
+            for (int k = 0; k < 8; k++)
               if (cj->progeny[k] != NULL) {
                 t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
                                       ci->progeny[j], cj->progeny[k], 0);
@@ -521,8 +519,8 @@ void scheduler_splittasks(struct scheduler *s) {
     else if (t->type == task_type_grav_mm) {
 
       /* Get a handle on the cells involved. */
-      ci = t->ci;
-      cj = t->cj;
+      struct cell *ci = t->ci;
+      struct cell *cj = t->cj;
 
       /* Self-interaction? */
       if (cj == NULL) {
@@ -546,7 +544,7 @@ void scheduler_splittasks(struct scheduler *s) {
 
             /* Split this task into tasks on its progeny. */
             t->type = task_type_none;
-            for (j = 0; j < 8; j++)
+            for (int j = 0; j < 8; j++)
               if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) {
                 if (t->type == task_type_none) {
                   t->type = task_type_grav_mm;
@@ -555,7 +553,7 @@ void scheduler_splittasks(struct scheduler *s) {
                 } else
                   t = scheduler_addtask(s, task_type_grav_mm, task_subtype_none,
                                         0, 0, ci->progeny[j], NULL, 0);
-                for (k = j + 1; k < 8; k++)
+                for (int k = j + 1; k < 8; k++)
                   if (ci->progeny[k] != NULL && ci->progeny[k]->gcount > 0) {
                     if (t->type == task_type_none) {
                       t->type = task_type_grav_mm;
@@ -594,7 +592,7 @@ void scheduler_splittasks(struct scheduler *s) {
 
           /* Get the opening angle theta. */
           float dx[3], theta;
-          for (k = 0; k < 3; k++) {
+          for (int k = 0; k < 3; k++) {
             dx[k] = fabs(ci->loc[k] - cj->loc[k]);
             if (s->space->periodic && dx[k] > 0.5 * s->space->dim[k])
               dx[k] = -dx[k] + s->space->dim[k];
@@ -615,9 +613,9 @@ void scheduler_splittasks(struct scheduler *s) {
 
               /* Split this task into tasks on its progeny. */
               t->type = task_type_none;
-              for (j = 0; j < 8; j++)
+              for (int j = 0; j < 8; j++)
                 if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) {
-                  for (k = 0; k < 8; k++)
+                  for (int k = 0; k < 8; k++)
                     if (cj->progeny[k] != NULL && cj->progeny[k]->gcount > 0) {
                       if (t->type == task_type_none) {
                         t->type = task_type_grav_mm;
@@ -663,17 +661,14 @@ struct task *scheduler_addtask(struct scheduler *s, int type, int subtype,
                                int flags, int wait, struct cell *ci,
                                struct cell *cj, int tight) {
 
-  int ind;
-  struct task *t;
-
   /* Get the next free task. */
-  ind = atomic_inc(&s->tasks_next);
+  const int ind = atomic_inc(&s->tasks_next);
 
   /* Overflow? */
   if (ind >= s->size) error("Task list overflow.");
 
   /* Get a pointer to the new task. */
-  t = &s->tasks[ind];
+  struct task *t = &s->tasks[ind];
 
   /* Copy the data. */
   t->type = type;
@@ -768,24 +763,24 @@ void scheduler_set_unlocks(struct scheduler *s) {
 
 void scheduler_ranktasks(struct scheduler *s) {
 
-  int i, j = 0, k, temp, left = 0, rank;
-  struct task *t, *tasks = s->tasks;
-  int *tid = s->tasks_ind, nr_tasks = s->nr_tasks;
+  struct task *tasks = s->tasks;
+  int *tid = s->tasks_ind;
+  const int nr_tasks = s->nr_tasks;
 
   /* Run through the tasks and get all the waits right. */
-  for (i = 0, k = 0; k < nr_tasks; k++) {
+  for (int k = 0; k < nr_tasks; k++) {
     tid[k] = k;
-    for (j = 0; j < tasks[k].nr_unlock_tasks; j++)
+    for (int j = 0; j < tasks[k].nr_unlock_tasks; j++)
       tasks[k].unlock_tasks[j]->wait += 1;
   }
 
   /* Main loop. */
-  for (j = 0, rank = 0; left < nr_tasks; rank++) {
+  for (int j = 0, rank = 0, left = 0; left < nr_tasks; rank++) {
 
     /* Load the tids of tasks with no waits. */
-    for (k = left; k < nr_tasks; k++)
+    for (int k = left; k < nr_tasks; k++)
       if (tasks[tid[k]].wait == 0) {
-        temp = tid[j];
+        int temp = tid[j];
         tid[j] = tid[k];
         tid[k] = temp;
         j += 1;
@@ -795,15 +790,16 @@ void scheduler_ranktasks(struct scheduler *s) {
     if (j == left) error("Unsatisfiable task dependencies detected.");
 
     /* Unlock the next layer of tasks. */
-    for (i = left; i < j; i++) {
-      t = &tasks[tid[i]];
+    for (int i = left; i < j; i++) {
+      struct task *t = &tasks[tid[i]];
       t->rank = rank;
       tid[i] = t - tasks;
       if (tid[i] >= nr_tasks) error("Task index overshoot.");
       /* message( "task %i of type %s has rank %i." , i ,
           (t->type == task_type_self) ? "self" : (t->type == task_type_pair) ?
          "pair" : "sort" , rank ); */
-      for (k = 0; k < t->nr_unlock_tasks; k++) t->unlock_tasks[k]->wait -= 1;
+      for (int k = 0; k < t->nr_unlock_tasks; k++)
+        t->unlock_tasks[k]->wait -= 1;
     }
 
     /* The new left (no, not tony). */
@@ -825,8 +821,6 @@ void scheduler_ranktasks(struct scheduler *s) {
 
 void scheduler_reset(struct scheduler *s, int size) {
 
-  int k;
-
   /* Do we need to re-allocate? */
   if (size > s->size) {
 
@@ -853,7 +847,7 @@ void scheduler_reset(struct scheduler *s, int size) {
   s->nr_unlocks = 0;
 
   /* Set the task pointers in the queues. */
-  for (k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks;
+  for (int k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks;
 }
 
 /**
@@ -864,21 +858,23 @@ void scheduler_reset(struct scheduler *s, int size) {
 
 void scheduler_reweight(struct scheduler *s) {
 
-  int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
-  struct task *t, *tasks = s->tasks;
-  int nodeID = s->nodeID;
-  float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.1897,
-                         0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.5788};
-  float wscale = 0.001;
+  const int nr_tasks = s->nr_tasks;
+  int *tid = s->tasks_ind;
+  struct task *tasks = s->tasks;
+  const int nodeID = s->nodeID;
+  const float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788,
+                               0.4025, 0.1897, 0.4025, 0.1897, 0.4025,
+                               0.5788, 0.4025, 0.5788};
+  const float wscale = 0.001;
   // ticks tic;
 
   /* Run through the tasks backwards and set their waits and
      weights. */
   // tic = getticks();
-  for (k = nr_tasks - 1; k >= 0; k--) {
-    t = &tasks[tid[k]];
+  for (int k = nr_tasks - 1; k >= 0; k--) {
+    struct task *t = &tasks[tid[k]];
     t->weight = 0;
-    for (j = 0; j < t->nr_unlock_tasks; j++)
+    for (int j = 0; j < t->nr_unlock_tasks; j++)
       if (t->unlock_tasks[j]->weight > t->weight)
         t->weight = t->unlock_tasks[j]->weight;
     if (!t->implicit && t->tic > 0)
@@ -959,8 +955,9 @@ void scheduler_reweight(struct scheduler *s) {
 void scheduler_start(struct scheduler *s, unsigned int mask,
                      unsigned int submask) {
 
-  int nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
-  struct task *t, *tasks = s->tasks;
+  const int nr_tasks = s->nr_tasks;
+  int *tid = s->tasks_ind;
+  struct task *tasks = s->tasks;
   // ticks tic;
 
   /* Store the masks */
@@ -986,8 +983,7 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
   const int waiting_old = s->waiting;
 
   /* We are going to use the task structure in a modified way to pass
-     information
-     to the task. Don't do this at home !
+     information to the task. Don't do this at home !
      - ci and cj will give the range of tasks to which the waits will be applied
      - the flags will be used to transfer the mask
      - the rank will be used to transfer the submask
@@ -1012,6 +1008,7 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
 
   /* Wait for the rewait tasks to have executed. */
   pthread_mutex_lock(&s->sleep_mutex);
+  pthread_cond_broadcast(&s->sleep_cond);
   while (s->waiting > waiting_old) {
     pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
   }
@@ -1025,7 +1022,7 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
   /* Loop over the tasks and enqueue whoever is ready. */
   // tic = getticks();
   for (int k = 0; k < s->nr_tasks; k++) {
-    t = &tasks[tid[k]];
+    struct task *t = &tasks[tid[k]];
     if (atomic_dec(&t->wait) == 1 && ((1 << t->type) & s->mask) &&
         ((1 << t->subtype) & s->submask) && !t->skip) {
       scheduler_enqueue(s, t);
@@ -1033,6 +1030,11 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
     }
   }
 
+  /* To be safe, fire off one last sleep_cond broadcast under the lock. */
+  pthread_mutex_lock(&s->sleep_mutex);
+  pthread_cond_broadcast(&s->sleep_cond);
+  pthread_mutex_unlock(&s->sleep_mutex);
+
   // message( "enqueueing tasks took %.3f %s." ,
   // clocks_from_ticks( getticks() - tic ), clocks_getunit());
 }
@@ -1046,10 +1048,8 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
 
 void scheduler_enqueue(struct scheduler *s, struct task *t) {
 
+  /* The target queue for this task. */
   int qid = -1;
-#ifdef WITH_MPI
-  int err;
-#endif
 
   /* Fail if this task has already been enqueued before. */
   if (t->rid >= 0) error("Task has already been enqueued.");
@@ -1071,6 +1071,9 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
 
   /* Otherwise, look for a suitable queue. */
   else {
+#ifdef WITH_MPI
+    int err;
+#endif
 
     /* Find the previous owner for each task type, and do
        any pre-processing needed. */
@@ -1093,13 +1096,10 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         break;
       case task_type_recv:
 #ifdef WITH_MPI
-        if ((err = MPI_Irecv(t->ci->parts, t->ci->count, s->part_mpi_type,
-                             t->ci->nodeID, t->flags, MPI_COMM_WORLD,
-                             &t->req)) != MPI_SUCCESS) {
-          char buff[MPI_MAX_ERROR_STRING];
-          int len;
-          MPI_Error_string(err, buff, &len);
-          error("Failed to emit irecv for particle data (%s).", buff);
+        err = MPI_Irecv(t->ci->parts, t->ci->count, part_mpi_type,
+                        t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+        if (err != MPI_SUCCESS) {
+          mpi_error(err, "Failed to emit irecv for particle data.");
         }
         // message( "receiving %i parts with tag=%i from %i to %i." ,
         //     t->ci->count , t->flags , t->ci->nodeID , s->nodeID );
@@ -1111,13 +1111,10 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         break;
       case task_type_send:
 #ifdef WITH_MPI
-        if ((err = MPI_Isend(t->ci->parts, t->ci->count, s->part_mpi_type,
-                             t->cj->nodeID, t->flags, MPI_COMM_WORLD,
-                             &t->req)) != MPI_SUCCESS) {
-          char buff[MPI_MAX_ERROR_STRING];
-          int len;
-          MPI_Error_string(err, buff, &len);
-          error("Failed to emit isend for particle data (%s).", buff);
+        err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type,
+                        t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+        if (err != MPI_SUCCESS) {
+          mpi_error(err, "Failed to emit isend for particle data.");
         }
         // message( "sending %i parts with tag=%i from %i to %i." ,
         //     t->ci->count , t->flags , s->nodeID , t->cj->nodeID );
@@ -1133,7 +1130,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
 
     if (qid >= s->nr_queues) error("Bad computed qid.");
 
-    /* If no previous owner, find the shortest queue. */
+    /* If no previous owner, pick a random queue. */
     if (qid < 0) qid = rand() % s->nr_queues;
 
     /* Increase the waiting counter. */
@@ -1164,7 +1161,7 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) {
   for (int k = 0; k < t->nr_unlock_tasks; k++) {
     struct task *t2 = t->unlock_tasks[k];
 
-    int res = atomic_dec(&t2->wait);
+    const int res = atomic_dec(&t2->wait);
     if (res < 1) {
       error("Negative wait!");
     } else if (res == 1) {
@@ -1203,7 +1200,7 @@ struct task *scheduler_unlock(struct scheduler *s, struct task *t) {
      they are ready. */
   for (int k = 0; k < t->nr_unlock_tasks; k++) {
     struct task *t2 = t->unlock_tasks[k];
-    int res = atomic_dec(&t2->wait);
+    const int res = atomic_dec(&t2->wait);
     if (res < 1) {
       error("Negative wait!");
     } else if (res == 1) {
@@ -1240,7 +1237,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
                                const struct task *prev) {
 
   struct task *res = NULL;
-  int k, nr_queues = s->nr_queues;
+  const int nr_queues = s->nr_queues;
   unsigned int seed = qid;
 
   /* Check qid. */
@@ -1264,10 +1261,10 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
       /* If unsuccessful, try stealing from the other queues. */
       if (s->flags & scheduler_flag_steal) {
         int count = 0, qids[nr_queues];
-        for (k = 0; k < nr_queues; k++)
+        for (int k = 0; k < nr_queues; k++)
           if (s->queues[k].count > 0) qids[count++] = k;
-        for (k = 0; k < scheduler_maxsteal && count > 0; k++) {
-          int ind = rand_r(&seed) % count;
+        for (int k = 0; k < scheduler_maxsteal && count > 0; k++) {
+          const int ind = rand_r(&seed) % count;
           TIMER_TIC
           res = queue_gettask(&s->queues[qids[ind]], prev, 0);
           TIMER_TOC(timer_qsteal);
@@ -1287,7 +1284,10 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
     if (res == NULL) {
 #endif
       pthread_mutex_lock(&s->sleep_mutex);
-      if (s->waiting > 0) pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
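+      /* Re-check the queue while holding the lock: a task may have been
+         enqueued (and the wake-up broadcast fired) since the last poll. */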
+      res = queue_gettask(&s->queues[qid], prev, 1);
+      if (res == NULL && s->waiting > 0) {
+        pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
+      }
       pthread_mutex_unlock(&s->sleep_mutex);
     }
   }
@@ -1352,12 +1352,6 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
   s->tasks = NULL;
   s->tasks_ind = NULL;
   scheduler_reset(s, nr_tasks);
-
-/* Construct types for MPI communications */
-#ifdef WITH_MPI
-  part_create_mpi_type(&s->part_mpi_type);
-  xpart_create_mpi_type(&s->xpart_mpi_type);
-#endif
 }
 
 /**
@@ -1366,7 +1360,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
  * @param s The #scheduler
  * @param fileName Name of the file to write to
  */
-void scheduler_print_tasks(struct scheduler *s, char *fileName) {
+void scheduler_print_tasks(const struct scheduler *s, const char *fileName) {
 
   const int nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
   struct task *t, *tasks = s->tasks;
diff --git a/src/scheduler.h b/src/scheduler.h
index 3f2d8c289d0d691d0d155b20ae0522c5830524aa..64c694aea295c13810a20b626055fc6c15eb0af8 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -100,12 +100,6 @@ struct scheduler {
 
   /* The node we are working on. */
   int nodeID;
-
-#ifdef WITH_MPI
-  /* MPI data type for the particle transfers */
-  MPI_Datatype part_mpi_type;
-  MPI_Datatype xpart_mpi_type;
-#endif
 };
 
 /* Function prototypes. */
@@ -128,7 +122,7 @@ struct task *scheduler_unlock(struct scheduler *s, struct task *t);
 void scheduler_addunlock(struct scheduler *s, struct task *ta, struct task *tb);
 void scheduler_set_unlocks(struct scheduler *s);
 void scheduler_dump_queue(struct scheduler *s);
-void scheduler_print_tasks(struct scheduler *s, char *fileName);
+void scheduler_print_tasks(const struct scheduler *s, const char *fileName);
 void scheduler_do_rewait(struct task *t_begin, struct task *t_end,
                          unsigned int mask, unsigned int submask);
 
diff --git a/src/serial_io.c b/src/serial_io.c
index 8e63db5cfad3a3b50fc7e350bbac6ce09708230a..40bd2b1c8921f4acbfa0950984d6915ebd3d241e 100644
--- a/src/serial_io.c
+++ b/src/serial_io.c
@@ -57,18 +57,18 @@
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
  *interest in the parts array
+ * @param partSize The size in bytes of the particle structure.
  * @param importance If COMPULSORY, the data must be present in the IC file. If
  *OPTIONAL, the array will be zeroed when the data is not present.
  *
  * @todo A better version using HDF5 hyper-slabs to read the file directly into
  *the part array
  * will be written once the structures have been stabilized.
- *
- * Calls #error() if an error occurs.
  */
 void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
                       int dim, long long N_total, long long offset,
-                      char* part_c, enum DATA_IMPORTANCE importance) {
+                      char* part_c, size_t partSize,
+                      enum DATA_IMPORTANCE importance) {
   hid_t h_data = 0, h_err = 0, h_type = 0, h_memspace = 0, h_filespace = 0;
   hsize_t shape[2], offsets[2];
   htri_t exist = 0;
@@ -76,7 +76,6 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
   int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
-  const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
 
   /* Check whether the dataspace exists or not */
@@ -172,9 +171,10 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
  * Routines writing an output file
  *-----------------------------------------------------------------------------*/
 
-void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                  enum DATA_TYPE type, long long N_total, int dim,
-                  struct UnitSystem* us, enum UnitConversionFactor convFactor) {
+void prepareArray(hid_t grp, char* fileName, FILE* xmfFile,
+                  char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                  long long N_total, int dim, struct UnitSystem* us,
+                  enum UnitConversionFactor convFactor) {
   hid_t h_data = 0, h_err = 0, h_space = 0, h_prop = 0;
   int rank = 0;
   hsize_t shape[2];
@@ -234,7 +234,7 @@ void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   }
 
   /* Write XMF description for this data set */
-  writeXMFline(xmfFile, fileName, name, N_total, dim, type);
+  writeXMFline(xmfFile, fileName, partTypeGroupName, name, N_total, dim, type);
 
   /* Write unit conversion factors for this data set */
   conversionString(buffer, us, convFactor);
@@ -255,21 +255,22 @@ void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param grp The group in which to write.
  * @param fileName The name of the file in which the data is written
  * @param xmfFile The FILE used to write the XMF description
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param name The name of the array to write.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
  *interest in the parts array
+ * @param partSize The size in bytes of the particle structure.
  * @param us The UnitSystem currently in use
- * @param convFactor The UnitConversionFactor for this array
- *
- *
- * Calls #error() if an error occurs.
+ * @param convFactor The UnitConversionFactor for this array
  */
-void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                       enum DATA_TYPE type, int N, int dim, long long N_total,
-                       int mpi_rank, long long offset, char* part_c,
+void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
+                       char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                       int N, int dim, long long N_total, int mpi_rank,
+                       long long offset, char* part_c, size_t partSize,
                        struct UnitSystem* us,
                        enum UnitConversionFactor convFactor) {
 
@@ -279,15 +280,14 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
-  const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
 
   /* message("Writing '%s' array...", name); */
 
   /* Prepare the arrays in the file */
   if (mpi_rank == 0)
-    prepareArray(grp, fileName, xmfFile, name, type, N_total, dim, us,
-                 convFactor);
+    prepareArray(grp, fileName, xmfFile, partTypeGroupName, name, type, N_total,
+                 dim, us, convFactor);
 
   /* Allocate temporary buffer */
   temp = malloc(N * dim * sizeOfType(type));
@@ -362,7 +362,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
 #define readArray(grp, name, type, N, dim, part, N_total, offset, field, \
                   importance)                                            \
   readArrayBackEnd(grp, name, type, N, dim, N_total, offset,             \
-                   (char*)(&(part[0]).field), importance)
+                   (char*)(&(part[0]).field), sizeof(part[0]), importance)
 
 /**
- * @brief A helper macro to call the readArrayBackEnd function more easily.
+ * @brief A helper macro to call the writeArrayBackEnd function more easily.
@@ -371,34 +371,47 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
- * @param fileName Unused parameter in non-MPI mode
- * @param xmfFile Unused parameter in non-MPI mode
+ * @param fileName The name of the file in which the data is written.
+ * @param xmfFile The FILE used to write the XMF description.
  * @param name The name of the array to write.
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part A (char*) pointer on the first occurrence of the field of
- *interest
- *in the parts array
+ *interest in the parts array
+ * @param N_total The total number of particles across all MPI ranks.
+ * @param mpi_rank The MPI rank of this node.
+ * @param offset The offset of this rank's particles in the global array.
- * @param field The name (code name) of the field to read from.
+ * @param field The name (code name) of the field to write.
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, \
-                   mpi_rank, offset, field, us, convFactor)                   \
-  writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, N_total,      \
-                    mpi_rank, offset, (char*)(&(part[0]).field), us,          \
-                    convFactor)
+#define writeArray(grp, fileName, xmfFile, partTypeGroupName, name, type, N,   \
+                   dim, part, N_total, mpi_rank, offset, field, us,            \
+                   convFactor)                                                 \
+  writeArrayBackEnd(grp, fileName, xmfFile, partTypeGroupName, name, type, N,  \
+                    dim, N_total, mpi_rank, offset, (char*)(&(part[0]).field), \
+                    sizeof(part[0]), us, convFactor)
 
 /* Import the right hydro definition */
 #include "hydro_io.h"
+/* Import the right gravity definition */
+#include "gravity_io.h"
 
 /**
  * @brief Reads an HDF5 initial condition file (GADGET-3 type)
  *
  * @param fileName The file to read.
  * @param dim (output) The dimension of the volume read from the file.
- * @param parts (output) The array of #part read from the file.
- * @param N (output) The number of particles read from the file.
+ * @param parts (output) The array of #part (gas particles) read from the file.
+ * @param gparts (output) The array of #gpart read from the file.
+ * @param Ngas (output) The number of #part read from the file on that node.
+ * @param Ngparts (output) The number of #gpart read from the file on that node.
  * @param periodic (output) 1 if the volume is periodic, 0 if not.
+ * @param mpi_rank The MPI rank of this node
+ * @param mpi_size The number of MPI ranks
+ * @param comm The MPI communicator
+ * @param info The MPI information object
  *
  * Opens the HDF5 file fileName and reads the particles contained
  * in the parts array. N is the returned number of particles found
@@ -411,17 +424,18 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  *
  */
 void read_ic_serial(char* fileName, double dim[3], struct part** parts,
-                    size_t* N, int* periodic, int mpi_rank, int mpi_size,
-                    MPI_Comm comm, MPI_Info info) {
+                    struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
+                    int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                    MPI_Info info) {
   hid_t h_file = 0, h_grp = 0;
-  double boxSize[3] = {0.0, -1.0, -1.0};
   /* GADGET has only cubic boxes (in cosmological mode) */
-  int numParticles[6] = {0};
-  /* GADGET has 6 particle types. We only keep the type 0*/
-  int numParticles_highWord[6] = {0};
-  long long offset = 0;
-  long long N_total = 0;
-  int rank;
+  double boxSize[3] = {0.0, -1.0, -1.0};
+  /* GADGET has 6 particle types. We only keep types 0 & 1 for now. */
+  int numParticles[NUM_PARTICLE_TYPES] = {0};
+  int numParticles_highWord[NUM_PARTICLE_TYPES] = {0};
+  size_t N[NUM_PARTICLE_TYPES] = {0};
+  long long N_total[NUM_PARTICLE_TYPES] = {0};
+  long long offset[NUM_PARTICLE_TYPES] = {0};
 
   /* First read some information about the content */
   if (mpi_rank == 0) {
@@ -453,8 +467,10 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
     readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
     readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
 
-    N_total = ((long long)numParticles[0]) +
-              ((long long)numParticles_highWord[0] << 32);
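+    /* Combine the low and high 32-bit words into full 64-bit counts. */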
+    for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+      N_total[ptype] = ((long long)numParticles[ptype]) +
+                       ((long long)numParticles_highWord[ptype] << 32);
+
     dim[0] = boxSize[0];
     dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1];
     dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2];
@@ -474,22 +490,38 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
 
   /* Now need to broadcast that information to all ranks. */
   MPI_Bcast(periodic, 1, MPI_INT, 0, comm);
-  MPI_Bcast(&N_total, 1, MPI_LONG_LONG, 0, comm);
+  MPI_Bcast(&N_total, NUM_PARTICLE_TYPES, MPI_LONG_LONG, 0, comm);
   MPI_Bcast(dim, 3, MPI_DOUBLE, 0, comm);
 
   /* Divide the particles among the tasks. */
-  offset = mpi_rank * N_total / mpi_size;
-  *N = (mpi_rank + 1) * N_total / mpi_size - offset;
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+    offset[ptype] = mpi_rank * N_total[ptype] / mpi_size;
+    N[ptype] = (mpi_rank + 1) * N_total[ptype] / mpi_size - offset[ptype];
+  }
 
-  /* Allocate memory to store particles */
-  if (posix_memalign((void*)parts, part_align, (*N) * sizeof(struct part)) != 0)
+  /* Allocate memory to store SPH particles */
+  *Ngas = N[0];
+  if (posix_memalign((void*)parts, part_align, (*Ngas) * sizeof(struct part)) !=
+      0)
     error("Error while allocating memory for particles");
-  bzero(*parts, *N * sizeof(struct part));
+  bzero(*parts, *Ngas * sizeof(struct part));
+
+  /* Allocate memory to store all particles */
+  const size_t Ndm = N[1];
+  *Ngparts = N[1] + N[0];
+  if (posix_memalign((void*)gparts, gpart_align,
+                     *Ngparts * sizeof(struct gpart)) != 0)
+    error("Error while allocating memory for gravity particles");
+  bzero(*gparts, *Ngparts * sizeof(struct gpart));
+
   /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) / */
   /* 	  (1024.*1024.)); */
 
+  /* message("BoxSize = %lf", dim[0]); */
+  /* message("NumPart = [%zd, %zd] Total = %zd", *Ngas, Ndm, *Ngparts); */
+
   /* Now loop over ranks and read the data */
-  for (rank = 0; rank < mpi_size; ++rank) {
+  for (int rank = 0; rank < mpi_size; ++rank) {
 
     /* Is it this rank's turn to read ? */
     if (rank == mpi_rank) {
@@ -498,17 +530,41 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
       if (h_file < 0)
         error("Error while opening file '%s' on rank %d.", fileName, mpi_rank);
 
-      /* Open SPH particles group */
-      /* message("Reading particle arrays..."); */
-      h_grp = H5Gopen(h_file, "/PartType0", H5P_DEFAULT);
-      if (h_grp < 0)
-        error("Error while opening particle group on rank %d.\n", mpi_rank);
+      /* Loop over all particle types */
+      for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
 
-      /* Read particle fields into the particle structure */
-      hydro_read_particles(h_grp, *N, N_total, offset, *parts);
+        /* Don't do anything if no particle of this kind */
+        if (N[ptype] == 0) continue;
 
-      /* Close particle group */
-      H5Gclose(h_grp);
+        /* Open the particle group in the file */
+        char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+        snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+                 ptype);
+        h_grp = H5Gopen(h_file, partTypeGroupName, H5P_DEFAULT);
+        if (h_grp < 0) {
+          error("Error while opening particle group %s.", partTypeGroupName);
+        }
+
+        /* Read particle fields into the particle structure */
+        switch (ptype) {
+
+          case GAS:
+            hydro_read_particles(h_grp, N[ptype], N_total[ptype], offset[ptype],
+                                 *parts);
+            break;
+
+          case DM:
+            darkmatter_read_particles(h_grp, N[ptype], N_total[ptype],
+                                      offset[ptype], *gparts);
+            break;
+
+          default:
+            error("Particle Type %d not yet supported. Aborting", ptype);
+        }
+
+        /* Close particle group */
+        H5Gclose(h_grp);
+      }
 
       /* Close file */
       H5Fclose(h_file);
@@ -518,6 +574,12 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
     MPI_Barrier(comm);
   }
 
+  /* Prepare the DM particles */
+  prepare_dm_gparts(*gparts, Ndm);
+
+  /* Now duplicate the hydro particles into gparts */
+  duplicate_hydro_gparts(*parts, *gparts, *Ngas, Ndm);
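+
+  /* gparts now holds the Ndm DM particles first, followed by the gpart
+     companions of the Ngas gas particles, each linked back to its #part. */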
+
   /* message("Done Reading particles..."); */
 }
 
@@ -525,7 +587,11 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
  * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor
  *
  * @param e The engine containing all the system.
- * @param us The UnitSystem used for the conversion of units in the output
+ * @param us The UnitSystem used for the conversion of units in the output.
+ * @param mpi_rank The MPI rank of this node.
+ * @param mpi_size The number of MPI ranks.
+ * @param comm The MPI communicator.
+ * @param info The MPI information object
  *
  * Creates an HDF5 output file and writes the particles contained
  * in the engine. If such a file already exists, it is erased and replaced
@@ -538,35 +604,40 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts,
 void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
                          int mpi_size, MPI_Comm comm, MPI_Info info) {
   hid_t h_file = 0, h_grp = 0, h_grpsph = 0;
-  int N = e->s->nr_parts;
+  const size_t Ngas = e->s->nr_parts;
+  const size_t Ntot = e->s->nr_gparts;
   int periodic = e->s->periodic;
-  int numParticles[6] = {N, 0};
-  int numParticlesHighWord[6] = {0};
-  unsigned int flagEntropy[6] = {0};
-  long long N_total = 0, offset = 0;
-  double offset_d = 0., N_d = 0., N_total_d = 0.;
   int numFiles = 1;
-  int rank = 0;
   struct part* parts = e->s->parts;
-  FILE* xmfFile = 0;
+  struct gpart* gparts = e->s->gparts;
+  struct gpart* dmparts = NULL;
   static int outputCount = 0;
+  FILE* xmfFile = 0;
+
+  /* Number of particles of each type */
+  // const size_t Ndm = Ntot - Ngas;
+
+  /* MATTHIEU: Temporary fix to preserve master */
+  const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
+  /* MATTHIEU: End temporary fix */
 
   /* File name */
-  char fileName[200];
-  sprintf(fileName, "output_%03i.hdf5", outputCount);
+  char fileName[FILENAME_BUFFER_SIZE];
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "output_%03i.hdf5", outputCount);
 
   /* Compute offset in the file and total number of particles */
-  /* Done using double to allow for up to 2^50=10^15 particles */
-  N_d = (double)N;
-  MPI_Exscan(&N_d, &offset_d, 1, MPI_DOUBLE, MPI_SUM, comm);
-  N_total_d = offset_d + N_d;
-  MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size - 1, comm);
-  if (N_total_d > 1.e15)
-    error(
-        "Error while computing the offset for parallel output: Simulation has "
-        "more than 10^15 particles.\n");
-  N_total = (long long)N_total_d;
-  offset = (long long)offset_d;
+  long long N[NUM_PARTICLE_TYPES] = {Ngas, Ndm, 0};
+  long long N_total[NUM_PARTICLE_TYPES] = {0};
+  long long offset[NUM_PARTICLE_TYPES] = {0};
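+  /* An exclusive prefix sum over the ranks gives each rank its write offset
+     for every particle type. */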
+  MPI_Exscan(&N, &offset, NUM_PARTICLE_TYPES, MPI_LONG_LONG, MPI_SUM, comm);
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+    N_total[ptype] = offset[ptype] + N[ptype];
+
+  /* The last rank now has the correct N_total. Let's broadcast from there */
+  MPI_Bcast(&N_total, NUM_PARTICLE_TYPES, MPI_LONG_LONG, mpi_size - 1, comm);
+
+  /* Now everybody knows its offset and the total number of particles of each
+   * type */
 
   /* Do common stuff first */
   if (mpi_rank == 0) {
@@ -578,7 +649,7 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
     xmfFile = prepareXMFfile();
 
     /* Write the part corresponding to this specific output */
-    writeXMFheader(xmfFile, N_total, fileName, e->time);
+    writeXMFoutputheader(xmfFile, fileName, e->time);
 
     /* Open file */
     /* message("Opening file '%s'.", fileName); */
@@ -610,15 +681,24 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
     writeAttribute(h_grp, "Time", DOUBLE, &dblTime, 1);
 
     /* GADGET-2 legacy values */
-    numParticles[0] = (unsigned int)N_total;
-    writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles, 6);
-    writeAttribute(h_grp, "NumPart_Total", UINT, numParticles, 6);
-    numParticlesHighWord[0] = (unsigned int)(N_total >> 32);
+    /* Number of particles of each type */
+    unsigned int numParticles[NUM_PARTICLE_TYPES] = {0};
+    unsigned int numParticlesHighWord[NUM_PARTICLE_TYPES] = {0};
+    for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+      numParticles[ptype] = (unsigned int)N_total[ptype];
+      numParticlesHighWord[ptype] = (unsigned int)(N_total[ptype] >> 32);
+    }
+    writeAttribute(h_grp, "NumPart_ThisFile", LONGLONG, N_total,
+                   NUM_PARTICLE_TYPES);
+    writeAttribute(h_grp, "NumPart_Total", UINT, numParticles,
+                   NUM_PARTICLE_TYPES);
     writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord,
-                   6);
+                   NUM_PARTICLE_TYPES);
     double MassTable[6] = {0., 0., 0., 0., 0., 0.};
-    writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, 6);
-    writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, 6);
+    writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES);
+    unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0};
+    writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy,
+                   NUM_PARTICLE_TYPES);
     writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1);
 
     /* Close header */
@@ -636,21 +716,32 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
     /* Print the system of Units */
     writeUnitSystem(h_file, us);
 
-    /* Create SPH particles group */
-    /* message("Writing particle arrays..."); */
-    h_grp =
-        H5Gcreate(h_file, "/PartType0", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
-    if (h_grp < 0) error("Error while creating particle group.\n");
+    /* Loop over all particle types */
+    for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
 
-    /* Close particle group */
-    H5Gclose(h_grp);
+      /* Don't do anything if no particle of this kind */
+      if (N_total[ptype] == 0) continue;
+
+      /* Open the particle group in the file */
+      char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+      snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+               ptype);
+      h_grp = H5Gcreate(h_file, partTypeGroupName, H5P_DEFAULT, H5P_DEFAULT,
+                        H5P_DEFAULT);
+      if (h_grp < 0) {
+        error("Error while creating particle group.\n");
+      }
+
+      /* Close particle group */
+      H5Gclose(h_grp);
+    }
 
     /* Close file */
     H5Fclose(h_file);
   }
 
   /* Now loop over ranks and write the data */
-  for (rank = 0; rank < mpi_size; ++rank) {
+  for (int rank = 0; rank < mpi_size; ++rank) {
 
     /* Is it this rank's turn to write ? */
     if (rank == mpi_rank) {
@@ -659,18 +750,65 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
       if (h_file < 0)
         error("Error while opening file '%s' on rank %d.", fileName, mpi_rank);
 
-      /* Open SPH particles group */
-      /* message("Reading particle arrays..."); */
-      h_grp = H5Gopen(h_file, "/PartType0", H5P_DEFAULT);
-      if (h_grp < 0)
-        error("Error while opening particle group on rank %d.\n", mpi_rank);
+      /* Loop over all particle types */
+      for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
 
-      /* Write particle fields from the particle structure */
-      hydro_write_particles(h_grp, fileName, xmfFile, N, N_total, mpi_rank,
-                            offset, parts, us);
+        /* Don't do anything if no particle of this kind */
+        if (N_total[ptype] == 0) continue;
 
-      /* Close particle group */
-      H5Gclose(h_grp);
+        /* Add the global information for that particle type to the XMF
+         * meta-file */
+        if (mpi_rank == 0)
+          writeXMFgroupheader(xmfFile, fileName, N_total[ptype], ptype);
+
+        /* Open the particle group in the file */
+        char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+        snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+                 ptype);
+        h_grp = H5Gopen(h_file, partTypeGroupName, H5P_DEFAULT);
+        if (h_grp < 0) {
+          error("Error while opening particle group %s.", partTypeGroupName);
+        }
+
+        /* Write particle fields from the particle structure */
+        switch (ptype) {
+
+          case GAS:
+            hydro_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                                  N[ptype], N_total[ptype], mpi_rank,
+                                  offset[ptype], parts, us);
+
+            break;
+
+          case DM:
+            /* Allocate temporary array */
+            if (posix_memalign((void*)&dmparts, gpart_align,
+                               Ndm * sizeof(struct gpart)) != 0)
+              error("Error while allocating temporart memory for DM particles");
+            bzero(dmparts, Ndm * sizeof(struct gpart));
+
+            /* Collect the DM particles from gpart */
+            collect_dm_gparts(gparts, Ntot, dmparts, Ndm);
+
+            /* Write DM particles */
+            darkmatter_write_particles(h_grp, fileName, partTypeGroupName,
+                                       xmfFile, N[ptype], N_total[ptype],
+                                       mpi_rank, offset[ptype], dmparts, us);
+
+            /* Free temporary array */
+            free(dmparts);
+            break;
+
+          default:
+            error("Particle Type %d not yet supported. Aborting", ptype);
+        }
+
+        /* Close particle group */
+        H5Gclose(h_grp);
+
+        /* Close this particle group in the XMF file as well */
+        if (mpi_rank == 0) writeXMFgroupfooter(xmfFile, ptype);
+      }
 
       /* Close file */
       H5Fclose(h_file);
@@ -681,7 +819,7 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
   }
 
-  /* Write footer of LXMF file descriptor */
+  /* Write the footer of the XMF file descriptor */
-  if (mpi_rank == 0) writeXMFfooter(xmfFile);
+  if (mpi_rank == 0) writeXMFoutputfooter(xmfFile, outputCount, e->time);
 
   /* message("Done writing particles..."); */
   ++outputCount;
diff --git a/src/serial_io.h b/src/serial_io.h
index 95f09f5977a97a359e978db7a1b71b02030d6a14..5a34d420cfabd88d4147e3f3630e0efe89951c41 100644
--- a/src/serial_io.h
+++ b/src/serial_io.h
@@ -32,8 +32,9 @@
 #if defined(HAVE_HDF5) && defined(WITH_MPI) && !defined(HAVE_PARALLEL_HDF5)
 
 void read_ic_serial(char* fileName, double dim[3], struct part** parts,
-                    size_t* N, int* periodic, int mpi_rank, int mpi_size,
-                    MPI_Comm comm, MPI_Info info);
+                    struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
+                    int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                    MPI_Info info);
 
 void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
                          int mpi_size, MPI_Comm comm, MPI_Info info);
diff --git a/src/single_io.c b/src/single_io.c
index 59686a68b5d9e5ea41267ba7b3aad9391862fae4..801428433ef5170082b68dec425e52f845bb41ae 100644
--- a/src/single_io.c
+++ b/src/single_io.c
@@ -39,9 +39,6 @@
 #include "common_io.h"
 #include "error.h"
 
-#define FILENAME_BUFFER_SIZE 150
-#define PARTICLE_GROUP_BUFFER_SIZE 20
-
 /*-----------------------------------------------------------------------------
  * Routines reading an IC file
  *-----------------------------------------------------------------------------*/
@@ -56,24 +53,23 @@
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
  *interest in the parts array
+ * @param partSize The size in bytes of the particle structure.
  * @param importance If COMPULSORY, the data must be present in the IC file. If
  *OPTIONAL, the array will be zeroed when the data is not present.
  *
  * @todo A better version using HDF5 hyper-slabs to read the file directly into
  *the part array
  * will be written once the structures have been stabilized.
- *
- * Calls #error() if an error occurs.
  */
 void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
-                      int dim, char* part_c, enum DATA_IMPORTANCE importance) {
+                      int dim, char* part_c, size_t partSize,
+                      enum DATA_IMPORTANCE importance) {
   hid_t h_data = 0, h_err = 0, h_type = 0;
   htri_t exist = 0;
   void* temp;
   int i = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
-  const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
 
   /* Check whether the dataspace exists or not */
@@ -141,23 +137,25 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
  * @param grp The group in which to write.
  * @param fileName The name of the file in which the data is written
  * @param xmfFile The FILE used to write the XMF description
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param name The name of the array to write.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
- *interest in the parts array
+ *interest in the parts array.
+ * @param partSize The size in bytes of the particle structure.
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
  *
  * @todo A better version using HDF5 hyper-slabs to write the file directly from
  *the part array
  * will be written once the structures have been stabilized.
- *
- * Calls #error() if an error occurs.
  */
-void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                       enum DATA_TYPE type, int N, int dim, char* part_c,
+void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
+                       char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                       int N, int dim, char* part_c, size_t partSize,
                        struct UnitSystem* us,
                        enum UnitConversionFactor convFactor) {
   hid_t h_data = 0, h_err = 0, h_space = 0, h_prop = 0;
@@ -165,7 +163,6 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
-  const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
   hsize_t shape[2];
   hsize_t chunk_shape[2];
@@ -204,7 +201,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
 
   /* Make sure the chunks are not larger than the dataset */
   if (chunk_shape[0] > N) chunk_shape[0] = N;
-  
+
   /* Change shape of data space */
   h_err = H5Sset_extent_simple(h_space, rank, shape, NULL);
   if (h_err < 0) {
@@ -241,7 +238,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   }
 
   /* Write XMF description for this data set */
-  writeXMFline(xmfFile, fileName, name, N, dim, type);
+  writeXMFline(xmfFile, fileName, partTypeGroupName, name, N, dim, type);
 
   /* Write unit conversion factors for this data set */
   conversionString(buffer, us, convFactor);
@@ -276,7 +273,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
 #define readArray(grp, name, type, N, dim, part, N_total, offset, field, \
                   importance)                                            \
   readArrayBackEnd(grp, name, type, N, dim, (char*)(&(part[0]).field),   \
-                   importance)
+                   sizeof(part[0]), importance)
 
 /**
- * @brief A helper macro to call the readArrayBackEnd function more easily.
+ * @brief A helper macro to call the writeArrayBackEnd function more easily.
@@ -285,6 +282,8 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param fileName The name of the file in which the data is written
  * @param xmfFile The FILE used to write the XMF description
  * @param name The name of the array to write.
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
@@ -298,10 +297,12 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, \
-                   mpi_rank, offset, field, us, convFactor)                   \
-  writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim,               \
-                    (char*)(&(part[0]).field), us, convFactor)
+#define writeArray(grp, fileName, xmfFile, partTypeGroupName, name, type, N,  \
+                   dim, part, N_total, mpi_rank, offset, field, us,           \
+                   convFactor)                                                \
+  writeArrayBackEnd(grp, fileName, xmfFile, partTypeGroupName, name, type, N, \
+                    dim, (char*)(&(part[0]).field), sizeof(part[0]), us,      \
+                    convFactor)
 
 /* Import the right hydro definition */
 #include "hydro_io.h"
@@ -314,9 +315,9 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param fileName The file to read.
  * @param dim (output) The dimension of the volume.
  * @param parts (output) Array of Gas particles.
- * @param gparts (output) Array of DM particles.
+ * @param gparts (output) Array of #gpart particles.
  * @param Ngas (output) number of Gas particles read.
- * @param Ngparts (output) The number of DM particles read.
+ * @param Ngparts (output) The number of #gpart read.
  * @param periodic (output) 1 if the volume is periodic, 0 if not.
  *
  * Opens the HDF5 file fileName and reads the particles contained
@@ -337,6 +338,8 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
   double boxSize[3] = {0.0, -1.0, -1.0};
   /* GADGET has 6 particle types. We only keep the type 0 & 1 for now...*/
   int numParticles[NUM_PARTICLE_TYPES] = {0};
+  int numParticles_highWord[NUM_PARTICLE_TYPES] = {0};
+  size_t N[NUM_PARTICLE_TYPES] = {0};
   size_t Ndm;
 
   /* Open file */
@@ -365,9 +368,12 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
   /* Read the relevant information and print status */
   readAttribute(h_grp, "BoxSize", DOUBLE, boxSize);
   readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
+  readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
+
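+  /* Combine the low and high 32-bit words into full 64-bit counts. */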
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+    N[ptype] = ((long long)numParticles[ptype]) +
+               ((long long)numParticles_highWord[ptype] << 32);
 
-  *Ngas = numParticles[0];
-  Ndm = numParticles[1];
   dim[0] = boxSize[0];
   dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1];
   dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2];
@@ -378,16 +384,16 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
   /* Close header */
   H5Gclose(h_grp);
 
-  /* Total number of particles */
-  *Ngparts = *Ngas + Ndm;
-
   /* Allocate memory to store SPH particles */
+  *Ngas = N[0];
   if (posix_memalign((void*)parts, part_align, *Ngas * sizeof(struct part)) !=
       0)
     error("Error while allocating memory for SPH particles");
   bzero(*parts, *Ngas * sizeof(struct part));
 
   /* Allocate memory to store all particles */
+  Ndm = N[1];
+  *Ngparts = N[1] + N[0];
   if (posix_memalign((void*)gparts, gpart_align,
                      *Ngparts * sizeof(struct gpart)) != 0)
     error("Error while allocating memory for gravity particles");
@@ -396,16 +402,14 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts,
   /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) /
    * (1024.*1024.)); */
 
-  /* Open SPH particles group */
-  /* message("Reading particle arrays..."); */
-  message("BoxSize = %lf", dim[0]);
-  message("NumPart = [%zd, %zd] Total = %zd", *Ngas, Ndm, *Ngparts);
+  /* message("BoxSize = %lf", dim[0]); */
+  /* message("NumPart = [%zd, %zd] Total = %zd", *Ngas, Ndm, *Ngparts); */
 
   /* Loop over all particle types */
   for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
 
     /* Don't do anything if no particle of this kind */
-    if (numParticles[ptype] == 0) continue;
+    if (N[ptype] == 0) continue;
 
     /* Open the particle group in the file */
     char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
@@ -476,10 +480,13 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
   static int outputCount = 0;
 
   /* Number of particles of each type */
-  const size_t Ndm = Ntot - Ngas;
-  int numParticles[NUM_PARTICLE_TYPES] = /* Gadget-2 convention here */
-      {Ngas, Ndm, 0};                    /* Could use size_t instead */
-  int numParticlesHighWord[NUM_PARTICLE_TYPES] = {0};
+  // const size_t Ndm = Ntot - Ngas;
+
+  /* MATTHIEU: Temporary fix to preserve master */
+  const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
+  /* MATTHIEU: End temporary fix */
+
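+  /* With a single node, the local counts are also the global totals. */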
+  long long N_total[NUM_PARTICLE_TYPES] = {Ngas, Ndm, 0};
 
   /* File name */
   char fileName[FILENAME_BUFFER_SIZE];
@@ -493,7 +500,7 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
   xmfFile = prepareXMFfile();
 
   /* Write the part corresponding to this specific output */
-  writeXMFheader(xmfFile, Ngas, fileName, e->time);
+  writeXMFoutputheader(xmfFile, fileName, e->time);
 
   /* Open file */
   /* message("Opening file '%s'.", fileName); */
@@ -521,19 +528,27 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
 
   /* Print the relevant information and print status */
   writeAttribute(h_grp, "BoxSize", DOUBLE, e->s->dim, 3);
-  writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles,
-                 NUM_PARTICLE_TYPES);
   double dblTime = e->time;
   writeAttribute(h_grp, "Time", DOUBLE, &dblTime, 1);
 
   /* GADGET-2 legacy values */
+  /* Number of particles of each type */
+  unsigned int numParticles[NUM_PARTICLE_TYPES] = {0};
+  unsigned int numParticlesHighWord[NUM_PARTICLE_TYPES] = {0};
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+    numParticles[ptype] = (unsigned int)N_total[ptype];
+    numParticlesHighWord[ptype] = (unsigned int)(N_total[ptype] >> 32);
+  }
+  writeAttribute(h_grp, "NumPart_ThisFile", LONGLONG, N_total,
+                 NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumPart_Total", UINT, numParticles,
                  NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord,
                  NUM_PARTICLE_TYPES);
-  double MassTable[NUM_PARTICLE_TYPES] = {0., 0., 0., 0., 0., 0.};
+  double MassTable[NUM_PARTICLE_TYPES] = {0};
   writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES);
-  writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, numParticlesHighWord,
+  unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0};
+  writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy,
                  NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1);
 
@@ -558,6 +573,9 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
     /* Don't do anything if no particle of this kind */
     if (numParticles[ptype] == 0) continue;
 
+    /* Add the global information for that particle type to the XMF meta-file */
+    writeXMFgroupheader(xmfFile, fileName, numParticles[ptype], ptype);
+
     /* Open the particle group in the file */
     char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
     snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
@@ -574,8 +592,8 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
     switch (ptype) {
 
       case GAS:
-        hydro_write_particles(h_grp, fileName, xmfFile, Ngas, Ngas, 0, 0, parts,
-                              us);
+        hydro_write_particles(h_grp, fileName, partTypeGroupName, xmfFile, Ngas,
+                              Ngas, 0, 0, parts, us);
         break;
 
       case DM:
@@ -589,8 +607,8 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
         collect_dm_gparts(gparts, Ntot, dmparts, Ndm);
 
         /* Write DM particles */
-        darkmatter_write_particles(h_grp, fileName, xmfFile, Ndm, Ndm, 0, 0,
-                                   dmparts, us);
+        darkmatter_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                                   Ndm, Ndm, 0, 0, dmparts, us);
 
         /* Free temporary array */
         free(dmparts);
@@ -602,10 +620,13 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
 
     /* Close particle group */
     H5Gclose(h_grp);
+
+    /* Close this particle group in the XMF file as well */
+    writeXMFgroupfooter(xmfFile, ptype);
   }
 
-  /* Write LXMF file descriptor */
+  /* Write the footer of the XMF file descriptor */
-  writeXMFfooter(xmfFile);
+  writeXMFoutputfooter(xmfFile, outputCount, e->time);
 
   /* message("Done writing particles..."); */
 
diff --git a/src/space.c b/src/space.c
index 954c9af7dd6d92adcd29d836dde16a61cf0f4792..d1a78b03da7c0a7cf5e742ec16912b94a442478b 100644
--- a/src/space.c
+++ b/src/space.c
@@ -97,12 +97,10 @@ const int sortlistID[27] = {
 int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
                  double *shift) {
 
-  int k, sid = 0, periodic = s->periodic;
-  struct cell *temp;
-  double dx[3];
-
   /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
+  const int periodic = s->periodic;
+  double dx[3];
+  for (int k = 0; k < 3; k++) {
     dx[k] = (*cj)->loc[k] - (*ci)->loc[k];
     if (periodic && dx[k] < -s->dim[k] / 2)
       shift[k] = s->dim[k];
@@ -114,15 +112,16 @@ int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
   }
 
   /* Get the sorting index. */
-  for (k = 0; k < 3; k++)
+  int sid = 0;
+  for (int k = 0; k < 3; k++)
     sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1));
 
   /* Switch the cells around? */
   if (runner_flip[sid]) {
-    temp = *ci;
+    struct cell *temp = *ci;
     *ci = *cj;
     *cj = temp;
-    for (k = 0; k < 3; k++) shift[k] = -shift[k];
+    for (int k = 0; k < 3; k++) shift[k] = -shift[k];
   }
   sid = sortlistID[sid];
 
@@ -137,10 +136,8 @@ int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
 
 void space_rebuild_recycle(struct space *s, struct cell *c) {
 
-  int k;
-
   if (c->split)
-    for (k = 0; k < 8; k++)
+    for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL) {
         space_rebuild_recycle(s, c->progeny[k]);
         space_recycle(s, c->progeny[k]);
@@ -158,19 +155,19 @@ void space_rebuild_recycle(struct space *s, struct cell *c) {
 
 void space_regrid(struct space *s, double cell_max, int verbose) {
 
-  float h_max = s->cell_min / kernel_gamma / space_stretch, dmin;
-  int i, j, k, cdim[3], nr_parts = s->nr_parts;
+  float h_max = s->cell_min / kernel_gamma / space_stretch;
+  const size_t nr_parts = s->nr_parts;
   struct cell *restrict c;
   ticks tic = getticks();
 
   /* Run through the parts and get the current h_max. */
   // tic = getticks();
   if (s->cells != NULL) {
-    for (k = 0; k < s->nr_cells; k++) {
+    for (int k = 0; k < s->nr_cells; k++) {
       if (s->cells[k].h_max > h_max) h_max = s->cells[k].h_max;
     }
   } else {
-    for (k = 0; k < nr_parts; k++) {
+    for (int k = 0; k < nr_parts; k++) {
       if (s->parts[k].h > h_max) h_max = s->parts[k].h;
     }
     s->h_max = h_max;
@@ -190,7 +187,8 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
   if (verbose) message("h_max is %.3e (cell_max=%.3e).", h_max, cell_max);
 
   /* Get the new putative cell dimensions. */
-  for (k = 0; k < 3; k++)
+  int cdim[3];
+  for (int k = 0; k < 3; k++)
     cdim[k] =
         floor(s->dim[k] / fmax(h_max * kernel_gamma * space_stretch, cell_max));
 
@@ -213,7 +211,7 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
 
     /* Free the old cells, if they were allocated. */
     if (s->cells != NULL) {
-      for (k = 0; k < s->nr_cells; k++) {
+      for (int k = 0; k < s->nr_cells; k++) {
         space_rebuild_recycle(s, &s->cells[k]);
         if (s->cells[k].sort != NULL) free(s->cells[k].sort);
       }
@@ -222,12 +220,12 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
     }
 
     /* Set the new cell dimensions only if smaller. */
-    for (k = 0; k < 3; k++) {
+    for (int k = 0; k < 3; k++) {
       s->cdim[k] = cdim[k];
       s->h[k] = s->dim[k] / cdim[k];
       s->ih[k] = 1.0 / s->h[k];
     }
-    dmin = fminf(s->h[0], fminf(s->h[1], s->h[2]));
+    const float dmin = fminf(s->h[0], fminf(s->h[1], s->h[2]));
 
     /* Allocate the highest level of cells. */
     s->tot_cells = s->nr_cells = cdim[0] * cdim[1] * cdim[2];
@@ -235,13 +233,13 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
                        s->nr_cells * sizeof(struct cell)) != 0)
       error("Failed to allocate cells.");
     bzero(s->cells, s->nr_cells * sizeof(struct cell));
-    for (k = 0; k < s->nr_cells; k++)
+    for (int k = 0; k < s->nr_cells; k++)
       if (lock_init(&s->cells[k].lock) != 0) error("Failed to init spinlock.");
 
     /* Set the cell location and sizes. */
-    for (i = 0; i < cdim[0]; i++)
-      for (j = 0; j < cdim[1]; j++)
-        for (k = 0; k < cdim[2]; k++) {
+    for (int i = 0; i < cdim[0]; i++)
+      for (int j = 0; j < cdim[1]; j++)
+        for (int k = 0; k < cdim[2]; k++) {
           c = &s->cells[cell_getid(cdim, i, j, k)];
           c->loc[0] = i * s->h[0];
           c->loc[1] = j * s->h[1];
@@ -271,7 +269,7 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
   else {
 
     /* Free the old cells, if they were allocated. */
-    for (k = 0; k < s->nr_cells; k++) {
+    for (int k = 0; k < s->nr_cells; k++) {
       space_rebuild_recycle(s, &s->cells[k]);
       s->cells[k].sorts = NULL;
       s->cells[k].nr_tasks = 0;
@@ -308,7 +306,7 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
 
 void space_rebuild(struct space *s, double cell_max, int verbose) {
 
-  ticks tic = getticks();
+  const ticks tic = getticks();
 
   /* Be verbose about this. */
   // message( "re)building space..." ); fflush(stdout);
@@ -320,23 +318,15 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   int nr_gparts = s->nr_gparts;
   struct cell *restrict cells = s->cells;
 
-  double ih[3], dim[3];
-  int cdim[3];
-  ih[0] = s->ih[0];
-  ih[1] = s->ih[1];
-  ih[2] = s->ih[2];
-  dim[0] = s->dim[0];
-  dim[1] = s->dim[1];
-  dim[2] = s->dim[2];
-  cdim[0] = s->cdim[0];
-  cdim[1] = s->cdim[1];
-  cdim[2] = s->cdim[2];
+  const double ih[3] = {s->ih[0], s->ih[1], s->ih[2]};
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
+  const int cdim[3] = {s->cdim[0], s->cdim[1], s->cdim[2]};
 
   /* Run through the particles and get their cell index. */
   // tic = getticks();
   const size_t ind_size = s->size_parts;
-  size_t *ind;
-  if ((ind = (size_t *)malloc(sizeof(size_t) * ind_size)) == NULL)
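+  /* An int is wide enough for a cell index and keeps this scratch array
+     compact. */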
+  int *ind;
+  if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL)
     error("Failed to allocate temporary particle indices.");
   for (int k = 0; k < nr_parts; k++) {
     struct part *restrict p = &s->parts[k];
@@ -349,37 +339,92 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
     cells[ind[k]].count++;
   }
+  // message( "getting particle indices took %.3f %s." ,
+  // clocks_from_ticks(getticks() - tic), clocks_getunit()):
+
+  /* Run through the gravity particles and get their cell index. */
+  // tic = getticks();
+  const size_t gind_size = s->size_gparts;
+  int *gind;
+  if ((gind = (int *)malloc(sizeof(int) * gind_size)) == NULL)
+    error("Failed to allocate temporary g-particle indices.");
+  for (int k = 0; k < nr_gparts; k++) {
+    struct gpart *restrict gp = &s->gparts[k];
+    for (int j = 0; j < 3; j++)
+      if (gp->x[j] < 0.0)
+        gp->x[j] += dim[j];
+      else if (gp->x[j] >= dim[j])
+        gp->x[j] -= dim[j];
+    gind[k] =
+        cell_getid(cdim, gp->x[0] * ih[0], gp->x[1] * ih[1], gp->x[2] * ih[2]);
+    cells[gind[k]].gcount++;
+  }
 // message( "getting particle indices took %.3f %s." ,
-// clocks_from_ticks(getticks() - tic), clocks_getunit()):
+// clocks_from_ticks(getticks() - tic), clocks_getunit());
 
 #ifdef WITH_MPI
   /* Move non-local parts to the end of the list. */
-  const int nodeID = s->e->nodeID;
+  const int local_nodeID = s->e->nodeID;
   for (int k = 0; k < nr_parts; k++)
-    if (cells[ind[k]].nodeID != nodeID) {
+    if (cells[ind[k]].nodeID != local_nodeID) {
       cells[ind[k]].count -= 1;
       nr_parts -= 1;
-      struct part tp = s->parts[k];
+      const struct part tp = s->parts[k];
       s->parts[k] = s->parts[nr_parts];
       s->parts[nr_parts] = tp;
-      struct xpart txp = s->xparts[k];
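+      /* Update the back-pointers of any gpart friends of the swapped parts
+         (id_or_neg_offset holds minus the part's index in s->parts). */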
+      if (s->parts[k].gpart != NULL) {
+        s->parts[k].gpart->id_or_neg_offset = -k;
+      }
+      if (s->parts[nr_parts].gpart != NULL) {
+        s->parts[nr_parts].gpart->id_or_neg_offset = -nr_parts;
+      }
+      const struct xpart txp = s->xparts[k];
       s->xparts[k] = s->xparts[nr_parts];
       s->xparts[nr_parts] = txp;
-      int t = ind[k];
+      const int t = ind[k];
       ind[k] = ind[nr_parts];
       ind[nr_parts] = t;
     }
 
+  /* Move non-local gparts to the end of the list. */
+  for (int k = 0; k < nr_gparts; k++)
+    if (cells[gind[k]].nodeID != local_nodeID) {
+      cells[gind[k]].gcount -= 1;
+      nr_gparts -= 1;
+      const struct gpart tp = s->gparts[k];
+      s->gparts[k] = s->gparts[nr_gparts];
+      s->gparts[nr_gparts] = tp;
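+      /* Re-link the part friends of the swapped gparts: a non-positive
+         id_or_neg_offset stores minus the friend's index in s->parts. */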
+      if (s->gparts[k].id_or_neg_offset <= 0) {
+        s->parts[-s->gparts[k].id_or_neg_offset].gpart = &s->gparts[k];
+      }
+      if (s->gparts[nr_gparts].id_or_neg_offset <= 0) {
+        s->parts[-s->gparts[nr_gparts].id_or_neg_offset].gpart =
+            &s->gparts[nr_gparts];
+      }
+      const int t = gind[k];
+      gind[k] = gind[nr_gparts];
+      gind[nr_gparts] = t;
+    }
+
   /* Exchange the strays, note that this potentially re-allocates
      the parts arrays. */
-  s->nr_parts =
-      nr_parts + engine_exchange_strays(s->e, nr_parts, &ind[nr_parts],
-                                        s->nr_parts - nr_parts);
+  /* TODO: This function also exchanges gparts, but the gpart exchange is
+     short-circuited until it is fully implemented. */
+  size_t nr_parts_exchanged = s->nr_parts - nr_parts;
+  size_t nr_gparts_exchanged = s->nr_gparts - nr_gparts;
+  engine_exchange_strays(s->e, nr_parts, &ind[nr_parts], &nr_parts_exchanged,
+                         nr_gparts, &gind[nr_gparts], &nr_gparts_exchanged);
+
+  /* Add post-processing, i.e. re-linking/creating of gparts here. */
+
+  /* Set the new particle counts. */
+  s->nr_parts = nr_parts + nr_parts_exchanged;
+  s->nr_gparts = nr_gparts + nr_gparts_exchanged;
 
   /* Re-allocate the index array if needed. */
   if (s->nr_parts > ind_size) {
-    size_t *ind_new;
-    if ((ind_new = (size_t *)malloc(sizeof(size_t) * s->nr_parts)) == NULL)
+    int *ind_new;
+    if ((ind_new = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
       error("Failed to allocate temporary particle indices.");
-    memcpy(ind_new, ind, sizeof(size_t) * nr_parts);
+    memcpy(ind_new, ind, sizeof(int) * nr_parts);
     free(ind);
@@ -388,7 +433,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
 
   /* Assign each particle to its cell. */
   for (int k = nr_parts; k < s->nr_parts; k++) {
-    struct part *p = &s->parts[k];
+    const struct part *const p = &s->parts[k];
     ind[k] =
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
     cells[ind[k]].count += 1;
@@ -418,65 +463,24 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   /* We no longer need the indices as of here. */
   free(ind);
 
-  /* Run through the gravity particles and get their cell index. */
-  // tic = getticks();
-  const size_t gind_size = s->size_gparts;
-  size_t *gind;
-  if ((gind = (size_t *)malloc(sizeof(size_t) * gind_size)) == NULL)
-    error("Failed to allocate temporary g-particle indices.");
-  for (int k = 0; k < nr_gparts; k++) {
-    struct gpart *gp = &s->gparts[k];
-    for (int j = 0; j < 3; j++)
-      if (gp->x[j] < 0.0)
-        gp->x[j] += dim[j];
-      else if (gp->x[j] >= dim[j])
-        gp->x[j] -= dim[j];
-    gind[k] =
-        cell_getid(cdim, gp->x[0] * ih[0], gp->x[1] * ih[1], gp->x[2] * ih[2]);
-    cells[gind[k]].gcount++;
-  }
-// message( "getting particle indices took %.3f %s." ,
-// clocks_from_ticks(getticks() - tic), clocks_getunit());
-
 #ifdef WITH_MPI
 
-  /* Move non-local gparts to the end of the list. */
-  for (int k = 0; k < nr_gparts; k++)
-    if (cells[ind[k]].nodeID != nodeID) {
-      cells[ind[k]].gcount -= 1;
-      nr_gparts -= 1;
-      struct gpart tp = s->gparts[k];
-      s->gparts[k] = s->gparts[nr_gparts];
-      s->gparts[nr_gparts] = tp;
-      int t = ind[k];
-      ind[k] = ind[nr_gparts];
-      ind[nr_gparts] = t;
-    }
-
-  /* Exchange the strays, note that this potentially re-allocates
-     the parts arrays. */
-  // s->nr_gparts =
-  //    nr_gparts + engine_exchange_strays(s->e, nr_gparts, &ind[nr_gparts],
-  //                                        s->nr_gparts - nr_gparts);
-  if (nr_gparts > 0)
-    error("Need to implement the exchange of strays for the gparts");
-
   /* Re-allocate the index array if needed. */
   if (s->nr_gparts > gind_size) {
-    size_t *gind_new;
-    if ((gind_new = (size_t *)malloc(sizeof(size_t) * s->nr_gparts)) == NULL)
+    int *gind_new;
+    if ((gind_new = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
       error("Failed to allocate temporary g-particle indices.");
-    memcpy(gind_new, gind, sizeof(size_t) * nr_gparts);
+    memcpy(gind_new, gind, sizeof(int) * nr_gparts);
     free(gind);
     gind = gind_new;
   }
 
   /* Assign each particle to its cell. */
   for (int k = nr_gparts; k < s->nr_gparts; k++) {
-    struct gpart *p = &s->gparts[k];
+    const struct gpart *const p = &s->gparts[k];
     gind[k] =
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
-    cells[gind[k]].count += 1;
+    cells[gind[k]].gcount += 1;
     /* if ( cells[ ind[k] ].nodeID != nodeID )
         error( "Received part that does not belong to me (nodeID=%i)." , cells[
        ind[k] ].nodeID ); */
@@ -494,6 +498,29 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   /* We no longer need the indices as of here. */
   free(gind);
 
+  /* Verify that the links are correct */
+  /* MATTHIEU: To be commented out once we are happy */
+  for (size_t k = 0; k < nr_gparts; ++k) {
+
+    if (s->gparts[k].id_or_neg_offset <= 0) {
+
+      const struct part *part = &s->parts[-s->gparts[k].id_or_neg_offset];
+
+      if (part->gpart != &s->gparts[k]) error("Linking problem !");
+
+      if (s->gparts[k].x[0] != part->x[0] || s->gparts[k].x[1] != part->x[1] ||
+          s->gparts[k].x[2] != part->x[2])
+        error("Linked particles are not at the same position !");
+    }
+  }
+  for (size_t k = 0; k < nr_parts; ++k) {
+
+    if (s->parts[k].gpart != NULL &&
+        s->parts[k].gpart->id_or_neg_offset != -(ptrdiff_t)k) {
+      error("Linking problem !");
+    }
+  }
+
   /* Hook the cells up to the parts. */
   // tic = getticks();
   struct part *finger = s->parts;
@@ -529,7 +556,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
  */
 void space_split(struct space *s, struct cell *cells, int verbose) {
 
-  ticks tic = getticks();
+  const ticks tic = getticks();
 
   for (int k = 0; k < s->nr_cells; k++)
     scheduler_addtask(&s->e->sched, task_type_split_cell, task_subtype_none, k,
@@ -553,7 +580,7 @@ void space_split(struct space *s, struct cell *cells, int verbose) {
  * @param verbose Are we talkative ?
  */
 
-void space_parts_sort(struct space *s, size_t *ind, size_t N, int min, int max,
+void space_parts_sort(struct space *s, int *ind, size_t N, int min, int max,
                       int verbose) {
 
   ticks tic = getticks();
@@ -601,7 +628,7 @@ void space_parts_sort(struct space *s, size_t *ind, size_t N, int min, int max,
 void space_do_parts_sort() {
 
   /* Pointers to the sorting data. */
-  size_t *ind = space_sort_struct.ind;
+  int *ind = space_sort_struct.ind;
   struct part *parts = space_sort_struct.parts;
   struct xpart *xparts = space_sort_struct.xparts;
 
@@ -723,7 +750,7 @@ void space_do_parts_sort() {
   } /* main loop. */
 }
 
-void space_gparts_sort(struct gpart *gparts, size_t *ind, size_t N, int min,
+void space_gparts_sort(struct gpart *gparts, int *ind, size_t N, int min,
                        int max) {
 
   struct qstack {
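
For reference, a minimal standalone sketch (not part of the patch; the structs
are reduced to the linking fields only) of the part/gpart invariant that the
swap loops and the verification pass in space_rebuild rely on:

    #include <assert.h>
    #include <stddef.h>

    /* Reduced stand-ins for the real SWIFT structs. */
    struct gpart { long long id_or_neg_offset; };
    struct part  { struct gpart *gpart; };

    /* A gpart storing a non-positive value is linked: the value is minus
       the index of its hydro counterpart, which must point straight back. */
    static void check_links(struct part *parts, struct gpart *gparts,
                            size_t nr_gparts) {
      for (size_t k = 0; k < nr_gparts; k++)
        if (gparts[k].id_or_neg_offset <= 0)
          assert(parts[-gparts[k].id_or_neg_offset].gpart == &gparts[k]);
    }

    int main(void) {
      struct gpart g;
      struct part p;
      g.id_or_neg_offset = 0; /* i.e. -0: linked to the part at index 0 */
      p.gpart = &g;
      check_links(&p, &g, 1);
      return 0;
    }
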
diff --git a/src/space.h b/src/space.h
index 91485ff7e2ebe9da8ab927748589ae9f71320803..e761595838ae78b0d8a67cca676cfa59f3f700f6 100644
--- a/src/space.h
+++ b/src/space.h
@@ -64,9 +64,6 @@ struct space {
   /* The minimum and maximum cutoff radii. */
   double h_max, cell_min;
 
-  /* Current time step for particles. */
-  float dt_step;
-
   /* Current maximum displacement for particles. */
   float dx_max;
 
@@ -106,6 +103,8 @@ struct space {
   /* Buffers for parts that we will receive from foreign cells. */
   struct part *parts_foreign;
   size_t nr_parts_foreign, size_parts_foreign;
+  struct gpart *gparts_foreign;
+  size_t nr_gparts_foreign, size_gparts_foreign;
 };
 
 /* Interval stack necessary for parallel particle sorting. */
@@ -117,7 +116,7 @@ struct qstack {
 struct parallel_sort {
   struct part *parts;
   struct xpart *xparts;
-  size_t *ind;
+  int *ind;
   struct qstack *stack;
   unsigned int stack_size;
   volatile unsigned int first, last, waiting;
@@ -125,9 +124,9 @@ struct parallel_sort {
 extern struct parallel_sort space_sort_struct;
 
 /* function prototypes. */
-void space_parts_sort(struct space *s, size_t *ind, size_t N, int min, int max,
+void space_parts_sort(struct space *s, int *ind, size_t N, int min, int max,
                       int verbose);
-void space_gparts_sort(struct gpart *gparts, size_t *ind, size_t N, int min,
+void space_gparts_sort(struct gpart *gparts, int *ind, size_t N, int min,
                        int max);
 struct cell *space_getcell(struct space *s);
 int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
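
The index arrays in parallel_sort switch from size_t to int above; a cell
index is bounded by the number of cells, which fits easily in an int, so the
temporary sort buffers shrink by half on 64-bit systems. A throwaway check of
that headroom (the 1024^3 grid is a deliberately extreme value):

    #include <limits.h>
    #include <stdio.h>

    int main(void) {
      /* Even a 1024^3-cell grid needs indices below INT_MAX. */
      const long long max_index = 1024LL * 1024LL * 1024LL - 1;
      printf("max index %lld fits in int: %s (INT_MAX = %d)\n", max_index,
             max_index <= INT_MAX ? "yes" : "no", INT_MAX);
      printf("sizeof(int) = %zu, sizeof(size_t) = %zu\n", sizeof(int),
             sizeof(size_t));
      return 0;
    }
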
diff --git a/src/swift.h b/src/swift.h
index 9ab090dccd195ff4927d3e614e446b36d273f824..e568a28c888295affc9ec45b6d059d34f5b4bf04 100644
--- a/src/swift.h
+++ b/src/swift.h
@@ -27,7 +27,6 @@
 #include "cell.h"
 #include "clocks.h"
 #include "const.h"
-#include "const.h"
 #include "cycle.h"
 #include "debug.h"
 #include "engine.h"
@@ -38,7 +37,9 @@
 #include "map.h"
 #include "multipole.h"
 #include "parallel_io.h"
+#include "parser.h"
 #include "part.h"
+#include "partition.h"
 #include "queue.h"
 #include "runner.h"
 #include "scheduler.h"
@@ -47,9 +48,8 @@
 #include "space.h"
 #include "task.h"
 #include "timers.h"
-#include "units.h"
 #include "tools.h"
-#include "partition.h"
+#include "units.h"
 #include "version.h"
 
 #endif /* SWIFT_SWIFT_H */
diff --git a/src/task.c b/src/task.c
index 69109f9e6d4fe8730a317db46ea3862e65ab90b2..74f8451d5dfaec2454f2eeed8670765e1be5b658 100644
--- a/src/task.c
+++ b/src/task.c
@@ -145,7 +145,7 @@ int task_lock(struct task *t) {
 
 #ifdef WITH_MPI
     /* Check the status of the MPI request. */
-    int res, err;
+    int res = 0, err = 0;
     MPI_Status stat;
     if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) {
       char buff[MPI_MAX_ERROR_STRING];
diff --git a/src/tools.c b/src/tools.c
index d5749e88e27a5f7491f5f5108586629ecc83d13e..1efdc027d3da50733372e73e1cfd6a9c7206784f 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -236,6 +236,51 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) {
   }
 }
 
+void self_all_density(struct runner *r, struct cell *ci) {
+  float r2, hi, hj, hig2, hjg2, dxi[3];
+  struct part *pi, *pj;
+
+  /* Implements a double-for loop and checks every interaction */
+  for (int i = 0; i < ci->count; ++i) {
+
+    pi = &ci->parts[i];
+    hi = pi->h;
+    hig2 = hi * hi * kernel_gamma2;
+
+    for (int j = i + 1; j < ci->count; ++j) {
+
+      pj = &ci->parts[j];
+      hj = pj->h;
+      hjg2 = hj * hj * kernel_gamma2;
+
+      /* Pairwise distance */
+      r2 = 0.0f;
+      for (int k = 0; k < 3; k++) {
+        dxi[k] = ci->parts[i].x[k] - ci->parts[j].x[k];
+        r2 += dxi[k] * dxi[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 < hig2) {
+
+        /* Interact */
+        runner_iact_nonsym_density(r2, dxi, hi, hj, pi, pj);
+      }
+
+      /* Hit or miss? */
+      if (r2 < hjg2) {
+
+        dxi[0] = -dxi[0];
+        dxi[1] = -dxi[1];
+        dxi[2] = -dxi[2];
+
+        /* Interact */
+        runner_iact_nonsym_density(r2, dxi, hj, hi, pj, pi);
+      }
+    }
+  }
+}
+
 void pairs_single_grav(double *dim, long long int pid,
                        struct gpart *__restrict__ gparts,
                        const struct part *parts, int N, int periodic) {
@@ -256,9 +303,9 @@ void pairs_single_grav(double *dim, long long int pid,
       break;
   if (k == N) error("Part not found.");
   pi = gparts[k];
-  pi.a[0] = 0.0f;
-  pi.a[1] = 0.0f;
-  pi.a[2] = 0.0f;
+  pi.a_grav[0] = 0.0f;
+  pi.a_grav[1] = 0.0f;
+  pi.a_grav[2] = 0.0f;
 
   /* Loop over all particle pairs. */
   for (k = 0; k < N; k++) {
@@ -276,15 +323,15 @@ void pairs_single_grav(double *dim, long long int pid,
     }
     r2 = fdx[0] * fdx[0] + fdx[1] * fdx[1] + fdx[2] * fdx[2];
     runner_iact_grav(r2, fdx, &pi, &pj);
-    a[0] += pi.a[0];
-    a[1] += pi.a[1];
-    a[2] += pi.a[2];
-    aabs[0] += fabsf(pi.a[0]);
-    aabs[1] += fabsf(pi.a[1]);
-    aabs[2] += fabsf(pi.a[2]);
-    pi.a[0] = 0.0f;
-    pi.a[1] = 0.0f;
-    pi.a[2] = 0.0f;
+    a[0] += pi.a_grav[0];
+    a[1] += pi.a_grav[1];
+    a[2] += pi.a_grav[2];
+    aabs[0] += fabsf(pi.a_grav[0]);
+    aabs[1] += fabsf(pi.a_grav[1]);
+    aabs[2] += fabsf(pi.a_grav[2]);
+    pi.a_grav[0] = 0.0f;
+    pi.a_grav[1] = 0.0f;
+    pi.a_grav[2] = 0.0f;
   }
 
   /* Dump the result. */
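
The new self_all_density above visits each unordered pair of particles in the
cell exactly once (the inner loop starts at i + 1) and applies the
non-symmetric density kernel in both directions when within range. A
standalone illustration of that loop structure and its pair count:

    #include <stdio.h>

    int main(void) {
      const int n = 5;
      int visits = 0;
      for (int i = 0; i < n; ++i)
        for (int j = i + 1; j < n; ++j)
          ++visits; /* each unordered pair (i, j) is seen exactly once */
      printf("%d pairs, expected n*(n-1)/2 = %d\n", visits, n * (n - 1) / 2);
      return 0;
    }
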
diff --git a/src/tools.h b/src/tools.h
index ed85c1bcb4c0bb34d255a8ab2fbf402b5dda6ba4..01226ee7cdbfe42aa44affadc4a9cbe02bad2428 100644
--- a/src/tools.h
+++ b/src/tools.h
@@ -34,6 +34,7 @@ void pairs_single_density(double *dim, long long int pid,
                           struct part *__restrict__ parts, int N, int periodic);
 
 void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj);
+void self_all_density(struct runner *r, struct cell *ci);
 
 void pairs_n2(double *dim, struct part *__restrict__ parts, int N,
               int periodic);
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f0bfbefd3c7f4591134d1707c4ac9bf63278e855..d66282059d874f345437d779d59ec3edb08e47cb 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -21,10 +21,12 @@ AM_CFLAGS = -I../src $(HDF5_CPPFLAGS) -DTIMER
 AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS)
 
 # List of programs and scripts to run in the test suite
-TESTS = testGreetings testReading.sh testSingle testTimeIntegration
+TESTS = testGreetings testReading.sh testSingle testPair.sh testPairPerturbed.sh \
+	test27cells.sh test27cellsPerturbed.sh testParser.sh
 
 # List of test programs to compile
-check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration testSPHStep testVectorize
+check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \
+		 testSPHStep testPair test27cells testParser
 
 # Sources for the individual programs
 testGreetings_SOURCES = testGreetings.c
@@ -37,7 +39,13 @@ testSPHStep_SOURCES = testSPHStep.c
 
 testSingle_SOURCES = testSingle.c
 
-testVectorize_SOURCES = testVectorize.c
+testPair_SOURCES = testPair.c
+
+test27cells_SOURCES = test27cells.c
+
+testParser_SOURCES = testParser.c
 
 # Files necessary for distribution
-EXTRA_DIST = testReading.sh makeInput.py
+EXTRA_DIST = testReading.sh makeInput.py testPair.sh testPairPerturbed.sh \
+	     test27cells.sh test27cellsPerturbed.sh tolerance.dat testParser.sh \
+	     testParserInput.yaml
diff --git a/tests/difffloat.py b/tests/difffloat.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbb7c95a1e77e04bbe21bec6dc6c5d529cd77c70
--- /dev/null
+++ b/tests/difffloat.py
@@ -0,0 +1,105 @@
+###############################################################################
+ # This file is part of SWIFT.
+ # Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ # 
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ # 
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ # GNU General Public License for more details.
+ # 
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ # 
+ ##############################################################################
+
+from numpy import *
+import sys
+
+abs_tol = 1e-7
+rel_tol = 1e-7
+
+# Compares the content of two ASCII tables of floats line by line and
+# reports all differences beyond the given tolerances
+# Comparisons are done both in absolute and relative terms
+
+# Individual tolerances for each column can be provided in a file
+
+file1 = sys.argv[1]
+file2 = sys.argv[2]
+fileTol = ""
+
+if len(sys.argv) == 4:
+    fileTol = sys.argv[3]
+
+data1 = loadtxt(file1)
+data2 = loadtxt(file2)
+if fileTol != "":
+    dataTol = loadtxt(fileTol)
+    n_linesTol = shape(dataTol)[0]
+    n_columnsTol = shape(dataTol)[1]
+
+
+if shape(data1) != shape(data2):
+    print "Non-matching array sizes in the files", file1, "and", file2, "."
+    sys.exit(1)
+
+n_lines = shape(data1)[0]
+n_columns = shape(data1)[1]
+
+if fileTol != "":
+    if n_linesTol != 2:
+        print "Incorrect number of lines in tolerance file '%s'."%fileTol
+        sys.exit(1)
+    if n_columnsTol != n_columns:
+        print "Incorrect number of columns in tolerance file '%s'."%fileTol
+        sys.exit(1)
+
+if fileTol == "":
+    print "Absolute difference tolerance:", abs_tol
+    print "Relative difference tolerance:", rel_tol
+    absTol = ones(n_columns) * abs_tol
+    relTol = ones(n_columns) * rel_tol
+else:
+    print "Tolerances read from file"
+    absTol = dataTol[0,:]
+    relTol = dataTol[1,:]
+
+error = False
+for i in range(n_lines):
+    for j in range(n_columns):
+
+        abs_diff = abs(data1[i,j] - data2[i,j])
+
+        abs_sum = abs(data1[i,j] + data2[i,j])
+        if abs_sum > 0:
+            rel_diff = abs_diff / abs_sum
+        else:
+            rel_diff = 0.
+
+        if( abs_diff > absTol[j]):
+            print "Absolute difference larger than tolerance (%e) on line %d, column %d:"%(absTol[j], i,j)
+            print "%10s:           a = %e"%("File 1", data1[i,j])
+            print "%10s:           b = %e"%("File 2", data2[i,j])
+            print "%10s:       |a-b| = %e"%("Difference", abs_diff)
+            print ""
+            error = True
+
+        if( rel_diff > relTol[j]):
+            print "Relative difference larger than tolerance (%e) on line %d, column %d:"%(relTol[j], i,j)
+            print "%10s:           a = %e"%("File 1", data1[i,j])
+            print "%10s:           b = %e"%("File 2", data2[i,j])
+            print "%10s: |a-b|/|a+b| = %e"%("Difference", rel_diff)
+            print ""
+            error = True
+
+
+if error:
+    sys.exit(1)
+else:
+    print "No differences found"
+    sys.exit(0)
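
The per-entry test difffloat.py applies can be restated as a small
self-contained C predicate (the tolerances below are the script's defaults;
column-specific values override them when a tolerance file is given):

    #include <math.h>
    #include <stdio.h>

    /* Flag a pair when |a-b| exceeds the absolute tolerance or
       |a-b|/|a+b| exceeds the relative one. */
    static int differs(double a, double b, double abs_tol, double rel_tol) {
      const double abs_diff = fabs(a - b);
      const double abs_sum = fabs(a + b);
      const double rel_diff = (abs_sum > 0.) ? abs_diff / abs_sum : 0.;
      return abs_diff > abs_tol || rel_diff > rel_tol;
    }

    int main(void) {
      printf("%d\n", differs(1.0, 1.0000002, 1e-7, 1e-7));  /* 1: |a-b|=2e-7 */
      printf("%d\n", differs(1.0, 1.00000005, 1e-7, 1e-7)); /* 0: within both */
      return 0;
    }
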
diff --git a/tests/test27cells.c b/tests/test27cells.c
new file mode 100644
index 0000000000000000000000000000000000000000..74c38996a81056b10633bf2bbf18cc7cff7e8f0d
--- /dev/null
+++ b/tests/test27cells.c
@@ -0,0 +1,368 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+#include <fenv.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include "swift.h"
+
+/**
+ * Returns a random number (uniformly distributed) in [a,b[
+ */
+double random_uniform(double a, double b) {
+  return (rand() / (double)RAND_MAX) * (b - a) + a;
+}
+
+/* n is the number of particles along each axis:
+ * particles are generated on a regular grid with spacing size/n
+ */
+struct cell *make_cell(size_t n, double *offset, double size, double h,
+                       double density, long long *partId, double pert) {
+  const size_t count = n * n * n;
+  const double volume = size * size * size;
+  struct cell *cell = malloc(sizeof(struct cell));
+  bzero(cell, sizeof(struct cell));
+
+  if (posix_memalign((void **)&cell->parts, part_align,
+                     count * sizeof(struct part)) != 0) {
+    error("couldn't allocate particles, no. of particles: %d", (int)count);
+  }
+  bzero(cell->parts, count * sizeof(struct part));
+
+  /* Construct the parts */
+  struct part *part = cell->parts;
+  for (size_t x = 0; x < n; ++x) {
+    for (size_t y = 0; y < n; ++y) {
+      for (size_t z = 0; z < n; ++z) {
+        part->x[0] =
+            offset[0] +
+            size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[1] =
+            offset[1] +
+            size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[2] =
+            offset[2] +
+            size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        // part->v[0] = part->x[0] - 1.5;
+        // part->v[1] = part->x[1] - 1.5;
+        // part->v[2] = part->x[2] - 1.5;
+        part->v[0] = random_uniform(-0.05, 0.05);
+        part->v[1] = random_uniform(-0.05, 0.05);
+        part->v[2] = random_uniform(-0.05, 0.05);
+        part->h = size * h / (float)n;
+        part->id = ++(*partId);
+        part->mass = density * volume / count;
+        part->ti_begin = 0;
+        part->ti_end = 1;
+        ++part;
+      }
+    }
+  }
+
+  /* Cell properties */
+  cell->split = 0;
+  cell->h_max = h;
+  cell->count = count;
+  cell->dx_max = 0.;
+  cell->h[0] = size;
+  cell->h[1] = size;
+  cell->h[2] = size;
+  cell->loc[0] = offset[0];
+  cell->loc[1] = offset[1];
+  cell->loc[2] = offset[2];
+
+  cell->ti_end_min = 1;
+  cell->ti_end_max = 1;
+
+  cell->sorted = 0;
+  cell->sort = NULL;
+  cell->sortsize = 0;
+  runner_dosort(NULL, cell, 0x1FFF, 0);
+
+  return cell;
+}
+
+void clean_up(struct cell *ci) {
+  free(ci->parts);
+  free(ci->sort);
+  free(ci);
+}
+
+/**
+ * @brief Initializes all particles field to be ready for a density calculation
+ */
+void zero_particle_fields(struct cell *c) {
+
+  for (size_t pid = 0; pid < c->count; pid++) {
+    c->parts[pid].rho = 0.f;
+    c->parts[pid].rho_dh = 0.f;
+    hydro_init_part(&c->parts[pid]);
+  }
+}
+
+/**
+ * @brief Ends the loop by adding the appropriate coefficients
+ */
+void end_calculation(struct cell *c) {
+
+  for (size_t pid = 0; pid < c->count; pid++) {
+    hydro_end_density(&c->parts[pid], 1);
+  }
+}
+
+/**
+ * @brief Dump all the particles to a file
+ */
+void dump_particle_fields(char *fileName, struct cell *main_cell,
+                          struct cell **cells) {
+
+  FILE *file = fopen(fileName, "w");
+
+  /* Write header */
+  fprintf(file,
+          "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s "
+          "%13s %13s %13s\n",
+          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh",
+          "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz");
+
+  fprintf(file, "# Main cell --------------------------------------------\n");
+
+  /* Write main cell */
+  for (size_t pid = 0; pid < main_cell->count; pid++) {
+    fprintf(file,
+            "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
+            "%13e %13e %13e\n",
+            main_cell->parts[pid].id, main_cell->parts[pid].x[0],
+            main_cell->parts[pid].x[1], main_cell->parts[pid].x[2],
+            main_cell->parts[pid].v[0], main_cell->parts[pid].v[1],
+            main_cell->parts[pid].v[2], main_cell->parts[pid].rho,
+            main_cell->parts[pid].rho_dh, main_cell->parts[pid].density.wcount,
+            main_cell->parts[pid].density.wcount_dh,
+#ifdef GADGET2_SPH
+            main_cell->parts[pid].div_v, main_cell->parts[pid].density.rot_v[0],
+            main_cell->parts[pid].density.rot_v[1],
+            main_cell->parts[pid].density.rot_v[2]
+#else
+            0., 0., 0., 0.
+#endif
+            );
+  }
+
+  /* Write all other cells */
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 3; ++k) {
+
+        struct cell *cj = cells[i * 9 + j * 3 + k];
+        if (cj == main_cell) continue;
+
+        fprintf(file,
+                "# Offset: [%2d %2d %2d] -----------------------------------\n",
+                i - 1, j - 1, k - 1);
+
+        for (size_t pjd = 0; pjd < cj->count; pjd++) {
+          fprintf(
+              file,
+              "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
+              "%13e %13e %13e\n",
+              cj->parts[pjd].id, cj->parts[pjd].x[0], cj->parts[pjd].x[1],
+              cj->parts[pjd].x[2], cj->parts[pjd].v[0], cj->parts[pjd].v[1],
+              cj->parts[pjd].v[2], cj->parts[pjd].rho, cj->parts[pjd].rho_dh,
+              cj->parts[pjd].density.wcount, cj->parts[pjd].density.wcount_dh,
+#ifdef GADGET2_SPH
+              cj->parts[pjd].div_v, cj->parts[pjd].density.rot_v[0],
+              cj->parts[pjd].density.rot_v[1], cj->parts[pjd].density.rot_v[2]
+#else
+              0., 0., 0., 0.
+#endif
+              );
+        }
+      }
+    }
+  }
+  fclose(file);
+}
+
+/* Just a forward declaration... */
+void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
+void runner_doself1_density(struct runner *r, struct cell *ci);
+
+/* And go... */
+int main(int argc, char *argv[]) {
+
+  size_t runs = 0, particles = 0;
+  double h = 1.12575, size = 1., rho = 1.;
+  double perturbation = 0.;
+  char outputFileNameExtension[200] = "";
+  char outputFileName[200] = "";
+
+  /* Initialize CPU frequency, this also starts time. */
+  unsigned long long cpufreq = 0;
+  clocks_set_cpufreq(cpufreq);
+
+  /* Get some randomness going */
+  srand(0);
+
+  int c;
+  while ((c = getopt(argc, argv, "m:s:h:p:r:t:d:f:")) != -1) {
+    switch (c) {
+      case 'h':
+        sscanf(optarg, "%lf", &h);
+        break;
+      case 's':
+        sscanf(optarg, "%lf", &size);
+        break;
+      case 'p':
+        sscanf(optarg, "%zu", &particles);
+        break;
+      case 'r':
+        sscanf(optarg, "%zu", &runs);
+        break;
+      case 'd':
+        sscanf(optarg, "%lf", &perturbation);
+        break;
+      case 'm':
+        sscanf(optarg, "%lf", &rho);
+        break;
+      case 'f':
+        strcpy(outputFileNameExtension, optarg);
+        break;
+      case '?':
+        error("Unknown option.");
+        break;
+    }
+  }
+
+  if (h < 0 || particles == 0 || runs == 0) {
+    printf(
+        "\nUsage: %s -p PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n"
+        "\nGenerates a cell pair, filled with particles on a Cartesian grid."
+        "\nThese are then interacted using runner_dopair1_density."
+        "\n\nOptions:"
+        "\n-h DISTANCE=1.1255 - Smoothing length"
+        "\n-m rho             - Physical density in the cell"
+        "\n-s size            - Physical size of the cell"
+        "\n-d pert            - Perturbation to apply to the particles [0,1["
+        "\n-f fileName        - Part of the file name used to save the dumps\n",
+        argv[0]);
+    exit(1);
+  }
+
+  /* Help users... */
+  message("Smoothing length: h = %f", h);
+  message("Neighbour target: N = %f", kernel_nwneigh);
+
+  /* Build the infrastructure */
+  struct space space;
+  space.periodic = 0;
+  space.h_max = h;
+
+  struct engine engine;
+  engine.s = &space;
+  engine.time = 0.1f;
+  engine.ti_current = 1;
+
+  struct runner runner;
+  runner.e = &engine;
+
+  /* Construct some cells */
+  struct cell *cells[27];
+  struct cell *main_cell;
+  static long long partId = 0;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 3; ++k) {
+
+        double offset[3] = {i * size, j * size, k * size};
+        cells[i * 9 + j * 3 + k] =
+            make_cell(particles, offset, size, h, rho, &partId, perturbation);
+      }
+    }
+  }
+
+  /* Store the main cell for future use */
+  main_cell = cells[13];
+
+  ticks time = 0;
+  for (size_t i = 0; i < runs; ++i) {
+
+    /* Zero the fields */
+    for (int j = 0; j < 27; ++j) zero_particle_fields(cells[j]);
+
+    const ticks tic = getticks();
+
+    /* Run all the pairs */
+    for (int j = 0; j < 27; ++j)
+      if (cells[j] != main_cell)
+        runner_dopair1_density(&runner, main_cell, cells[j]);
+
+    /* And now the self-interaction */
+    runner_doself1_density(&runner, main_cell);
+
+    const ticks toc = getticks();
+    time += toc - tic;
+
+    /* Let's get physical ! */
+    end_calculation(main_cell);
+
+    /* Dump if necessary */
+    if (i % 50 == 0) {
+      sprintf(outputFileName, "swift_dopair_27_%s.dat",
+              outputFileNameExtension);
+      dump_particle_fields(outputFileName, main_cell, cells);
+    }
+  }
+
+  /* Output timing */
+  message("SWIFT calculation took       : %15lli ticks.", time / runs);
+
+  /* Now perform a brute-force version for accuracy tests */
+
+  /* Zero the fields */
+  for (int i = 0; i < 27; ++i) zero_particle_fields(cells[i]);
+
+  const ticks tic = getticks();
+
+  /* Run all the brute-force pairs */
+  for (int j = 0; j < 27; ++j)
+    if (cells[j] != main_cell) pairs_all_density(&runner, main_cell, cells[j]);
+
+  /* And now the self-interaction */
+  self_all_density(&runner, main_cell);
+
+  const ticks toc = getticks();
+
+  /* Let's get physical ! */
+  end_calculation(main_cell);
+
+  /* Dump */
+  sprintf(outputFileName, "brute_force_27_%s.dat", outputFileNameExtension);
+  dump_particle_fields(outputFileName, main_cell, cells);
+
+  /* Output timing */
+  message("Brute force calculation took : %15lli ticks.", toc - tic);
+
+  /* Clean things to make the sanitizer happy ... */
+  for (int i = 0; i < 27; ++i) clean_up(cells[i]);
+
+  return 0;
+}
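
A standalone sketch of the flat indexing test27cells.c uses for the 3x3x3
cube of cells: cells[i * 9 + j * 3 + k] holds the cell at offset
(i-1, j-1, k-1) from the main cell, which therefore sits at index 13:

    #include <stdio.h>

    int main(void) {
      for (int i = 0; i < 3; ++i)
        for (int j = 0; j < 3; ++j)
          for (int k = 0; k < 3; ++k)
            printf("index %2d -> offset [%2d %2d %2d]\n", i * 9 + j * 3 + k,
                   i - 1, j - 1, k - 1);
      return 0; /* index 13 maps to offset [ 0  0  0], the main cell */
    }
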
diff --git a/tests/test27cells.sh b/tests/test27cells.sh
new file mode 100755
index 0000000000000000000000000000000000000000..09d2513bd3ef404c7bf434948af7f10306c98ede
--- /dev/null
+++ b/tests/test27cells.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat
+
+./test27cells -p 6 -r 1 -d 0 -f standard
+
+python difffloat.py brute_force_27_standard.dat swift_dopair_27_standard.dat tolerance.dat
+
+exit $?
diff --git a/tests/test27cellsPerturbed.sh b/tests/test27cellsPerturbed.sh
new file mode 100755
index 0000000000000000000000000000000000000000..73d2933984d38f7dcc992f07ec2e016f3544b636
--- /dev/null
+++ b/tests/test27cellsPerturbed.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat
+
+./test27cells -p 6 -r 1 -d 0.1 -f perturbed
+
+python difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat tolerance.dat
+
+exit $?
diff --git a/tests/testPair.c b/tests/testPair.c
new file mode 100644
index 0000000000000000000000000000000000000000..23ce4eb3de460f4e17b7b6f81cb39a628f3d100f
--- /dev/null
+++ b/tests/testPair.c
@@ -0,0 +1,305 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+#include <fenv.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include "swift.h"
+
+/**
+ * Returns a random number (uniformly distributed) in [a,b[
+ */
+double random_uniform(double a, double b) {
+  return (rand() / (double)RAND_MAX) * (b - a) + a;
+}
+
+/* n is the number of particles along each axis:
+ * particles are generated on a regular grid with spacing size/n
+ */
+struct cell *make_cell(size_t n, double *offset, double size, double h,
+                       double density, unsigned long long *partId,
+                       double pert) {
+  const size_t count = n * n * n;
+  const double volume = size * size * size;
+  struct cell *cell = malloc(sizeof(struct cell));
+  bzero(cell, sizeof(struct cell));
+
+  if (posix_memalign((void **)&cell->parts, part_align,
+                     count * sizeof(struct part)) != 0) {
+    error("couldn't allocate particles, no. of particles: %d", (int)count);
+  }
+  bzero(cell->parts, count * sizeof(struct part));
+
+  /* Construct the parts */
+  struct part *part = cell->parts;
+  for (size_t x = 0; x < n; ++x) {
+    for (size_t y = 0; y < n; ++y) {
+      for (size_t z = 0; z < n; ++z) {
+        part->x[0] =
+            offset[0] +
+            size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[1] =
+            offset[1] +
+            size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        part->x[2] =
+            offset[2] +
+            size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
+        // part->v[0] = part->x[0] - 1.5;
+        // part->v[1] = part->x[1] - 1.5;
+        // part->v[2] = part->x[2] - 1.5;
+        part->v[0] = random_uniform(-0.05, 0.05);
+        part->v[1] = random_uniform(-0.05, 0.05);
+        part->v[2] = random_uniform(-0.05, 0.05);
+        part->h = size * h / (float)n;
+        part->id = ++(*partId);
+        part->mass = density * volume / count;
+        part->ti_begin = 0;
+        part->ti_end = 1;
+        ++part;
+      }
+    }
+  }
+
+  /* Cell properties */
+  cell->split = 0;
+  cell->h_max = h;
+  cell->count = count;
+  cell->dx_max = 0.;
+  cell->h[0] = size;
+  cell->h[1] = size;
+  cell->h[2] = size;
+  cell->loc[0] = offset[0];
+  cell->loc[1] = offset[1];
+  cell->loc[2] = offset[2];
+
+  cell->ti_end_min = 1;
+  cell->ti_end_max = 1;
+
+  cell->sorted = 0;
+  cell->sort = NULL;
+  cell->sortsize = 0;
+  runner_dosort(NULL, cell, 0x1FFF, 0);
+
+  return cell;
+}
+
+void clean_up(struct cell *ci) {
+  free(ci->parts);
+  free(ci->sort);
+  free(ci);
+}
+
+/**
+ * @brief Initializes all particles field to be ready for a density calculation
+ */
+void zero_particle_fields(struct cell *c) {
+
+  for (size_t pid = 0; pid < c->count; pid++) {
+    c->parts[pid].rho = 0.f;
+    c->parts[pid].rho_dh = 0.f;
+    hydro_init_part(&c->parts[pid]);
+  }
+}
+
+/**
+ * @brief Dump all the particles to a file
+ */
+void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
+
+  FILE *file = fopen(fileName, "w");
+
+  /* Write header */
+  fprintf(file,
+          "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s "
+          "%13s %13s %13s\n",
+          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh",
+          "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz");
+
+  fprintf(file, "# ci --------------------------------------------\n");
+
+  for (size_t pid = 0; pid < ci->count; pid++) {
+    fprintf(file,
+            "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
+            "%13e %13e %13e\n",
+            ci->parts[pid].id, ci->parts[pid].x[0], ci->parts[pid].x[1],
+            ci->parts[pid].x[2], ci->parts[pid].v[0], ci->parts[pid].v[1],
+            ci->parts[pid].v[2], ci->parts[pid].rho, ci->parts[pid].rho_dh,
+            ci->parts[pid].density.wcount, ci->parts[pid].density.wcount_dh,
+#ifdef GADGET2_SPH
+            ci->parts[pid].div_v, ci->parts[pid].density.rot_v[0],
+            ci->parts[pid].density.rot_v[1], ci->parts[pid].density.rot_v[2]
+#else
+            0., 0., 0., 0.
+#endif
+            );
+  }
+
+  fprintf(file, "# cj --------------------------------------------\n");
+
+  for (size_t pjd = 0; pjd < cj->count; pjd++) {
+    fprintf(file,
+            "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
+            "%13e %13e %13e\n",
+            cj->parts[pjd].id, cj->parts[pjd].x[0], cj->parts[pjd].x[1],
+            cj->parts[pjd].x[2], cj->parts[pjd].v[0], cj->parts[pjd].v[1],
+            cj->parts[pjd].v[2], cj->parts[pjd].rho, cj->parts[pjd].rho_dh,
+            cj->parts[pjd].density.wcount, cj->parts[pjd].density.wcount_dh,
+#ifdef GADGET2_SPH
+            cj->parts[pjd].div_v, cj->parts[pjd].density.rot_v[0],
+            cj->parts[pjd].density.rot_v[1], cj->parts[pjd].density.rot_v[2]
+#else
+            0., 0., 0., 0.
+#endif
+            );
+  }
+
+  fclose(file);
+}
+
+/* Just a forward declaration... */
+void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
+
+int main(int argc, char *argv[]) {
+  size_t particles = 0, runs = 0, volume, type = 0;
+  double offset[3] = {0, 0, 0}, h = 1.1255, size = 1., rho = 1.;
+  double perturbation = 0.1;
+  struct cell *ci, *cj;
+  struct space space;
+  struct engine engine;
+  struct runner runner;
+  int c;
+  static unsigned long long partId = 0;
+  char outputFileNameExtension[200] = "";
+  char outputFileName[200] = "";
+  ticks tic, toc, time;
+
+  /* Initialize CPU frequency, this also starts time. */
+  unsigned long long cpufreq = 0;
+  clocks_set_cpufreq(cpufreq);
+
+  srand(0);
+
+  while ((c = getopt(argc, argv, "h:p:r:t:d:f:")) != -1) {
+    switch (c) {
+      case 'h':
+        sscanf(optarg, "%lf", &h);
+        break;
+      case 'p':
+        sscanf(optarg, "%zu", &particles);
+        break;
+      case 'r':
+        sscanf(optarg, "%zu", &runs);
+        break;
+      case 't':
+        sscanf(optarg, "%zu", &type);
+        break;
+      case 'd':
+        sscanf(optarg, "%lf", &perturbation);
+        break;
+      case 'f':
+        strcpy(outputFileNameExtension, optarg);
+        break;
+      case '?':
+        error("Unknown option.");
+        break;
+    }
+  }
+
+  if (h < 0 || particles == 0 || runs == 0 || type > 2) {
+    printf(
+        "\nUsage: %s -p PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n"
+        "\nGenerates a cell pair, filled with particles on a Cartesian grid."
+        "\nThese are then interacted using runner_dopair1_density."
+        "\n\nOptions:"
+        "\n-t TYPE=0          - cells share face (0), edge (1) or corner (2)"
+        "\n-h DISTANCE=1.1255 - smoothing length"
+        "\n-d pert            - perturbation to apply to the particles [0,1["
+        "\n-f fileName        - part of the file name used to save the dumps\n",
+        argv[0]);
+    exit(1);
+  }
+
+  space.periodic = 0;
+  space.h_max = h;
+
+  engine.s = &space;
+  engine.time = 0.1f;
+  engine.ti_current = 1;
+  runner.e = &engine;
+
+  volume = particles * particles * particles;
+  message("particles: %zu B\npositions: 0 B", 2 * volume * sizeof(struct part));
+
+  ci = make_cell(particles, offset, size, h, rho, &partId, perturbation);
+  for (size_t i = 0; i < type + 1; ++i) offset[i] = size;
+  cj = make_cell(particles, offset, size, h, rho, &partId, perturbation);
+
+  time = 0;
+  for (size_t i = 0; i < runs; ++i) {
+
+    /* Zero the fields */
+    zero_particle_fields(ci);
+    zero_particle_fields(cj);
+
+    tic = getticks();
+
+    /* Run the test */
+    runner_dopair1_density(&runner, ci, cj);
+
+    toc = getticks();
+    time += toc - tic;
+
+    /* Dump if necessary */
+    if (i % 50 == 0) {
+      sprintf(outputFileName, "swift_dopair_%s.dat", outputFileNameExtension);
+      dump_particle_fields(outputFileName, ci, cj);
+    }
+  }
+
+  /* Output timing */
+  message("SWIFT calculation took       %lli ticks.", time / runs);
+
+  /* Now perform a brute-force version for accuracy tests */
+
+  /* Zero the fields */
+  zero_particle_fields(ci);
+  zero_particle_fields(cj);
+
+  tic = getticks();
+
+  /* Run the brute-force test */
+  pairs_all_density(&runner, ci, cj);
+
+  toc = getticks();
+
+  /* Dump */
+  sprintf(outputFileName, "brute_force_%s.dat", outputFileNameExtension);
+  dump_particle_fields(outputFileName, ci, cj);
+
+  /* Output timing */
+  message("Brute force calculation took %lli ticks.", toc - tic);
+
+  /* Clean things to make the sanitizer happy ... */
+  clean_up(ci);
+  clean_up(cj);
+
+  return 0;
+}
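
A standalone sketch of how the -t flag in testPair.c selects the pair
geometry: setting the first (type + 1) components of the offset to the cell
size makes the second cell share a face (t=0), an edge (t=1) or a corner
(t=2) with the first:

    #include <stdio.h>

    int main(void) {
      const double size = 1.;
      const char *names[] = {"face", "edge", "corner"};
      for (int type = 0; type < 3; ++type) {
        double offset[3] = {0., 0., 0.};
        for (int i = 0; i < type + 1; ++i) offset[i] = size;
        printf("type %d -> cj at [%.0f %.0f %.0f] (%s)\n", type, offset[0],
               offset[1], offset[2], names[type]);
      }
      return 0;
    }
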
diff --git a/tests/testPair.sh b/tests/testPair.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f6f505e56a2c7a5c3cff0ec04bd871278634193c
--- /dev/null
+++ b/tests/testPair.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+rm -f brute_force_standard.dat swift_dopair_standard.dat
+
+./testPair -p 6 -r 1 -d 0 -f standard
+
+python difffloat.py brute_force_standard.dat swift_dopair_standard.dat tolerance.dat
+
+exit $?
diff --git a/tests/testPairPerturbed.sh b/tests/testPairPerturbed.sh
new file mode 100755
index 0000000000000000000000000000000000000000..544ba1b032da8426c065dcfb2ce3ee554c5e76a1
--- /dev/null
+++ b/tests/testPairPerturbed.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+rm -f brute_force_perturbed.dat swift_dopair_perturbed.dat
+
+./testPair -p 6 -r 1 -d 0.1 -f perturbed
+
+python difffloat.py brute_force_perturbed.dat swift_dopair_perturbed.dat tolerance.dat
+
+exit $?
diff --git a/tests/testParser.c b/tests/testParser.c
new file mode 100644
index 0000000000000000000000000000000000000000..a4b8789fca056fef659bca78eae9d0effb2ceb66
--- /dev/null
+++ b/tests/testParser.c
@@ -0,0 +1,68 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (C) 2016 James Willis (james.s.willis@durham.ac.uk).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+#include "parser.h"
+#include <assert.h>
+#include <string.h>
+#include <math.h>
+
+int main(int argc, char *argv[]) {
+
+  const char *input_file = argv[1];
+
+  /* Create a structure to read file into. */
+  struct swift_params param_file;
+
+  /* Create variables that will be set from the parameter file. */
+  int no_of_threads = 0;
+  int no_of_time_steps = 0;
+  float max_h = 0.0f;
+  double start_time = 0.0;
+  char ic_file[PARSER_MAX_LINE_SIZE];
+
+  /* Read the parameter file. */
+  parser_read_file(input_file, &param_file);
+
+  /* Print the contents of the structure. */
+  parser_print_params(&param_file);
+
+  /* Retrieve parameters and store them in variables defined above.
+   * Have to specify the name of the parameter as it appears in the
+   * input file: testParserInput.yaml.*/
+  parser_get_param_int(&param_file, "no_of_threads", &no_of_threads);
+  parser_get_param_int(&param_file, "no_of_time_steps", &no_of_time_steps);
+  parser_get_param_float(&param_file, "max_h", &max_h);
+  parser_get_param_double(&param_file, "start_time", &start_time);
+  parser_get_param_string(&param_file, "ic_file", ic_file);
+
+  /* Print the variables to check their values are correct. */
+  printf(
+      "no_of_threads: %d, no_of_time_steps: %d, max_h: %f, start_time: %lf, "
+      "ic_file: %s\n",
+      no_of_threads, no_of_time_steps, max_h, start_time, ic_file);
+
+  assert(no_of_threads == 16);
+  assert(no_of_time_steps == 10);
+  assert(fabs(max_h - 1.1255) < 0.00001);
+  assert(fabs(start_time - 1.23456789) < 0.00001);
+  assert(strcmp(ic_file, "ic_file.ini") == 0); /* strcmp returns 0 on match. */
+
+  return 0;
+}
diff --git a/tests/testParser.sh b/tests/testParser.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3dad7f386f792ff2beb6e94eb093bad4085023a4
--- /dev/null
+++ b/tests/testParser.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+./testParser testParserInput.yaml
diff --git a/tests/testParserInput.yaml b/tests/testParserInput.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d695e6a8ddd327e31224f36a6e34767ea8d36408
--- /dev/null
+++ b/tests/testParserInput.yaml
@@ -0,0 +1,9 @@
+---
+no_of_threads:      16 # The number of threads that will be used. 
+no_of_time_steps:   10
+max_h:              1.1255
+start_time:         1.23456789
+#Input file
+ic_file:            ic_file.ini
+
+...
diff --git a/tests/testReading.c b/tests/testReading.c
index d2a2a766171a85ace486914f0f39a987d9d8c3d3..9dda4c7bad75d35a8a93e0c2acb0619409a91afd 100644
--- a/tests/testReading.c
+++ b/tests/testReading.c
@@ -22,7 +22,7 @@
 
 int main() {
 
-  int Ngas = -1, Ngpart = -1;
+  size_t Ngas = 0, Ngpart = 0;
   int periodic = -1;
   int i, j, k, n;
   double dim[3];
diff --git a/tests/testSPHStep.c b/tests/testSPHStep.c
index 984b8ea867250d0bda1bc14d2600279a27321b2c..223078ecb637e64d94e37cdf8c0f60a86bdd5ff7 100644
--- a/tests/testSPHStep.c
+++ b/tests/testSPHStep.c
@@ -77,6 +77,10 @@ struct cell *make_cell(size_t N, float cellSize, int offset[3], int id_offset) {
 
 #ifdef DEFAULT_SPH
 
+/* Just a forward declaration... */
+void runner_doself1_density(struct runner *r, struct cell *ci);
+void runner_doself2_force(struct runner *r, struct cell *ci);
+
 /* Run a full time step integration for one cell */
 int main() {
 
@@ -132,7 +136,7 @@ int main() {
 
   /* Initialise the particles */
   for (j = 0; j < 27; ++j) {
-    runner_doinit(&r, cells[j]);
+    runner_doinit(&r, cells[j], 0);
   }
 
   /* Compute density */
@@ -145,7 +149,7 @@ int main() {
   runner_doself2_force(&r, ci);
   runner_dokick(&r, ci, 1);
 
-  message("t_end=%f", p->t_end);
+  message("ti_end=%d", p->ti_end);
 
   free(ci->parts);
   free(ci->xparts);
diff --git a/tests/testSingle.c b/tests/testSingle.c
index c85b77ff1c5b2285c33fa7787bbd53deab463039..8771fba0c1912905d3936562fa9dad0223d89220 100644
--- a/tests/testSingle.c
+++ b/tests/testSingle.c
@@ -91,8 +91,8 @@ int main(int argc, char *argv[]) {
   p2.force.POrho2 = p2.u * (const_hydro_gamma - 1.0f) / p2.rho;
 
   /* Dump a header. */
-  printParticle_single(&p1);
-  printParticle_single(&p2);
+  //printParticle_single(&p1, NULL);
+  //printParticle_single(&p2, NULL);
   printf("# r a_1 udt_1 a_2 udt_2\n");
 
   /* Loop over the different radii. */
@@ -103,9 +103,9 @@ int main(int argc, char *argv[]) {
     r2 = dx[0] * dx[0];
 
     /* Clear the particle fields. */
-    p1.a[0] = 0.0f;
+    p1.a_hydro[0] = 0.0f;
     p1.force.u_dt = 0.0f;
-    p2.a[0] = 0.0f;
+    p2.a_hydro[0] = 0.0f;
     p2.force.u_dt = 0.0f;
 
     /* Interact the particles. */
@@ -130,8 +130,8 @@ int main(int argc, char *argv[]) {
 
     /* Output the results. */
     printf(
-        "%.3e %.3e %.3e %.3e %.3e %.3e %.3e %.3e %.3e %.3e\n", -dx[0], p1.a[0],
-        p1.a[1], p1.a[2], p1.force.u_dt,
+        "%.3e %.3e %.3e %.3e %.3e %.3e %.3e %.3e %.3e %.3e\n", -dx[0],
+        p1.a_hydro[0], p1.a_hydro[1], p1.a_hydro[2], p1.force.u_dt,
         /// -dx[0] , p1.rho , p1.density.wcount , p2.rho , p2.density.wcount ,
         w, dwdx, gradw[0], gradw[1], gradw[2]);
 
diff --git a/tests/testVectorize.c b/tests/testVectorize.c
deleted file mode 100644
index a18b6e8af5ac3f7b94bd7be3bdf8fd21e49681ff..0000000000000000000000000000000000000000
--- a/tests/testVectorize.c
+++ /dev/null
@@ -1,212 +0,0 @@
-#include <fenv.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <unistd.h>
-#include "swift.h"
-
-/* n is both particles per axis and box size:
- * particles are generated on a mesh with unit spacing
- */
-struct cell *make_cell(size_t n, double *offset, double h,
-                       unsigned long long *partId) {
-  size_t count = n * n * n;
-  struct cell *cell = malloc(sizeof *cell);
-  struct part *part;
-  size_t x, y, z, size;
-
-  size = count * sizeof(struct part);
-  if (posix_memalign((void **)&cell->parts, part_align, size) != 0) {
-    error("couldn't allocate particles, no. of particles: %d", (int)count);
-  }
-
-  part = cell->parts;
-  for (x = 0; x < n; ++x) {
-    for (y = 0; y < n; ++y) {
-      for (z = 0; z < n; ++z) {
-        // Add .5 for symmetry: 0.5, 1.5, 2.5 vs. 0, 1, 2
-        part->x[0] = x + offset[0] + 0.5;
-        part->x[1] = y + offset[1] + 0.5;
-        part->x[2] = z + offset[2] + 0.5;
-        part->v[0] = 1.0f;
-        part->v[1] = 1.0f;
-        part->v[2] = 1.0f;
-        part->h = h;
-        part->id = ++(*partId);
-        part->mass = 1.0f;
-        part->ti_begin = 0;
-        part->ti_end = 1;
-        ++part;
-      }
-    }
-  }
-
-  cell->split = 0;
-  cell->h_max = h;
-  cell->count = count;
-  cell->dx_max = 1.;
-  cell->h[0] = n;
-  cell->h[1] = n;
-  cell->h[2] = n;
-
-  cell->sort = malloc(13 * count * sizeof *cell->sort);
-  runner_dosort(NULL, cell, 0x1FFF, 0);
-
-  return cell;
-}
-
-void clean_up(struct cell *ci) {
-  free(ci->parts);
-  free(ci->sort);
-  free(ci);
-}
-
-/**
- * @brief Initializes all particles field to be ready for a density calculation
- */
-void zero_particle_fields(struct cell *c) {
-
-  for (size_t pid = 0; pid < c->count; pid++) {
-    c->parts[pid].rho = 0.f;
-    c->parts[pid].rho_dh = 0.f;
-    hydro_init_part(&c->parts[pid]);
-  }
-}
-
-/**
- * @brief Dump all the particles to a file
- */
-void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
-
-  FILE *file = fopen(fileName, "w");
-
-  fprintf(file,
-          "# ID  rho  rho_dh  wcount  wcount_dh  div_v  curl_v:[x y z]\n");
-
-  for (size_t pid = 0; pid < ci->count; pid++) {
-    fprintf(file, "%6llu %f %f %f %f %f %f %f %f\n", ci->parts[pid].id,
-            ci->parts[pid].rho, ci->parts[pid].rho_dh,
-            ci->parts[pid].density.wcount, ci->parts[pid].density.wcount_dh,
-            ci->parts[pid].div_v, ci->parts[pid].density.rot_v[0],
-            ci->parts[pid].density.rot_v[1], ci->parts[pid].density.rot_v[2]);
-  }
-
-  fprintf(file, "# -----------------------------------\n");
-
-  for (size_t pjd = 0; pjd < cj->count; pjd++) {
-    fprintf(file, "%6llu %f %f %f %f %f %f %f %f\n", cj->parts[pjd].id,
-            cj->parts[pjd].rho, cj->parts[pjd].rho_dh,
-            cj->parts[pjd].density.wcount, cj->parts[pjd].density.wcount_dh,
-            cj->parts[pjd].div_v, cj->parts[pjd].density.rot_v[0],
-            cj->parts[pjd].density.rot_v[1], cj->parts[pjd].density.rot_v[2]);
-  }
-
-  fclose(file);
-}
-
-/* Just a forward declaration... */
-void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
-
-int main(int argc, char *argv[]) {
-  size_t particles = 0, runs = 0, volume, type = 0;
-  double offset[3] = {0, 0, 0}, h = 1.1255;  // * DIM/PARTS_PER_AXIS == * 1
-  struct cell *ci, *cj;
-  struct space space;
-  struct engine engine;
-  struct runner runner;
-  char c;
-  static unsigned long long partId = 0;
-  ticks tic, toc, time;
-
-  while ((c = getopt(argc, argv, "h:p:r:t:")) != -1) {
-    switch (c) {
-      case 'h':
-        sscanf(optarg, "%lf", &h);
-        break;
-      case 'p':
-        sscanf(optarg, "%zu", &particles);
-        break;
-      case 'r':
-        sscanf(optarg, "%zu", &runs);
-        break;
-      case 't':
-        sscanf(optarg, "%zu", &type);
-        break;
-    }
-  }
-
-  if (h < 0 || particles == 0 || runs == 0 || type > 2) {
-    printf(
-        "\nUsage: %s -p PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n"
-        "\nGenerates a cell pair, filled with particles on a Cartesian grid."
-        "\nThese are then interacted using runner_dopair1_density."
-        "\n\nOptions:"
-        "\n-t TYPE=0          - cells share face (0), edge (1) or corner (2)"
-        "\n-h DISTANCE=1.1255 - smoothing length\n",
-        argv[0]);
-    exit(1);
-  }
-
-  volume = particles * particles * particles;
-  message("particles: %zu B\npositions: 0 B", 2 * volume * sizeof(struct part));
-
-  ci = make_cell(particles, offset, h, &partId);
-  for (size_t i = 0; i < type + 1; ++i) offset[i] = particles;
-  cj = make_cell(particles, offset, h, &partId);
-
-  for (int i = 0; i < 3; ++i) {
-    space.h_max = h;
-    space.dt_step = 0.1;
-  }
-
-  engine.s = &space;
-  engine.time = 0.1f;
-  runner.e = &engine;
-
-  time = 0;
-  for (size_t i = 0; i < runs; ++i) {
-
-    /* Zero the fields */
-    zero_particle_fields(ci);
-    zero_particle_fields(cj);
-
-    tic = getticks();
-
-    /* Run the test */
-    runner_dopair1_density(&runner, ci, cj);
-
-    toc = getticks();
-    time += toc - tic;
-
-    /* Dump if necessary */
-    if (i % 50 == 0) dump_particle_fields("swift_dopair.dat", ci, cj);
-  }
-
-  /* Output timing */
-  message("SWIFT calculation took       %lli ticks.", time / runs);
-
-  /* Now perform a brute-force version for accuracy tests */
-
-  /* Zero the fields */
-  zero_particle_fields(ci);
-  zero_particle_fields(cj);
-
-  tic = getticks();
-
-  /* Run the test */
-  pairs_all_density(&runner, ci, cj);
-
-  toc = getticks();
-
-  /* Dump */
-  dump_particle_fields("brute_force.dat", ci, cj);
-
-  /* Output timing */
-  message("Brute force calculation took %lli ticks.", toc - tic);
-
-  /* Clean things to make the sanitizer happy ... */
-  clean_up(ci);
-  clean_up(cj);
-
-  return 0;
-}
diff --git a/tests/tolerance.dat b/tests/tolerance.dat
new file mode 100644
index 0000000000000000000000000000000000000000..48de4383eab6214812183be25d3036a324ccbc27
--- /dev/null
+++ b/tests/tolerance.dat
@@ -0,0 +1,4 @@
+#   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
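+# Row 1: absolute tolerance per column; row 2: relative tolerance (read by difffloat.py).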
+    0       1e-6       1e-6       1e-6       1e-6       1e-6       1e-6       1e-5       1e-5       2e-5       3e-4       1e-5       1e-5       1e-5       1e-5
+    0       1e-6       1e-6       1e-6       1e-6       1e-6       1e-6       1e-5       1.2e-5     1e-5       1e-5       1e-4       1e-4       1e-4       1e-4