diff --git a/examples/main.c b/examples/main.c
index 43b56291c4eb023238c39556d536ab88c3daad0c..f770265c1b59dff93513a59ff7246258539325e3 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -362,8 +362,8 @@ int main(int argc, char *argv[]) {
   if (myrank == 0) clocks_gettime(&tic);
 #if defined(WITH_MPI)
 #if defined(HAVE_PARALLEL_HDF5)
-  read_ic_parallel(ICfileName, dim, &parts, &Ngas, &periodic, myrank, nr_nodes,
-                   MPI_COMM_WORLD, MPI_INFO_NULL);
+  read_ic_parallel(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic,
+                   myrank, nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL);
 #else
   read_ic_serial(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic,
                  myrank, nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL);
@@ -396,8 +396,8 @@ int main(int argc, char *argv[]) {
   /* MATTHIEU: Temporary fix to preserve master */
   if (!with_gravity) {
     free(gparts);
-    for(size_t k = 0; k < Ngas; ++k)
-      parts[k].gpart = NULL;
+    gparts = NULL;
+    for (size_t k = 0; k < Ngas; ++k) parts[k].gpart = NULL;
     Ngpart = 0;
 #if defined(WITH_MPI)
     N_long[0] = Ngas;
diff --git a/src/cell.c b/src/cell.c
index df11782048dfa80c697f53feefe8fabc104eb23b..b0e299999e9073cc14d92b769b3e157f50d64c8f 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -89,14 +89,18 @@ int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) {
   c->ti_end_min = pc->ti_end_min;
   c->ti_end_max = pc->ti_end_max;
   c->count = pc->count;
+  c->gcount = pc->gcount;
   c->tag = pc->tag;
 
-  /* Fill the progeny recursively, depth-first. */
+  /* Number of new cells created. */
   int count = 1;
+
+  /* Fill the progeny recursively, depth-first. */
   for (int k = 0; k < 8; k++)
     if (pc->progeny[k] >= 0) {
       struct cell *temp = space_getcell(s);
       temp->count = 0;
+      temp->gcount = 0;
       temp->loc[0] = c->loc[0];
       temp->loc[1] = c->loc[1];
       temp->loc[2] = c->loc[2];
@@ -122,7 +126,7 @@ int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) {
 }
 
 /**
- * @brief Link the cells recursively to the given part array.
+ * @brief Link the cells recursively to the given #part array.
  *
  * @param c The #cell.
  * @param parts The #part array.
@@ -130,7 +134,7 @@ int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) {
  * @return The number of particles linked.
  */
 
-int cell_link(struct cell *c, struct part *parts) {
+int cell_link_parts(struct cell *c, struct part *parts) {
 
   c->parts = parts;
 
@@ -139,14 +143,40 @@ int cell_link(struct cell *c, struct part *parts) {
     int offset = 0;
     for (int k = 0; k < 8; k++) {
       if (c->progeny[k] != NULL)
-        offset += cell_link(c->progeny[k], &parts[offset]);
+        offset += cell_link_parts(c->progeny[k], &parts[offset]);
     }
   }
 
-  /* Return the total number of unpacked cells. */
+  /* Return the total number of linked particles. */
   return c->count;
 }
 
+/**
+ * @brief Link the cells recursively to the given #gpart array.
+ *
+ * @param c The #cell.
+ * @param gparts The #gpart array.
+ *
+ * @return The number of particles linked.
+ */
+
+int cell_link_gparts(struct cell *c, struct gpart *gparts) {
+
+  c->gparts = gparts;
+
+  /* Fill the progeny recursively, depth-first. */
+  if (c->split) {
+    int offset = 0;
+    for (int k = 0; k < 8; k++) {
+      if (c->progeny[k] != NULL)
+        offset += cell_link_gparts(c->progeny[k], &gparts[offset]);
+    }
+  }
+
+  /* Return the total number of linked particles. */
+  return c->gcount;
+}
+
 /**
  * @brief Pack the data of the given cell and all it's sub-cells.
  *
@@ -164,6 +194,7 @@ int cell_pack(struct cell *c, struct pcell *pc) {
   pc->ti_end_min = c->ti_end_min;
   pc->ti_end_max = c->ti_end_max;
   pc->count = c->count;
+  pc->gcount = c->gcount;
   c->tag = pc->tag = atomic_inc(&cell_next_tag) % cell_max_tag;
 
   /* Fill in the progeny, depth-first recursion. */
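
The two linking routines above rely on one invariant: a cell's particle pointer marks the start of a contiguous, cell-ordered slice of the global array, and each child is handed the sub-slice that begins where its earlier siblings' slices end. The following minimal, self-contained analogue (a toy stand-in, not SWIFT's real struct cell) makes that recursion explicit:

#include <stddef.h>

/* Toy stand-in for SWIFT's cell; only the fields needed for the linking
   pattern are reproduced here. */
struct toy_cell {
  int count;                   /* particles in this cell, including progeny */
  int split;                   /* non-zero if the cell has children */
  struct toy_cell *progeny[8]; /* NULL where a child does not exist */
  double *parts;               /* start of this cell's slice of the array */
};

/* Same shape as cell_link_parts()/cell_link_gparts(): attach the slice,
   then hand each child the portion starting after its earlier siblings.
   Returning the cell's own count lets the parent advance its offset. */
int toy_link(struct toy_cell *c, double *parts) {
  c->parts = parts;
  if (c->split) {
    int offset = 0;
    for (int k = 0; k < 8; k++)
      if (c->progeny[k] != NULL)
        offset += toy_link(c->progeny[k], &parts[offset]);
  }
  return c->count;
}
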
diff --git a/src/cell.h b/src/cell.h
index b0451b311fda9c300427da6b3a9a25955090d799..05b870b144dd425ee9e72ee3b529122b6e721523 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -44,7 +44,7 @@ struct pcell {
   int ti_end_min, ti_end_max;
 
   /* Number of particles in this cell. */
-  int count;
+  int count, gcount;
 
   /* tag used for MPI communication. */
   int tag;
@@ -175,7 +175,8 @@ void cell_gunlocktree(struct cell *c);
 int cell_pack(struct cell *c, struct pcell *pc);
 int cell_unpack(struct pcell *pc, struct cell *c, struct space *s);
 int cell_getsize(struct cell *c);
-int cell_link(struct cell *c, struct part *parts);
+int cell_link_parts(struct cell *c, struct part *parts);
+int cell_link_gparts(struct cell *c, struct gpart *gparts);
 void cell_init_parts(struct cell *c, void *data);
 void cell_convert_hydro(struct cell *c, void *data);
 void cell_clean_links(struct cell *c, void *data);
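
Taken together, the new pcell gcount field and the two link functions support shipping a cell hierarchy to another rank: pack, transfer, unpack, then attach the particle buffers. The sketch below shows that sequence only; the wrapper name is invented, the actual call sites are in engine_exchange_cells() (partially visible later in this diff), and buffer sizing plus the MPI transfer itself are elided:

#include "cell.h"

/* Illustrative only: the assumed pack/unpack/link sequence for one cell. */
void exchange_one_cell_sketch(struct cell *local, struct cell *foreign,
                              struct space *s, struct pcell *buf,
                              struct part *parts_foreign,
                              struct gpart *gparts_foreign) {

  /* Sender: flatten the tree (count and gcount included) into pcells. */
  const int packed = cell_pack(local, buf);
  (void)packed; /* ...ship buf over MPI... */

  /* Receiver: rebuild the tree, then attach the particle buffers. Both
     link calls rely on pcell::count and pcell::gcount being transported. */
  cell_unpack(buf, foreign, s);
  cell_link_parts(foreign, parts_foreign);
  cell_link_gparts(foreign, gparts_foreign);
}
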
diff --git a/src/common_io.c b/src/common_io.c
index 4be334ccd602c7b0c450f5dbe4aedd9155b35ff1..6183effe9ce392ab930c581cbd118f025bbce773 100644
--- a/src/common_io.c
+++ b/src/common_io.c
@@ -45,6 +45,9 @@
 #include "kernel.h"
 #include "version.h"
 
+const char* particle_type_names[NUM_PARTICLE_TYPES] = {
+    "Gas", "DM", "Boundary", "Dummy", "Star", "BH"};
+
 /**
  * @brief Converts a C data type to the HDF5 equivalent.
  *
@@ -402,52 +405,68 @@ void createXMFfile() {
  *snapshot
  *
  * @param xmfFile The file to write in.
- * @param Nparts The number of particles.
  * @param hdfFileName The name of the HDF5 file corresponding to this output.
  * @param time The current simulation time.
  */
-void writeXMFheader(FILE* xmfFile, long long Nparts, char* hdfFileName,
-                    float time) {
+void writeXMFoutputheader(FILE* xmfFile, char* hdfFileName, float time) {
   /* Write end of file */
 
+  fprintf(xmfFile, "<!-- XMF description for file: %s -->\n", hdfFileName);
   fprintf(xmfFile,
           "<Grid GridType=\"Collection\" CollectionType=\"Spatial\">\n");
   fprintf(xmfFile, "<Time Type=\"Single\" Value=\"%f\"/>\n", time);
-  fprintf(xmfFile, "<Grid Name=\"Gas\" GridType=\"Uniform\">\n");
-  fprintf(xmfFile,
-          "<Topology TopologyType=\"Polyvertex\" Dimensions=\"%lld\"/>\n",
-          Nparts);
-  fprintf(xmfFile, "<Geometry GeometryType=\"XYZ\">\n");
-  fprintf(xmfFile,
-          "<DataItem Dimensions=\"%lld 3\" NumberType=\"Double\" "
-          "Precision=\"8\" "
-          "Format=\"HDF\">%s:/PartType0/Coordinates</DataItem>\n",
-          Nparts, hdfFileName);
-  fprintf(xmfFile, "</Geometry>");
 }
 
 /**
  * @brief Writes the end of the XMF file (closes all open markups)
  *
  * @param xmfFile The file to write in.
+ * @param output The number of this output.
+ * @param time The current simulation time.
  */
-void writeXMFfooter(FILE* xmfFile) {
+void writeXMFoutputfooter(FILE* xmfFile, int output, float time) {
   /* Write end of the section of this time step */
 
-  fprintf(xmfFile, "\n</Grid>\n");
-  fprintf(xmfFile, "</Grid>\n");
-  fprintf(xmfFile, "\n</Grid>\n");
+  fprintf(xmfFile,
+          "\n</Grid> <!-- End of meta-data for output=%03i, time=%f -->\n",
+          output, time);
+  fprintf(xmfFile, "\n</Grid> <!-- timeSeries -->\n");
   fprintf(xmfFile, "</Domain>\n");
   fprintf(xmfFile, "</Xdmf>\n");
 
   fclose(xmfFile);
 }
 
+void writeXMFgroupheader(FILE* xmfFile, char* hdfFileName, size_t N,
+                         enum PARTICLE_TYPE ptype) {
+  fprintf(xmfFile, "\n<Grid Name=\"%s\" GridType=\"Uniform\">\n",
+          particle_type_names[ptype]);
+  fprintf(xmfFile,
+          "<Topology TopologyType=\"Polyvertex\" Dimensions=\"%zi\"/>\n", N);
+  fprintf(xmfFile, "<Geometry GeometryType=\"XYZ\">\n");
+  fprintf(xmfFile,
+          "<DataItem Dimensions=\"%zi 3\" NumberType=\"Double\" "
+          "Precision=\"8\" "
+          "Format=\"HDF\">%s:/PartType%d/Coordinates</DataItem>\n",
+          N, hdfFileName, ptype);
+  fprintf(xmfFile,
+          "</Geometry>\n <!-- Done geometry for %s, start of particle fields "
+          "list -->\n",
+          particle_type_names[ptype]);
+}
+
+void writeXMFgroupfooter(FILE* xmfFile, enum PARTICLE_TYPE ptype) {
+  fprintf(xmfFile, "</Grid> <!-- End of meta-data for parttype=%s -->\n",
+          particle_type_names[ptype]);
+}
+
 /**
  * @brief Writes the lines corresponding to an array of the HDF5 output
  *
  * @param xmfFile The file in which to write
  * @param fileName The name of the HDF5 file associated to this XMF descriptor.
+ * @param partTypeGroupName The name of the group containing the particles in
+ *        the HDF5 file.
  * @param name The name of the array in the HDF5 file.
  * @param N The number of particles.
  * @param dim The dimension of the quantity (1 for scalars, 3 for vectors).
@@ -455,21 +474,21 @@ void writeXMFfooter(FILE* xmfFile) {
  *
  * @todo Treat the types in a better way.
  */
-void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N,
-                  int dim, enum DATA_TYPE type) {
+void writeXMFline(FILE* xmfFile, char* fileName, char* partTypeGroupName,
+                  char* name, size_t N, int dim, enum DATA_TYPE type) {
   fprintf(xmfFile,
           "<Attribute Name=\"%s\" AttributeType=\"%s\" Center=\"Node\">\n",
           name, dim == 1 ? "Scalar" : "Vector");
   if (dim == 1)
     fprintf(xmfFile,
-            "<DataItem Dimensions=\"%lld\" NumberType=\"Double\" "
-            "Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n",
-            N, type == FLOAT ? 4 : 8, fileName, name);
+            "<DataItem Dimensions=\"%zi\" NumberType=\"Double\" "
+            "Precision=\"%d\" Format=\"HDF\">%s:%s/%s</DataItem>\n",
+            N, type == FLOAT ? 4 : 8, fileName, partTypeGroupName, name);
   else
     fprintf(xmfFile,
-            "<DataItem Dimensions=\"%lld %d\" NumberType=\"Double\" "
-            "Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n",
-            N, dim, type == FLOAT ? 4 : 8, fileName, name);
+            "<DataItem Dimensions=\"%zi %d\" NumberType=\"Double\" "
+            "Precision=\"%d\" Format=\"HDF\">%s:%s/%s</DataItem>\n",
+            N, dim, type == FLOAT ? 4 : 8, fileName, partTypeGroupName, name);
   fprintf(xmfFile, "</Attribute>\n");
 }
 
@@ -483,7 +502,7 @@ void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N,
  * @param gparts The array of #gpart freshly read in.
  * @param Ndm The number of DM particles read in.
  */
-void prepare_dm_gparts(struct gpart* gparts, size_t Ndm) {
+void prepare_dm_gparts(struct gpart* const gparts, size_t Ndm) {
 
   /* Let's give all these gparts a negative id */
   for (size_t i = 0; i < Ndm; ++i) {
@@ -508,8 +527,9 @@ void prepare_dm_gparts(struct gpart* gparts, size_t Ndm) {
  * @param Ngas The number of gas particles read in.
  * @param Ndm The number of DM particles read in.
  */
-void duplicate_hydro_gparts(struct part* parts, struct gpart* gparts,
-                            size_t Ngas, size_t Ndm) {
+void duplicate_hydro_gparts(struct part* const parts,
+                            struct gpart* const gparts, size_t Ngas,
+                            size_t Ndm) {
 
   for (size_t i = 0; i < Ngas; ++i) {
 
@@ -538,16 +558,19 @@ void duplicate_hydro_gparts(struct part* parts, struct gpart* gparts,
  * @param dmparts The array of #gpart containg DM particles to be filled.
  * @param Ndm The number of DM particles.
  */
-void collect_dm_gparts(struct gpart* gparts, size_t Ntot, struct gpart* dmparts,
-                       size_t Ndm) {
+void collect_dm_gparts(const struct gpart* const gparts, size_t Ntot,
+                       struct gpart* const dmparts, size_t Ndm) {
 
   size_t count = 0;
 
   /* Loop over all gparts */
   for (size_t i = 0; i < Ntot; ++i) {
 
+    /* message("i=%zd count=%zd id=%lld part=%p", i, count, gparts[i].id,
+     * gparts[i].part); */
+
     /* And collect the DM ones */
-    if (gparts[i].id < 0) {
+    if (gparts[i].id < 0LL) {
       memcpy(&dmparts[count], &gparts[i], sizeof(struct gpart));
       dmparts[count].id = -dmparts[count].id;
       count++;
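
The XMF writer is now split into per-output and per-particle-type pieces. A hedged sketch of the expected call order follows; the real caller is the snapshot-writing code (not part of this diff), the wrapper name is invented, and the fields written per type are purely illustrative:

#include <stdio.h>

#include "common_io.h"

/* Illustrative only: XMF description for one snapshot holding N[ptype]
   particles of each type. */
void write_xmf_for_snapshot_sketch(char *hdfFileName, int outputCount,
                                   float time, size_t *N) {

  FILE *xmfFile = prepareXMFfile(); /* Obtain the .xmf file handle. */
  writeXMFoutputheader(xmfFile, hdfFileName, time);

  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
    if (N[ptype] == 0) continue;

    char groupName[PARTICLE_GROUP_BUFFER_SIZE];
    snprintf(groupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d", ptype);

    writeXMFgroupheader(xmfFile, hdfFileName, N[ptype],
                        (enum PARTICLE_TYPE)ptype);

    /* One line per array actually written to this HDF5 group. */
    writeXMFline(xmfFile, hdfFileName, groupName, "Velocities", N[ptype], 3,
                 FLOAT);
    writeXMFline(xmfFile, hdfFileName, groupName, "Masses", N[ptype], 1,
                 FLOAT);

    writeXMFgroupfooter(xmfFile, (enum PARTICLE_TYPE)ptype);
  }

  writeXMFoutputfooter(xmfFile, outputCount, time); /* Also closes the file. */
}
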
diff --git a/src/common_io.h b/src/common_io.h
index 426fa6a01ec87a4413eceaaeb0d0880cbef8a214..961f40e63d771e5e06ade525301caf59aae0bceb 100644
--- a/src/common_io.h
+++ b/src/common_io.h
@@ -70,17 +70,20 @@ enum PARTICLE_TYPE {
   NUM_PARTICLE_TYPES
 };
 
+extern const char* particle_type_names[];
+
 #define FILENAME_BUFFER_SIZE 150
 #define PARTICLE_GROUP_BUFFER_SIZE 20
 
 hid_t hdf5Type(enum DATA_TYPE type);
 size_t sizeOfType(enum DATA_TYPE type);
 
-void collect_dm_gparts(struct gpart* gparts, size_t Ntot, struct gpart* dmparts,
-                       size_t Ndm);
-void prepare_dm_gparts(struct gpart* gparts, size_t Ndm);
-void duplicate_hydro_gparts(struct part* parts, struct gpart* gparts,
-                            size_t Ngas, size_t Ndm);
+void collect_dm_gparts(const struct gpart* const gparts, size_t Ntot,
+                       struct gpart* const dmparts, size_t Ndm);
+void prepare_dm_gparts(struct gpart* const gparts, size_t Ndm);
+void duplicate_hydro_gparts(struct part* const parts,
+                            struct gpart* const gparts, size_t Ngas,
+                            size_t Ndm);
 
 void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data);
 
@@ -95,10 +98,13 @@ void writeAttribute_s(hid_t grp, char* name, const char* str);
 
 void createXMFfile();
 FILE* prepareXMFfile();
-void writeXMFfooter(FILE* xmfFile);
-void writeXMFheader(FILE* xmfFile, long long N, char* hdfFileName, float time);
-void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N,
-                  int dim, enum DATA_TYPE type);
+void writeXMFoutputheader(FILE* xmfFile, char* hdfFileName, float time);
+void writeXMFoutputfooter(FILE* xmfFile, int outputCount, float time);
+void writeXMFgroupheader(FILE* xmfFile, char* hdfFileName, size_t N,
+                         enum PARTICLE_TYPE ptype);
+void writeXMFgroupfooter(FILE* xmfFile, enum PARTICLE_TYPE ptype);
+void writeXMFline(FILE* xmfFile, char* fileName, char* partTypeGroupName,
+                  char* name, size_t N, int dim, enum DATA_TYPE type);
 
 void writeCodeDescription(hid_t h_file);
 void writeSPHflavour(hid_t h_file);
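
The three gpart helpers declared above rely on a simple convention, visible in their definitions earlier in this diff: DM gparts are flagged with a negative id, while gparts duplicated from hydro particles carry a non-negative id and a two-way link to their #part. The sketch below shows the assumed call order only; the real call sites are the IC readers and snapshot writers (not shown here), the wrapper name is invented, and the layout (DM read first, gas duplicates appended) is inferred from the argument lists rather than stated in this diff:

#include <stddef.h>

#include "common_io.h"

/* Illustrative only: assumed life cycle of the gpart helpers. dmparts must
   be a caller-allocated buffer with room for Ndm entries. */
void gpart_prep_sketch(struct part *const parts, struct gpart *const gparts,
                       struct gpart *const dmparts, size_t Ngas, size_t Ndm) {

  /* 1) Flag the freshly read DM particles by flipping their ids negative. */
  prepare_dm_gparts(gparts, Ndm);

  /* 2) Create one gpart per gas part and link the two arrays both ways;
        the duplicates carry a non-negative id (assumption). */
  duplicate_hydro_gparts(parts, gparts, Ngas, Ndm);

  /* 3) At output time, extract the DM particles (id < 0) into a contiguous
        buffer; the copies get their positive ids restored. */
  collect_dm_gparts(gparts, Ngas + Ndm, dmparts, Ndm);
}
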
diff --git a/src/engine.c b/src/engine.c
index 2d3a5337433280a2af736351f3b0611b01a8d2a0..28d75bba5378b83d4d0c8f6bf5c7705b40f6edb8 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -139,39 +139,56 @@ void engine_make_ghost_tasks(struct engine *e, struct cell *c,
  * @brief Redistribute the particles amongst the nodes according
  *      to their cell's node IDs.
  *
+ * The strategy here is as follows:
+ * 1) Each node counts the number of particles it has to send to each other
+ *    node.
+ * 2) These counts (for each particle type) are then exchanged between all
+ *    nodes.
+ * 3) The particles to send are sorted by destination node, with the
+ *    part-gpart links preserved as indices within each destination block.
+ * 4) Each node allocates enough space for the particles it will hold after
+ *    the exchange.
+ * 5) (Asynchronous) communications are issued to transfer the data.
+ *
+ *
  * @param e The #engine.
  */
-
 void engine_redistribute(struct engine *e) {
 
 #ifdef WITH_MPI
 
-  int nr_nodes = e->nr_nodes, nodeID = e->nodeID;
+  const int nr_nodes = e->nr_nodes;
+  const int nodeID = e->nodeID;
   struct space *s = e->s;
-  int my_cells = 0;
-  int *cdim = s->cdim;
   struct cell *cells = s->cells;
-  int nr_cells = s->nr_cells;
+  const int nr_cells = s->nr_cells;
+  const int *cdim = s->cdim;
+  const double ih[3] = {s->ih[0], s->ih[1], s->ih[2]};
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
+  struct part *parts = s->parts;
+  struct xpart *xparts = s->xparts;
+  struct gpart *gparts = s->gparts;
   ticks tic = getticks();
 
-  /* Start by sorting the particles according to their nodes and
-     getting the counts. The counts array is indexed as
-     count[from * nr_nodes + to]. */
-  int *counts;
-  size_t *dest;
-  double ih[3], dim[3];
-  ih[0] = s->ih[0];
-  ih[1] = s->ih[1];
-  ih[2] = s->ih[2];
-  dim[0] = s->dim[0];
-  dim[1] = s->dim[1];
-  dim[2] = s->dim[2];
-  if ((counts = (int *)malloc(sizeof(int) *nr_nodes *nr_nodes)) == NULL ||
-      (dest = (size_t *)malloc(sizeof(size_t) * s->nr_parts)) == NULL)
-    error("Failed to allocate count and dest buffers.");
+  /* Allocate temporary arrays to store the counts of particles to be sent
+     and the destination of each particle */
+  int *counts, *g_counts;
+  if ((counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
+    error("Failed to allocate count temporary buffer.");
+  if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
+    error("Failed to allocate gcount temporary buffer.");
   bzero(counts, sizeof(int) * nr_nodes * nr_nodes);
-  struct part *parts = s->parts;
+  bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);
+
+  /* Allocate the destination index arrays. */
+  int *dest, *g_dest;
+  if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
+    error("Failed to allocate dest temporary buffer.");
+  if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
+    error("Failed to allocate g_dest temporary buffer.");
+
+  /* Get destination of each particle */
   for (size_t k = 0; k < s->nr_parts; k++) {
+
+    /* Periodic boundary conditions */
     for (int j = 0; j < 3; j++) {
       if (parts[k].x[j] < 0.0)
         parts[k].x[j] += dim[j];
@@ -184,36 +201,121 @@ void engine_redistribute(struct engine *e) {
        error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].",
              cid, k, parts[k].x[0], parts[k].x[1], parts[k].x[2]); */
     dest[k] = cells[cid].nodeID;
+
+    /* The counts array is indexed as count[from * nr_nodes + to]. */
     counts[nodeID * nr_nodes + dest[k]] += 1;
   }
+
+  /* Sort the particles according to their cell index. */
   space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1, e->verbose);
 
+  /* We need to re-link the gpart partners of parts. */
+  int current_dest = dest[0];
+  size_t count_this_dest = 0;
+  for (size_t k = 0; k < s->nr_parts; ++k) {
+    if (s->parts[k].gpart != NULL) {
+
+      /* As the addresses will be invalidated by the communications, we will */
+      /* instead store the index relative to the start of the sub-array */
+      /* of particles to be sent to a given node. */
+      /* Recall that gparts without partners have a negative id. */
+      /* We will restore the pointers on the receiving node later on. */
+      if (dest[k] != current_dest) {
+        current_dest = dest[k];
+        count_this_dest = 0;
+      }
+
+      /* Debug */
+      /* if(s->parts[k].gpart->id < 0) */
+      /* 	error("Trying to link a partnerless gpart !"); */
+
+      s->parts[k].gpart->id = count_this_dest;
+      count_this_dest++;
+    }
+  }
+
+  /* Get destination of each g-particle */
+  for (size_t k = 0; k < s->nr_gparts; k++) {
+
+    /* Periodic boundary conditions */
+    for (int j = 0; j < 3; j++) {
+      if (gparts[k].x[j] < 0.0)
+        gparts[k].x[j] += dim[j];
+      else if (gparts[k].x[j] >= dim[j])
+        gparts[k].x[j] -= dim[j];
+    }
+    const int cid = cell_getid(cdim, gparts[k].x[0] * ih[0],
+                               gparts[k].x[1] * ih[1], gparts[k].x[2] * ih[2]);
+    /* if (cid < 0 || cid >= s->nr_cells)
+       error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].",
+             cid, k, g_parts[k].x[0], g_parts[k].x[1], g_parts[k].x[2]); */
+    g_dest[k] = cells[cid].nodeID;
+
+    /* The counts array is indexed as count[from * nr_nodes + to]. */
+    g_counts[nodeID * nr_nodes + g_dest[k]] += 1;
+  }
+
+  /* Sort the gparticles according to their cell index. */
+  space_gparts_sort(gparts, g_dest, s->nr_gparts, 0, nr_nodes - 1);
+
   /* Get all the counts from all the nodes. */
   if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
                     MPI_COMM_WORLD) != MPI_SUCCESS)
     error("Failed to allreduce particle transfer counts.");
 
-  /* Get the new number of parts for this node, be generous in allocating. */
-  size_t nr_parts = 0;
+  /* Get all the g_counts from all the nodes. */
+  if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
+                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
+    error("Failed to allreduce gparticle transfer counts.");
+
+  /* Each node knows how many parts and gparts will be transferred to every
+     other node. We can start preparing to receive data */
+
+  /* Get the new number of parts and gparts for this node */
+  size_t nr_parts = 0, nr_gparts = 0;
   for (int k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID];
+  for (int k = 0; k < nr_nodes; k++)
+    nr_gparts += g_counts[k * nr_nodes + nodeID];
+
+  /* Allocate the new arrays with some extra margin */
   struct part *parts_new = NULL;
-  struct xpart *xparts_new = NULL, *xparts = s->xparts;
+  struct xpart *xparts_new = NULL;
+  struct gpart *gparts_new = NULL;
   if (posix_memalign((void **)&parts_new, part_align,
-                     sizeof(struct part) * nr_parts * 1.2) != 0 ||
-      posix_memalign((void **)&xparts_new, part_align,
-                     sizeof(struct xpart) * nr_parts * 1.2) != 0)
+                     sizeof(struct part) * nr_parts *
+                         engine_redistribute_alloc_margin) != 0)
     error("Failed to allocate new part data.");
-
-  /* Emit the sends and recvs for the particle data. */
+  if (posix_memalign((void **)&xparts_new, xpart_align,
+                     sizeof(struct xpart) * nr_parts *
+                         engine_redistribute_alloc_margin) != 0)
+    error("Failed to allocate new xpart data.");
+  if (posix_memalign((void **)&gparts_new, gpart_align,
+                     sizeof(struct gpart) * nr_gparts *
+                         engine_redistribute_alloc_margin) != 0)
+    error("Failed to allocate new gpart data.");
+
+  /* Prepare MPI requests for the asynchronous communications */
   MPI_Request *reqs;
-  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 4 * nr_nodes)) ==
+  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 6 * nr_nodes)) ==
       NULL)
     error("Failed to allocate MPI request list.");
-  for (int k = 0; k < 4 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;
-  for (size_t offset_send = 0, offset_recv = 0, k = 0; k < nr_nodes; k++) {
-    int ind_send = nodeID * nr_nodes + k;
-    int ind_recv = k * nr_nodes + nodeID;
+  for (int k = 0; k < 6 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;
+
+  /* Emit the sends and recvs for the particle and gparticle data. */
+  size_t offset_send = 0, offset_recv = 0;
+  size_t g_offset_send = 0, g_offset_recv = 0;
+  for (int k = 0; k < nr_nodes; k++) {
+
+    /* Indices in the count arrays of the node of interest */
+    const int ind_send = nodeID * nr_nodes + k;
+    const int ind_recv = k * nr_nodes + nodeID;
+
+    /* Are we sending any part/xpart ? */
     if (counts[ind_send] > 0) {
+
+      /* message("Sending %d part to node %d", counts[ind_send], k); */
+
+      /* If the send is to the same node, just copy */
       if (k == nodeID) {
         memcpy(&parts_new[offset_recv], &s->parts[offset_send],
                sizeof(struct part) * counts[ind_recv]);
@@ -221,36 +323,73 @@ void engine_redistribute(struct engine *e) {
                sizeof(struct xpart) * counts[ind_recv]);
         offset_send += counts[ind_send];
         offset_recv += counts[ind_recv];
+
+        /* Else, emit some communications */
       } else {
-        if (MPI_Isend(&s->parts[offset_send], counts[ind_send],
-                      e->part_mpi_type, k, 2 * ind_send + 0, MPI_COMM_WORLD,
-                      &reqs[4 * k]) != MPI_SUCCESS)
-          error("Failed to isend parts to node %zi.", k);
-        if (MPI_Isend(&s->xparts[offset_send], counts[ind_send],
-                      e->xpart_mpi_type, k, 2 * ind_send + 1, MPI_COMM_WORLD,
-                      &reqs[4 * k + 1]) != MPI_SUCCESS)
-          error("Failed to isend xparts to node %zi.", k);
+        if (MPI_Isend(&s->parts[offset_send], counts[ind_send], part_mpi_type,
+                      k, 3 * ind_send + 0, MPI_COMM_WORLD,
+                      &reqs[6 * k]) != MPI_SUCCESS)
+          error("Failed to isend parts to node %i.", k);
+        if (MPI_Isend(&s->xparts[offset_send], counts[ind_send], xpart_mpi_type,
+                      k, 3 * ind_send + 1, MPI_COMM_WORLD,
+                      &reqs[6 * k + 1]) != MPI_SUCCESS)
+          error("Failed to isend xparts to node %i.", k);
         offset_send += counts[ind_send];
       }
     }
+
+    /* Are we sending any gpart ? */
+    if (g_counts[ind_send] > 0) {
+
+      /* message("Sending %d gpart to node %d", g_counts[ind_send], k); */
+
+      /* If the send is to the same node, just copy */
+      if (k == nodeID) {
+        memcpy(&gparts_new[g_offset_recv], &s->gparts[g_offset_send],
+               sizeof(struct gpart) * g_counts[ind_recv]);
+        g_offset_send += g_counts[ind_send];
+        g_offset_recv += g_counts[ind_recv];
+
+        /* Else, emit some communications */
+      } else {
+        if (MPI_Isend(&s->gparts[g_offset_send], g_counts[ind_send],
+                      gpart_mpi_type, k, 3 * ind_send + 2, MPI_COMM_WORLD,
+                      &reqs[6 * k + 2]) != MPI_SUCCESS)
+          error("Failed to isend gparts to node %i.", k);
+        g_offset_send += g_counts[ind_send];
+      }
+    }
+
+    /* Now emit the corresponding Irecv() */
+
+    /* Are we receiving any part/xpart from this node ? */
     if (k != nodeID && counts[ind_recv] > 0) {
-      if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], e->part_mpi_type,
-                    k, 2 * ind_recv + 0, MPI_COMM_WORLD,
-                    &reqs[4 * k + 2]) != MPI_SUCCESS)
-        error("Failed to emit irecv of parts from node %zi.", k);
-      if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv],
-                    e->xpart_mpi_type, k, 2 * ind_recv + 1, MPI_COMM_WORLD,
-                    &reqs[4 * k + 3]) != MPI_SUCCESS)
-        error("Failed to emit irecv of parts from node %zi.", k);
+      if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], part_mpi_type, k,
+                    3 * ind_recv + 0, MPI_COMM_WORLD,
+                    &reqs[6 * k + 3]) != MPI_SUCCESS)
+        error("Failed to emit irecv of parts from node %i.", k);
+      if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv], xpart_mpi_type,
+                    k, 3 * ind_recv + 1, MPI_COMM_WORLD,
+                    &reqs[6 * k + 4]) != MPI_SUCCESS)
+        error("Failed to emit irecv of xparts from node %i.", k);
       offset_recv += counts[ind_recv];
     }
+
+    /* Are we receiving any gpart from this node ? */
+    if (k != nodeID && g_counts[ind_recv] > 0) {
+      if (MPI_Irecv(&gparts_new[g_offset_recv], g_counts[ind_recv],
+                    gpart_mpi_type, k, 3 * ind_recv + 2, MPI_COMM_WORLD,
+                    &reqs[6 * k + 5]) != MPI_SUCCESS)
+        error("Failed to emit irecv of gparts from node %i.", k);
+      g_offset_recv += g_counts[ind_recv];
+    }
   }
 
   /* Wait for all the sends and recvs to tumble in. */
-  MPI_Status stats[4 * nr_nodes];
+  MPI_Status stats[6 * nr_nodes];
   int res;
-  if ((res = MPI_Waitall(4 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
-    for (int k = 0; k < 4 * nr_nodes; k++) {
+  if ((res = MPI_Waitall(6 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
+    for (int k = 0; k < 6 * nr_nodes; k++) {
       char buff[MPI_MAX_ERROR_STRING];
       int res;
       MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
@@ -259,33 +398,90 @@ void engine_redistribute(struct engine *e) {
     error("Failed during waitall for part data.");
   }
 
+  /* We now need to restore the part<->gpart links */
+  size_t offset_parts = 0, offset_gparts = 0;
+  for (int node = 0; node < nr_nodes; ++node) {
+
+    const int ind_recv = node * nr_nodes + nodeID;
+    const size_t count_parts = counts[ind_recv];
+    const size_t count_gparts = g_counts[ind_recv];
+
+    /* Loop over the gparts received from that node */
+    for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) {
+
+      /* Does this gpart have a partner ? */
+      if (gparts_new[k].id >= 0) {
+
+        const size_t partner_index = offset_parts + gparts_new[k].id;
+
+        /* Re-link */
+        gparts_new[k].part = &parts_new[partner_index];
+        gparts_new[k].part->gpart = &gparts_new[k];
+      }
+    }
+
+    offset_parts += count_parts;
+    offset_gparts += count_gparts;
+  }
+
   /* Verify that all parts are in the right place. */
   /* for ( int k = 0 ; k < nr_parts ; k++ ) {
-      int cid = cell_getid( cdim , parts_new[k].x[0]*ih[0], parts_new[k].x[1]*ih[1], parts_new[k].x[2]*ih[2] );
+      int cid = cell_getid( cdim , parts_new[k].x[0]*ih[0],
+    parts_new[k].x[1]*ih[1], parts_new[k].x[2]*ih[2] );
       if ( cells[ cid ].nodeID != nodeID )
-          error( "Received particle (%i) that does not belong here (nodeID=%i).", k , cells[ cid ].nodeID );
+          error( "Received particle (%i) that does not belong here
+    (nodeID=%i).", k , cells[ cid ].nodeID );
     } */
 
+  /* Verify that the links are correct */
+  /* MATTHIEU: To be commented out once we are happy */
+  for (size_t k = 0; k < nr_gparts; ++k) {
+
+    if (gparts_new[k].id >= 0) {
+
+      if (gparts_new[k].part->gpart != &gparts_new[k])
+        error("Linking problem !");
+
+      /* if (gparts_new[k].x[0] != gparts_new[k].part->x[0] || */
+      /*     gparts_new[k].x[1] != gparts_new[k].part->x[1] || */
+      /*     gparts_new[k].x[2] != gparts_new[k].part->x[2]) */
+      /*   error("Linked particles are not at the same position !"); */
+    }
+  }
+  for (size_t k = 0; k < nr_parts; ++k) {
+
+    if (parts_new[k].gpart != NULL) {
+
+      if (parts_new[k].gpart->part != &parts_new[k]) error("Linking problem !");
+    }
+  }
+
   /* Set the new part data, free the old. */
   free(parts);
   free(xparts);
+  free(gparts);
   s->parts = parts_new;
   s->xparts = xparts_new;
+  s->gparts = gparts_new;
   s->nr_parts = nr_parts;
-  s->size_parts = 1.2 * nr_parts;
+  s->nr_gparts = nr_gparts;
+  s->size_parts = engine_redistribute_alloc_margin * nr_parts;
+  s->size_gparts = engine_redistribute_alloc_margin * nr_gparts;
 
-  /* Be verbose about what just happened. */
-  for (int k = 0; k < nr_cells; k++)
-    if (cells[k].nodeID == nodeID) my_cells += 1;
-  if (e->verbose)
-    message("node %i now has %zi parts in %i cells.", nodeID, nr_parts,
-            my_cells);
-
-  /* Clean up other stuff. */
+  /* Clean up the temporary stuff. */
+  free(g_counts);
+  free(g_dest);
   free(reqs);
   free(counts);
   free(dest);
 
+  /* Be verbose about what just happened. */
+  if (e->verbose) {
+    int my_cells = 0;
+    for (int k = 0; k < nr_cells; k++)
+      if (cells[k].nodeID == nodeID) my_cells += 1;
+    message("node %i now has %zi parts and %zi gparts in %i cells.", nodeID,
+            nr_parts, nr_gparts, my_cells);
+  }
+
   if (e->verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
@@ -547,31 +743,40 @@ void engine_exchange_cells(struct engine *e) {
 
   /* Count the number of particles we need to import and re-allocate
      the buffer if needed. */
-  int count_in = 0;
+  int count_parts_in = 0, count_gparts_in = 0;
   for (int k = 0; k < nr_proxies; k++)
-    for (int j = 0; j < e->proxies[k].nr_cells_in; j++)
-      count_in += e->proxies[k].cells_in[j]->count;
-  if (count_in > s->size_parts_foreign) {
+    for (int j = 0; j < e->proxies[k].nr_cells_in; j++) {
+      count_parts_in += e->proxies[k].cells_in[j]->count;
+      count_gparts_in += e->proxies[k].cells_in[j]->gcount;
+    }
+  if (count_parts_in > s->size_parts_foreign) {
     if (s->parts_foreign != NULL) free(s->parts_foreign);
-    s->size_parts_foreign = 1.1 * count_in;
+    s->size_parts_foreign = 1.1 * count_parts_in;
     if (posix_memalign((void **)&s->parts_foreign, part_align,
                        sizeof(struct part) * s->size_parts_foreign) != 0)
       error("Failed to allocate foreign part data.");
   }
+  if (count_gparts_in > s->size_gparts_foreign) {
+    if (s->gparts_foreign != NULL) free(s->gparts_foreign);
+    s->size_gparts_foreign = 1.1 * count_gparts_in;
+    if (posix_memalign((void **)&s->gparts_foreign, gpart_align,
+                       sizeof(struct gpart) * s->size_gparts_foreign) != 0)
+      error("Failed to allocate foreign gpart data.");
+  }
 
   /* Unpack the cells and link to the particle data. */
   struct part *parts = s->parts_foreign;
+  struct gpart *gparts = s->gparts_foreign;
   for (int k = 0; k < nr_proxies; k++) {
     for (int j = 0; j < e->proxies[k].nr_cells_in; j++) {
-      cell_link(e->proxies[k].cells_in[j], parts);
+      cell_link_parts(e->proxies[k].cells_in[j], parts);
+      cell_link_gparts(e->proxies[k].cells_in[j], gparts);
       parts = &parts[e->proxies[k].cells_in[j]->count];
+      gparts = &gparts[e->proxies[k].cells_in[j]->gcount];
     }
   }
   s->nr_parts_foreign = parts - s->parts_foreign;
-
-  /* Is the parts buffer large enough? */
-  if (s->nr_parts_foreign > s->size_parts_foreign)
-    error("Foreign parts buffer too small.");
+  s->nr_gparts_foreign = gparts - s->gparts_foreign;
 
   /* Free the pcell buffer. */
   free(pcells);
@@ -589,16 +794,24 @@ void engine_exchange_cells(struct engine *e) {
  * @brief Exchange straying parts with other nodes.
  *
  * @param e The #engine.
- * @param offset The index in the parts array as of which the foreign parts
- *reside.
- * @param ind The ID of the foreign #cell.
- * @param N The number of stray parts.
+ * @param offset_parts The index in the parts array as of which the foreign
+ *        parts reside.
+ * @param ind_part The foreign #cell ID of each part.
+ * @param Npart The number of stray parts on entry; contains the number of
+ *        parts received on return.
+ * @param offset_gparts The index in the gparts array as of which the foreign
+ *        gparts reside.
+ * @param ind_gpart The foreign #cell ID of each gpart.
+ * @param Ngpart The number of stray gparts on entry; contains the number of
+ *        gparts received on return.
  *
- * @return The number of arrived parts copied to parts and xparts.
+ * Note that this function does not mess up the linkage between parts and
+ * gparts, i.e. the received particles have correct linkage.
  */
 
-int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
-                           size_t N) {
+void engine_exchange_strays(struct engine *e, size_t offset_parts,
+                            int *ind_part, size_t *Npart, size_t offset_gparts,
+                            int *ind_gpart, size_t *Ngpart) {
 
 #ifdef WITH_MPI
 
@@ -608,25 +821,49 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
   /* Re-set the proxies. */
   for (int k = 0; k < e->nr_proxies; k++) e->proxies[k].nr_parts_out = 0;
 
-  /* Put the parts into the corresponding proxies. */
-  for (size_t k = 0; k < N; k++) {
-    const int node_id = e->s->cells[ind[k]].nodeID;
+  /* Put the parts and gparts into the corresponding proxies. */
+  for (size_t k = 0; k < *Npart; k++) {
+    /* Get the target node and proxy ID. */
+    const int node_id = e->s->cells[ind_part[k]].nodeID;
     if (node_id < 0 || node_id >= e->nr_nodes)
       error("Bad node ID %i.", node_id);
     const int pid = e->proxy_ind[node_id];
-    if (pid < 0)
+    if (pid < 0) {
       error(
           "Do not have a proxy for the requested nodeID %i for part with "
           "id=%llu, x=[%e,%e,%e].",
-          node_id, s->parts[offset + k].id, s->parts[offset + k].x[0],
-          s->parts[offset + k].x[1], s->parts[offset + k].x[2]);
-    proxy_parts_load(&e->proxies[pid], &s->parts[offset + k],
-                     &s->xparts[offset + k], 1);
+          node_id, s->parts[offset_parts + k].id,
+          s->parts[offset_parts + k].x[0], s->parts[offset_parts + k].x[1],
+          s->parts[offset_parts + k].x[2]);
+    }
+
+    /* If the part has a gpart partner, temporarily store in the gpart's id
+       the position of the part in this proxy's outgoing buffer, so that the
+       link can be restored on the receiving side. */
+    if (s->parts[offset_parts + k].gpart != NULL) {
+      s->parts[offset_parts + k].gpart->id = e->proxies[pid].nr_parts_out;
+    }
+
+    /* Load the part and xpart into the proxy. */
+    proxy_parts_load(&e->proxies[pid], &s->parts[offset_parts + k],
+                     &s->xparts[offset_parts + k], 1);
+  }
+  for (size_t k = 0; k < *Ngpart; k++) {
+    const int node_id = e->s->cells[ind_gpart[k]].nodeID;
+    if (node_id < 0 || node_id >= e->nr_nodes)
+      error("Bad node ID %i.", node_id);
+    const int pid = e->proxy_ind[node_id];
+    if (pid < 0)
+      error(
+          "Do not have a proxy for the requested nodeID %i for part with "
+          "id=%lli, x=[%e,%e,%e].",
+          node_id, s->gparts[offset_parts + k].id,
+          s->gparts[offset_gparts + k].x[0], s->gparts[offset_parts + k].x[1],
+          s->gparts[offset_gparts + k].x[2]);
+    proxy_gparts_load(&e->proxies[pid], &s->gparts[offset_gparts + k], 1);
   }
 
   /* Launch the proxies. */
-  MPI_Request reqs_in[2 * engine_maxproxies];
-  MPI_Request reqs_out[2 * engine_maxproxies];
+  MPI_Request reqs_in[3 * engine_maxproxies];
+  MPI_Request reqs_out[3 * engine_maxproxies];
   for (int k = 0; k < e->nr_proxies; k++) {
     proxy_parts_exch1(&e->proxies[k]);
     reqs_in[k] = e->proxies[k].req_parts_count_in;
@@ -650,11 +887,18 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
 
   /* Count the total number of incoming particles and make sure we have
      enough space to accommodate them. */
-  size_t count_in = 0;
-  for (int k = 0; k < e->nr_proxies; k++) count_in += e->proxies[k].nr_parts_in;
-  if (e->verbose) message("sent out %zi particles, got %zi back.", N, count_in);
-  if (offset + count_in > s->size_parts) {
-    s->size_parts = (offset + count_in) * 1.05;
+  int count_parts_in = 0;
+  int count_gparts_in = 0;
+  for (int k = 0; k < e->nr_proxies; k++) {
+    count_parts_in += e->proxies[k].nr_parts_in;
+    count_gparts_in += e->proxies[k].nr_gparts_in;
+  }
+  if (e->verbose) {
+    message("sent out %zi/%zi parts/gparts, got %i/%i back.", *Npart, *Ngpart,
+            count_parts_in, count_gparts_in);
+  }
+  if (offset_parts + count_parts_in > s->size_parts) {
+    s->size_parts = (offset_parts + count_parts_in) * engine_parts_size_grow;
     struct part *parts_new = NULL;
     struct xpart *xparts_new = NULL;
     if (posix_memalign((void **)&parts_new, part_align,
@@ -662,37 +906,61 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
         posix_memalign((void **)&xparts_new, part_align,
                        sizeof(struct xpart) * s->size_parts) != 0)
       error("Failed to allocate new part data.");
-    memcpy(parts_new, s->parts, sizeof(struct part) * offset);
-    memcpy(xparts_new, s->xparts, sizeof(struct xpart) * offset);
+    memcpy(parts_new, s->parts, sizeof(struct part) * offset_parts);
+    memcpy(xparts_new, s->xparts, sizeof(struct xpart) * offset_parts);
     free(s->parts);
     free(s->xparts);
     s->parts = parts_new;
     s->xparts = xparts_new;
   }
+  if (offset_gparts + count_gparts_in > s->size_gparts) {
+    s->size_gparts = (offset_gparts + count_gparts_in) * engine_parts_size_grow;
+    struct gpart *gparts_new = NULL;
+    if (posix_memalign((void **)&gparts_new, gpart_align,
+                       sizeof(struct gpart) * s->size_gparts) != 0)
+      error("Failed to allocate new gpart data.");
+    memcpy(gparts_new, s->gparts, sizeof(struct gpart) * offset_gparts);
+    free(s->gparts);
+    s->gparts = gparts_new;
+  }
 
   /* Collect the requests for the particle data from the proxies. */
   int nr_in = 0, nr_out = 0;
   for (int k = 0; k < e->nr_proxies; k++) {
     if (e->proxies[k].nr_parts_in > 0) {
-      reqs_in[2 * k] = e->proxies[k].req_parts_in;
-      reqs_in[2 * k + 1] = e->proxies[k].req_xparts_in;
+      reqs_in[3 * k] = e->proxies[k].req_parts_in;
+      reqs_in[3 * k + 1] = e->proxies[k].req_xparts_in;
+      nr_in += 2;
+    } else {
+      reqs_in[3 * k] = reqs_in[3 * k + 1] = MPI_REQUEST_NULL;
+    }
+    if (e->proxies[k].nr_gparts_in > 0) {
+      reqs_in[3 * k + 2] = e->proxies[k].req_gparts_in;
       nr_in += 1;
-    } else
-      reqs_in[2 * k] = reqs_in[2 * k + 1] = MPI_REQUEST_NULL;
+    } else {
+      reqs_in[3 * k + 2] = MPI_REQUEST_NULL;
+    }
     if (e->proxies[k].nr_parts_out > 0) {
-      reqs_out[2 * k] = e->proxies[k].req_parts_out;
-      reqs_out[2 * k + 1] = e->proxies[k].req_xparts_out;
+      reqs_out[3 * k] = e->proxies[k].req_parts_out;
+      reqs_out[3 * k + 1] = e->proxies[k].req_xparts_out;
+      nr_out += 2;
+    } else {
+      reqs_out[3 * k] = reqs_out[3 * k + 1] = MPI_REQUEST_NULL;
+    }
+    if (e->proxies[k].nr_gparts_out > 0) {
+      reqs_out[3 * k + 2] = e->proxies[k].req_gparts_out;
       nr_out += 1;
-    } else
-      reqs_out[2 * k] = reqs_out[2 * k + 1] = MPI_REQUEST_NULL;
+    } else {
+      reqs_out[3 * k + 2] = MPI_REQUEST_NULL;
+    }
   }
 
   /* Wait for each part array to come in and collect the new
      parts from the proxies. */
-  size_t count = 0;
-  for (int k = 0; k < 2 * (nr_in + nr_out); k++) {
-    int err = 0, pid = MPI_UNDEFINED;
-    if ((err = MPI_Waitany(2 * e->nr_proxies, reqs_in, &pid,
+  int count_parts = 0, count_gparts = 0;
+  for (int k = 0; k < nr_in; k++) {
+    int err, pid;
+    if ((err = MPI_Waitany(3 * e->nr_proxies, reqs_in, &pid,
                            MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
       char buff[MPI_MAX_ERROR_STRING];
       int res;
@@ -700,26 +968,46 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
       error("MPI_Waitany failed (%s).", buff);
     }
     if (pid == MPI_UNDEFINED) break;
-    // message( "request from proxy %i has arrived." , pid );
-    if (reqs_in[pid & ~1] == MPI_REQUEST_NULL &&
-        reqs_in[pid | 1] == MPI_REQUEST_NULL) {
+    // message( "request from proxy %i has arrived." , pid / 3 );
+    pid = 3 * (pid / 3);
+
+    /* If all the requests for a given proxy have arrived... */
+    if (reqs_in[pid + 0] == MPI_REQUEST_NULL &&
+        reqs_in[pid + 1] == MPI_REQUEST_NULL &&
+        reqs_in[pid + 2] == MPI_REQUEST_NULL) {
+      /* Copy the particle data to the part/xpart/gpart arrays. */
-      struct proxy *p = &e->proxies[pid >> 1];
+      struct proxy *p = &e->proxies[pid / 3];
-      memcpy(&s->parts[offset + count], p->parts_in,
+      memcpy(&s->parts[offset_parts + count_parts], p->parts_in,
              sizeof(struct part) * p->nr_parts_in);
-      memcpy(&s->xparts[offset + count], p->xparts_in,
+      memcpy(&s->xparts[offset_parts + count_parts], p->xparts_in,
              sizeof(struct xpart) * p->nr_parts_in);
+      memcpy(&s->gparts[offset_gparts + count_gparts], p->gparts_in,
+             sizeof(struct gpart) * p->nr_gparts_in);
       /* for (int k = offset; k < offset + count; k++)
          message(
             "received particle %lli, x=[%.3e %.3e %.3e], h=%.3e, from node %i.",
             s->parts[k].id, s->parts[k].x[0], s->parts[k].x[1],
             s->parts[k].x[2], s->parts[k].h, p->nodeID); */
-      count += p->nr_parts_in;
+
+      /* Re-link the gparts. */
+      for (int k = 0; k < p->nr_gparts_in; k++) {
+        struct gpart *gp = &s->gparts[offset_gparts + count_gparts + k];
+        if (gp->id >= 0) {
+          struct part *part = &s->parts[offset_parts + count_parts + gp->id];
+          gp->part = part;
+          part->gpart = gp;
+        }
+      }
+
+      /* Advance the counters. */
+      count_parts += p->nr_parts_in;
+      count_gparts += p->nr_gparts_in;
     }
   }
 
   /* Wait for all the sends to have finished too. */
   if (nr_out > 0)
-    if (MPI_Waitall(2 * e->nr_proxies, reqs_out, MPI_STATUSES_IGNORE) !=
+    if (MPI_Waitall(3 * e->nr_proxies, reqs_out, MPI_STATUSES_IGNORE) !=
         MPI_SUCCESS)
       error("MPI_Waitall on sends failed.");
 
@@ -728,11 +1016,11 @@ int engine_exchange_strays(struct engine *e, int offset, size_t *ind,
             clocks_getunit());
 
   /* Return the number of harvested parts. */
-  return count;
+  *Npart = count_parts;
+  *Ngpart = count_gparts;
 
 #else
   error("SWIFT was not compiled with MPI support.");
-  return 0;
 #endif
 }
 
@@ -1924,7 +2212,7 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   engine_makeproxies(e);
 
   /* Re-allocate the local parts. */
-  if (e->nodeID == 0)
+  if (e->verbose)
     message("Re-allocating parts array from %zi to %zi.", s->size_parts,
             (size_t)(s->nr_parts * 1.2));
   s->size_parts = s->nr_parts * 1.2;
@@ -1932,7 +2220,7 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   struct xpart *xparts_new = NULL;
   if (posix_memalign((void **)&parts_new, part_align,
                      sizeof(struct part) * s->size_parts) != 0 ||
-      posix_memalign((void **)&xparts_new, part_align,
+      posix_memalign((void **)&xparts_new, xpart_align,
                      sizeof(struct xpart) * s->size_parts) != 0)
     error("Failed to allocate new part data.");
   memcpy(parts_new, s->parts, sizeof(struct part) * s->nr_parts);
@@ -1941,6 +2229,28 @@ void engine_split(struct engine *e, struct partition *initial_partition) {
   free(s->xparts);
   s->parts = parts_new;
   s->xparts = xparts_new;
+
+  /* Re-link the gparts. */
+  for (size_t k = 0; k < s->nr_parts; k++)
+    if (s->parts[k].gpart != NULL) s->parts[k].gpart->part = &s->parts[k];
+
+  /* Re-allocate the local gparts. */
+  if (e->verbose)
+    message("Re-allocating gparts array from %zi to %zi.", s->size_gparts,
+            (size_t)(s->nr_gparts * 1.2));
+  s->size_gparts = s->nr_gparts * 1.2;
+  struct gpart *gparts_new = NULL;
+  if (posix_memalign((void **)&gparts_new, gpart_align,
+                     sizeof(struct gpart) * s->size_gparts) != 0)
+    error("Failed to allocate new gpart data.");
+  memcpy(gparts_new, s->gparts, sizeof(struct gpart) * s->nr_gparts);
+  free(s->gparts);
+  s->gparts = gparts_new;
+
+  /* Re-link the parts. */
+  for (size_t k = 0; k < s->nr_gparts; k++)
+    if (s->gparts[k].id >= 0) s->gparts[k].part->gpart = &s->gparts[k];
+
 #else
   error("SWIFT was not compiled with MPI support.");
 #endif
@@ -2173,8 +2483,7 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
 
 /* Construct types for MPI communications */
 #ifdef WITH_MPI
-  part_create_mpi_type(&e->part_mpi_type);
-  xpart_create_mpi_type(&e->xpart_mpi_type);
+  part_create_mpi_types();
 #endif
 
   /* First of all, init the barrier and lock it. */
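
Much of the engine.c change revolves around the bookkeeping described in the engine_redistribute() strategy comment: each rank fills its own row of an nr_nodes x nr_nodes matrix, indexed as counts[from * nr_nodes + to], the matrix is summed across all ranks, and column nodeID then tells a rank how much it will receive from every sender. The stand-alone program below (not SWIFT code; the send counts are synthetic) demonstrates exactly that pattern:

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[]) {

  MPI_Init(&argc, &argv);
  int nr_nodes, nodeID;
  MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes);
  MPI_Comm_rank(MPI_COMM_WORLD, &nodeID);

  /* One full matrix per rank; only our own row is filled locally. */
  int *counts = calloc((size_t)nr_nodes * nr_nodes, sizeof(int));
  if (counts == NULL) MPI_Abort(MPI_COMM_WORLD, 1);

  /* Pretend this rank sends (nodeID + to) particles to every rank 'to';
     indexed as counts[from * nr_nodes + to], as in the patch. */
  for (int to = 0; to < nr_nodes; to++)
    counts[nodeID * nr_nodes + to] = nodeID + to;

  /* After the sum, every rank holds the complete matrix. */
  MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
                MPI_COMM_WORLD);

  /* Column nodeID: what this rank receives from each sender (itself
     included), i.e. the size to allocate before posting the Irecv()s. */
  size_t nr_recv = 0;
  for (int from = 0; from < nr_nodes; from++)
    nr_recv += counts[from * nr_nodes + nodeID];
  printf("rank %d will hold %zu particles after the exchange\n", nodeID,
         nr_recv);

  free(counts);
  MPI_Finalize();
  return 0;
}
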
diff --git a/src/engine.h b/src/engine.h
index 741ae1f553494e435394f529606b4cb794b0e3d2..4d1860b9eed0203bf9bf75711ec6e6549d837fe7 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -62,6 +62,8 @@ extern const char *engine_policy_names[];
 #define engine_maxtaskspercell 96
 #define engine_maxproxies 64
 #define engine_tasksreweight 10
+#define engine_parts_size_grow 1.05
+#define engine_redistribute_alloc_margin 1.2
 
 /* The rank of the engine as a global variable (for messages). */
 extern int engine_rank;
@@ -160,12 +162,6 @@ struct engine {
 
   /* Are we talkative ? */
   int verbose;
-
-#ifdef WITH_MPI
-  /* MPI data type for the particle transfers */
-  MPI_Datatype part_mpi_type;
-  MPI_Datatype xpart_mpi_type;
-#endif
 };
 
 /* Function prototypes. */
@@ -182,7 +178,9 @@ void engine_init_particles(struct engine *e);
 void engine_step(struct engine *e);
 void engine_maketasks(struct engine *e);
 void engine_split(struct engine *e, struct partition *initial_partition);
-int engine_exchange_strays(struct engine *e, int offset, size_t *ind, size_t N);
+void engine_exchange_strays(struct engine *e, size_t offset_parts,
+                            int *ind_part, size_t *Npart, size_t offset_gparts,
+                            int *ind_gpart, size_t *Ngpart);
 void engine_rebuild(struct engine *e);
 void engine_repartition(struct engine *e);
 void engine_makeproxies(struct engine *e);
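
engine_exchange_strays() now reports its results through the Npart and Ngpart arguments rather than a return value. A hedged sketch of a caller follows; the real caller sits in the space rebuild path, which is not part of this diff, and the wrapper name is invented:

#include <stddef.h>

#include "engine.h"

/* Illustrative only: Npart/Ngpart are in-out, holding the number of strays
   to send on entry and the number of particles received on return. */
void handle_strays_sketch(struct engine *e, size_t offset_parts, int *ind_part,
                          size_t nr_stray_parts, size_t offset_gparts,
                          int *ind_gpart, size_t nr_stray_gparts) {

  size_t Npart = nr_stray_parts;   /* in: stray parts to ship out */
  size_t Ngpart = nr_stray_gparts; /* in: stray gparts to ship out */

  engine_exchange_strays(e, offset_parts, ind_part, &Npart, offset_gparts,
                         ind_gpart, &Ngpart);

  /* out: Npart parts (and xparts) and Ngpart gparts now sit at
     offset_parts / offset_gparts, with their part<->gpart links already
     restored by the exchange. */
}
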
diff --git a/src/gravity/Default/gravity_io.h b/src/gravity/Default/gravity_io.h
index d707d69631e65eed8ad21a7fa9601c07d3c71263..129c4b39828ca73d2d80d79edbdaa8ec4d5a9e01 100644
--- a/src/gravity/Default/gravity_io.h
+++ b/src/gravity/Default/gravity_io.h
@@ -48,6 +48,8 @@ __attribute__((always_inline)) INLINE static void darkmatter_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unsued in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ *        the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param Ndm The number of DM particles on that MPI rank.
  * @param Ndm_total The total number of g-particles (only used in MPI mode)
@@ -59,17 +61,20 @@ __attribute__((always_inline)) INLINE static void darkmatter_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void darkmatter_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int Ndm, long long Ndm_total,
-    int mpi_rank, long long offset, struct gpart* gparts,
-    struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile,
+    int Ndm, long long Ndm_total, int mpi_rank, long long offset,
+    struct gpart* gparts, struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, Ndm, 3, gparts,
-             Ndm_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, Ndm, 1, gparts,
-             Ndm_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, Ndm, 3, gparts,
-             Ndm_total, mpi_rank, offset, v_full, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, Ndm, 1, gparts,
-             Ndm_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             Ndm, 3, gparts, Ndm_total, mpi_rank, offset, x, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, Ndm,
+             1, gparts, Ndm_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             Ndm, 3, gparts, Ndm_total, mpi_rank, offset, v_full, us,
+             UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, Ndm, 1, gparts, Ndm_total, mpi_rank, offset, id, us,
+             UNIT_CONV_NO_UNITS);
 }
diff --git a/src/hydro/Default/hydro_io.h b/src/hydro/Default/hydro_io.h
index 958bf5a1869718b57678246ff3b1985e54145824..0e9ad46ddc1d4e8c8d3ffdbf3e81262ec49a7092 100644
--- a/src/hydro/Default/hydro_io.h
+++ b/src/hydro/Default/hydro_io.h
@@ -56,6 +56,8 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unsued in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ *        the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param N The number of particles on that MPI rank.
  * @param N_total The total number of particles (only used in MPI mode)
@@ -67,26 +69,31 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void hydro_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int N, long long N_total,
-    int mpi_rank, long long offset, struct part* parts, struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile, int N,
+    long long N_total, int mpi_rank, long long offset, struct part* parts,
+    struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts,
-             N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts,
-             N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
-  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, a_hydro, us, UNIT_CONV_ACCELERATION);
-  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             N, 3, parts, N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, N, 1,
+             parts, N_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, u, us,
+             UNIT_CONV_ENERGY_PER_UNIT_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us,
+             UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Acceleration", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, a_hydro, us,
+             UNIT_CONV_ACCELERATION);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Density", FLOAT, N,
+             1, parts, N_total, mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
 }
 
 /**
diff --git a/src/hydro/Gadget2/hydro_io.h b/src/hydro/Gadget2/hydro_io.h
index 17c3d3013644c3572f3c26fc3e270b1c1bc465ed..c1c59dfa4980a2843e7e13bee4c964c9b254cae6 100644
--- a/src/hydro/Gadget2/hydro_io.h
+++ b/src/hydro/Gadget2/hydro_io.h
@@ -56,6 +56,8 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unsued in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ *        the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param N The number of particles on that MPI rank.
  * @param N_total The total number of particles (only used in MPI mode)
@@ -67,27 +69,31 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void hydro_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int N, long long N_total,
-    int mpi_rank, long long offset, struct part* parts, struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile, int N,
+    long long N_total, int mpi_rank, long long offset, struct part* parts,
+    struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts,
-             N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, entropy, us,
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             N, 3, parts, N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, N, 1,
+             parts, N_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, entropy, us,
              UNIT_CONV_ENTROPY_PER_UNIT_MASS);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts,
-             N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
-  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, a_hydro, us, UNIT_CONV_ACCELERATION);
-  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us,
+             UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Acceleration", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, a_hydro, us,
+             UNIT_CONV_ACCELERATION);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Density", FLOAT, N,
+             1, parts, N_total, mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
 }
 
 /**
diff --git a/src/hydro/Minimal/hydro_io.h b/src/hydro/Minimal/hydro_io.h
index 2c56fb489ab84ca7c30426b54cf95e26e3821084..afe5de83f423e43b4d2480cca1ac3e84d6c549de 100644
--- a/src/hydro/Minimal/hydro_io.h
+++ b/src/hydro/Minimal/hydro_io.h
@@ -56,6 +56,8 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  * @param h_grp The HDF5 group in which to write the arrays.
  * @param fileName The name of the file (unsued in MPI mode).
+ * @param partTypeGroupName The name of the group containing the particles in
+ * the HDF5 file.
  * @param xmfFile The XMF file to write to (unused in MPI mode).
  * @param N The number of particles on that MPI rank.
  * @param N_total The total number of particles (only used in MPI mode)
@@ -67,26 +69,31 @@ __attribute__((always_inline)) INLINE static void hydro_read_particles(
  *
  */
 __attribute__((always_inline)) INLINE static void hydro_write_particles(
-    hid_t h_grp, char* fileName, FILE* xmfFile, int N, long long N_total,
-    int mpi_rank, long long offset, struct part* parts, struct UnitSystem* us) {
+    hid_t h_grp, char* fileName, char* partTypeGroupName, FILE* xmfFile, int N,
+    long long N_total, int mpi_rank, long long offset, struct part* parts,
+    struct UnitSystem* us) {
 
   /* Write arrays */
-  writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts,
-             N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
-  writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, mass, us, UNIT_CONV_MASS);
-  writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH);
-  writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts,
-             N_total, mpi_rank, offset, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS);
-  writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts,
-             N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS);
-  writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts,
-             N_total, mpi_rank, offset, a_hydro, us, UNIT_CONV_ACCELERATION);
-  writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total,
-             mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Coordinates", DOUBLE,
+             N, 3, parts, N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Velocities", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Masses", FLOAT, N, 1,
+             parts, N_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us,
+             UNIT_CONV_LENGTH);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy",
+             FLOAT, N, 1, parts, N_total, mpi_rank, offset, u, us,
+             UNIT_CONV_ENERGY_PER_UNIT_MASS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs",
+             ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us,
+             UNIT_CONV_NO_UNITS);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Acceleration", FLOAT,
+             N, 3, parts, N_total, mpi_rank, offset, a_hydro, us,
+             UNIT_CONV_ACCELERATION);
+  writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Density", FLOAT, N,
+             1, parts, N_total, mpi_rank, offset, rho, us, UNIT_CONV_DENSITY);
 }
 
 /**
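
All three hydro flavours now thread partTypeGroupName through to every
writeArray call so that the XMF entry for each dataset can point at the right
HDF5 group, and the updated macro in src/parallel_io.c below also forwards
sizeof(part[0]) so the same back-end can stride over any particle struct. A
stripped-down sketch of that striding, using a hypothetical toy_part instead
of SWIFT's real particle types:

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical particle type standing in for struct part / struct gpart. */
    struct toy_part {
      double x[3];
      float mass;
    };

    /* Copy one field out of an interleaved particle array, the way
     * writeArrayBackEnd walks (part_c, partSize, copySize). */
    static void gather_field(const char *part_c, size_t partSize,
                             size_t copySize, size_t N, char *out) {
      for (size_t i = 0; i < N; i++)
        memcpy(out + i * copySize, part_c + i * partSize, copySize);
    }

    int main(void) {
      struct toy_part parts[2] = {{{0., 0., 0.}, 1.5f}, {{1., 1., 1.}, 2.5f}};
      float masses[2];

      /* Conceptually what writeArray(..., mass, ...) hands to the back-end:
       * a pointer to the first field plus the stride of one particle. */
      gather_field((const char *)&parts[0].mass, sizeof(parts[0]),
                   sizeof(float), 2, (char *)masses);

      printf("%f %f\n", masses[0], masses[1]); /* 1.500000 2.500000 */
      return 0;
    }
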
diff --git a/src/parallel_io.c b/src/parallel_io.c
index cffa99a0fd75566ec3e850076d15e104504eeb40..0076c225e1c5361287280f8a567c8062aefd914e 100644
--- a/src/parallel_io.c
+++ b/src/parallel_io.c
@@ -178,9 +178,10 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
  *
  * Calls #error() if an error occurs.
  */
-void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                       enum DATA_TYPE type, int N, int dim, long long N_total,
-                       int mpi_rank, long long offset, char* part_c,
+void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
+                       char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                       int N, int dim, long long N_total, int mpi_rank,
+                       long long offset, char* part_c, size_t partSize,
                        struct UnitSystem* us,
                        enum UnitConversionFactor convFactor) {
   hid_t h_data = 0, h_err = 0, h_memspace = 0, h_filespace = 0, h_plist_id = 0;
@@ -189,7 +190,6 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   int i = 0, rank = 0;
   const size_t typeSize = sizeOfType(type);
   const size_t copySize = typeSize * dim;
-  const size_t partSize = sizeof(struct part);
   char* temp_c = 0;
   char buffer[150];
 
@@ -269,7 +269,9 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   }
 
   /* Write XMF description for this data set */
-  if (mpi_rank == 0) writeXMFline(xmfFile, fileName, name, N_total, dim, type);
+  if (mpi_rank == 0)
+    writeXMFline(xmfFile, fileName, partTypeGroupName, name, N_total, dim,
+                 type);
 
   /* Write unit conversion factors for this data set */
   conversionString(buffer, us, convFactor);
@@ -328,14 +330,16 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, \
-                   mpi_rank, offset, field, us, convFactor)                   \
-  writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, N_total,      \
-                    mpi_rank, offset, (char*)(&(part[0]).field), us,          \
-                    convFactor)
+#define writeArray(grp, fileName, xmfFile, pTypeGroupName, name, type, N, dim, \
+                   part, N_total, mpi_rank, offset, field, us, convFactor)     \
+  writeArrayBackEnd(grp, fileName, xmfFile, pTypeGroupName, name, type, N,     \
+                    dim, N_total, mpi_rank, offset, (char*)(&(part[0]).field), \
+                    sizeof(part[0]), us, convFactor)
 
 /* Import the right hydro definition */
 #include "hydro_io.h"
+/* Import the right gravity definition */
+#include "gravity_io.h"
 
 /**
  * @brief Reads an HDF5 initial condition file (GADGET-3 type) in parallel
@@ -357,16 +361,17 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  *
  */
 void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
-                      size_t* N, int* periodic, int mpi_rank, int mpi_size,
-                      MPI_Comm comm, MPI_Info info) {
+                      struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
+                      int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                      MPI_Info info) {
   hid_t h_file = 0, h_grp = 0;
-  double boxSize[3] = {
-      0.0, -1.0, -1.0}; /* GADGET has only cubic boxes (in cosmological mode) */
-  int numParticles[6] = {
-      0}; /* GADGET has 6 particle types. We only keep the type 0*/
-  int numParticles_highWord[6] = {0};
-  long long offset = 0;
-  long long N_total = 0;
+  /* GADGET has only cubic boxes (in cosmological mode) */
+  double boxSize[3] = {0.0, -1.0, -1.0};
+  int numParticles[NUM_PARTICLE_TYPES] = {0};
+  int numParticles_highWord[NUM_PARTICLE_TYPES] = {0};
+  size_t N[NUM_PARTICLE_TYPES] = {0};
+  long long N_total[NUM_PARTICLE_TYPES] = {0};
+  long long offset[NUM_PARTICLE_TYPES] = {0};
 
   /* Open file */
   /* message("Opening file '%s' as IC.", fileName); */
@@ -398,58 +403,116 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
   readAttribute(h_grp, "NumPart_Total", UINT, numParticles);
   readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord);
 
-  N_total = ((long long)numParticles[0]) +
-            ((long long)numParticles_highWord[0] << 32);
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+    N_total[ptype] = ((long long)numParticles[ptype]) +
+                     ((long long)numParticles_highWord[ptype] << 32);
+
   dim[0] = boxSize[0];
   dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1];
   dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2];
 
-  /* message("Found %d particles in a %speriodic box of size [%f %f %f].",  */
-  /* 	 N_total, (periodic ? "": "non-"), dim[0], dim[1], dim[2]); */
+  /* message("Found %d particles in a %speriodic box of size
+   * [%f %f %f].",  */
+  /* 	 N_total, (periodic ? "": "non-"), dim[0],
+   * dim[1], dim[2]); */
 
   /* Divide the particles among the tasks. */
-  offset = mpi_rank * N_total / mpi_size;
-  *N = (mpi_rank + 1) * N_total / mpi_size - offset;
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+    offset[ptype] = mpi_rank * N_total[ptype] / mpi_size;
+    N[ptype] = (mpi_rank + 1) * N_total[ptype] / mpi_size - offset[ptype];
+  }
 
   /* Close header */
   H5Gclose(h_grp);
 
-  /* Allocate memory to store particles */
-  if (posix_memalign((void*)parts, part_align, *N * sizeof(struct part)) != 0)
+  /* Allocate memory to store SPH particles */
+  *Ngas = N[0];
+  if (posix_memalign((void*)parts, part_align, (*Ngas) * sizeof(struct part)) !=
+      0)
     error("Error while allocating memory for particles");
-  bzero(*parts, *N * sizeof(struct part));
+  bzero(*parts, *Ngas * sizeof(struct part));
 
-  /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) /
+  /* Allocate memory to store all particles */
+  const size_t Ndm = N[1];
+  *Ngparts = N[1] + N[0];
+  if (posix_memalign((void*)gparts, gpart_align,
+                     *Ngparts * sizeof(struct gpart)) != 0)
+    error("Error while allocating memory for gravity particles");
+  bzero(*gparts, *Ngparts * sizeof(struct gpart));
+
+  /* message("Allocated %8.2f MB for particles.", *N *
+   * sizeof(struct part) /
    * (1024.*1024.)); */
 
-  /* Open SPH particles group */
-  /* message("Reading particle arrays..."); */
-  h_grp = H5Gopen(h_file, "/PartType0", H5P_DEFAULT);
-  if (h_grp < 0) error("Error while opening particle group.\n");
+  /* message("BoxSize = %lf", dim[0]); */
+  /* message("NumPart = [%zd, %zd] Total = %zd", *Ngas, Ndm,
+   * *Ngparts); */
 
-  /* Read particle fields into the particle structure */
-  hydro_read_particles(h_grp, *N, N_total, offset, *parts);
+  /* Loop over all particle types */
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
 
-  /* Close particle group */
-  H5Gclose(h_grp);
+    /* Don't do anything if no particle of this kind */
+    if (N_total[ptype] == 0) continue;
+
+    /* Open the particle group in the file */
+    char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+    snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+             ptype);
+    h_grp = H5Gopen(h_file, partTypeGroupName, H5P_DEFAULT);
+    if (h_grp < 0) {
+      error("Error while opening particle group %s.", partTypeGroupName);
+    }
+
+    /* Read particle fields into the particle structure */
+    switch (ptype) {
+
+      case GAS:
+        hydro_read_particles(h_grp, N[ptype], N_total[ptype], offset[ptype],
+                             *parts);
+        break;
+
+      case DM:
+        darkmatter_read_particles(h_grp, N[ptype], N_total[ptype],
+                                  offset[ptype], *gparts);
+        break;
+
+      default:
+        error("Particle Type %d not yet supported. Aborting", ptype);
+    }
+
+    /* Close particle group */
+    H5Gclose(h_grp);
+  }
+
+  /* Prepare the DM particles */
+  prepare_dm_gparts(*gparts, Ndm);
+
+  /* Now duplicate the hydro particle into gparts */
+  duplicate_hydro_gparts(*parts, *gparts, *Ngas, Ndm);
+
+  /* message("Done Reading particles..."); */
 
   /* Close property handler */
   H5Pclose(h_plist_id);
 
   /* Close file */
   H5Fclose(h_file);
-
-  /* message("Done Reading particles..."); */
 }
 
 /**
- * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor
+ * @brief Writes an HDF5 output file (GADGET-3 type) with
+ * its XMF descriptor
  *
  * @param e The engine containing all the system.
- * @param us The UnitSystem used for the conversion of units in the output
+ * @param us The UnitSystem used for the conversion of units
+ * in the output
  *
- * Creates an HDF5 output file and writes the particles contained
- * in the engine. If such a file already exists, it is erased and replaced
+ * Creates an HDF5 output file and writes the particles
+ * contained in the engine.
+ * If such a file already exists, it is
+ * erased and replaced
  * by the new one.
  * The companion XMF file is also updated accordingly.
  *
@@ -459,23 +522,27 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
 void write_output_parallel(struct engine* e, struct UnitSystem* us,
                            int mpi_rank, int mpi_size, MPI_Comm comm,
                            MPI_Info info) {
-
   hid_t h_file = 0, h_grp = 0, h_grpsph = 0;
-  int N = e->s->nr_parts;
+  const size_t Ngas = e->s->nr_parts;
+  const size_t Ntot = e->s->nr_gparts;
   int periodic = e->s->periodic;
-  unsigned int numParticles[6] = {N, 0};
-  unsigned int numParticlesHighWord[6] = {0};
-  unsigned int flagEntropy[6] = {0};
-  long long N_total = 0, offset = 0;
-  double offset_d = 0., N_d = 0., N_total_d = 0.;
   int numFiles = 1;
   struct part* parts = e->s->parts;
-  FILE* xmfFile = 0;
+  struct gpart* gparts = e->s->gparts;
+  struct gpart* dmparts = NULL;
   static int outputCount = 0;
+  FILE* xmfFile = 0;
+
+  /* Number of particles of each type */
+  // const size_t Ndm = Ntot - Ngas;
+
+  /* MATTHIEU: Temporary fix to preserve master */
+  const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
+  /* MATTHIEU: End temporary fix */
 
   /* File name */
-  char fileName[200];
-  sprintf(fileName, "output_%03i.hdf5", outputCount);
+  char fileName[FILENAME_BUFFER_SIZE];
+  snprintf(fileName, FILENAME_BUFFER_SIZE, "output_%03i.hdf5", outputCount);
 
   /* First time, we need to create the XMF file */
   if (outputCount == 0 && mpi_rank == 0) createXMFfile();
@@ -491,21 +558,26 @@ void write_output_parallel(struct engine* e, struct UnitSystem* us,
     error("Error while opening file '%s'.", fileName);
   }
 
-  /* Compute offset in the file and total number of particles */
-  /* Done using double to allow for up to 2^50=10^15 particles */
-  N_d = (double)N;
-  MPI_Exscan(&N_d, &offset_d, 1, MPI_DOUBLE, MPI_SUM, comm);
-  N_total_d = offset_d + N_d;
-  MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size - 1, comm);
-  if (N_total_d > 1.e15)
-    error(
-        "Error while computing the offset for parallel output: Simulation has "
-        "more than 10^15 particles.\n");
-  N_total = (long long)N_total_d;
-  offset = (long long)offset_d;
+  /* Compute offset in the file and total number of particles */
+  size_t N[NUM_PARTICLE_TYPES] = {Ngas, Ndm, 0};
+  long long N_total[NUM_PARTICLE_TYPES] = {0};
+  long long offset[NUM_PARTICLE_TYPES] = {0};
+  MPI_Exscan(&N, &offset, NUM_PARTICLE_TYPES, MPI_LONG_LONG, MPI_SUM, comm);
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype)
+    N_total[ptype] = offset[ptype] + N[ptype];
+
+  /* The last rank now has the correct N_total. Let's broadcast from there. */
+  MPI_Bcast(&N_total, NUM_PARTICLE_TYPES, MPI_LONG_LONG, mpi_size - 1, comm);
 
-  /* Write the part of the XMF file corresponding to this specific output */
-  if (mpi_rank == 0) writeXMFheader(xmfFile, N_total, fileName, e->time);
+  /* Now everybody knows its offset and the total number of
+   * particles of each type */
+
+  /* Write the part of the XMF file corresponding to this specific output */
+  if (mpi_rank == 0) writeXMFoutputheader(xmfFile, fileName, e->time);
 
   /* Open header to write simulation properties */
   /* message("Writing runtime parameters..."); */
@@ -526,19 +598,28 @@ void write_output_parallel(struct engine* e, struct UnitSystem* us,
 
   /* Print the relevant information and print status */
   writeAttribute(h_grp, "BoxSize", DOUBLE, e->s->dim, 3);
-  writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles, 6);
   double dblTime = e->time;
   writeAttribute(h_grp, "Time", DOUBLE, &dblTime, 1);
 
   /* GADGET-2 legacy values */
-  numParticles[0] = (unsigned int)N_total;
-  writeAttribute(h_grp, "NumPart_Total", UINT, numParticles, 6);
-  numParticlesHighWord[0] = (unsigned int)(N_total >> 32);
+  /* Number of particles of each type */
+  unsigned int numParticles[NUM_PARTICLE_TYPES] = {0};
+  unsigned int numParticlesHighWord[NUM_PARTICLE_TYPES] = {0};
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ++ptype) {
+    numParticles[ptype] = (unsigned int)N_total[ptype];
+    numParticlesHighWord[ptype] = (unsigned int)(N_total[ptype] >> 32);
+  }
+  writeAttribute(h_grp, "NumPart_ThisFile", LONGLONG, N_total,
+                 NUM_PARTICLE_TYPES);
+  writeAttribute(h_grp, "NumPart_Total", UINT, numParticles,
+                 NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord,
-                 6);
+                 NUM_PARTICLE_TYPES);
   double MassTable[6] = {0., 0., 0., 0., 0., 0.};
-  writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, 6);
-  writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, 6);
+  writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES);
+  unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0};
+  writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy,
+                 NUM_PARTICLE_TYPES);
   writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1);
 
   /* Close header */
@@ -556,21 +637,71 @@ void write_output_parallel(struct engine* e, struct UnitSystem* us,
   /* Print the system of Units */
   writeUnitSystem(h_file, us);
 
-  /* Create SPH particles group */
-  /* message("Writing particle arrays..."); */
-  h_grp =
-      H5Gcreate(h_file, "/PartType0", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
-  if (h_grp < 0) error("Error while creating particle group.\n");
+  /* Loop over all particle types */
+  for (int ptype = 0; ptype < NUM_PARTICLE_TYPES; ptype++) {
+
+    /* Don't do anything if no particle of this kind */
+    if (N_total[ptype] == 0) continue;
+
+    /* Add the global information for that particle type to
+     * the XMF meta-file */
+    if (mpi_rank == 0)
+      writeXMFgroupheader(xmfFile, fileName, N_total[ptype], ptype);
+
+    /* Open the particle group in the file */
+    char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
+    snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
+             ptype);
+    h_grp = H5Gcreate(h_file, partTypeGroupName, H5P_DEFAULT, H5P_DEFAULT,
+                      H5P_DEFAULT);
+    if (h_grp < 0) {
+      error("Error while opening particle group %s.", partTypeGroupName);
+    }
 
-  /* Write particle fields from the particle structure */
-  hydro_write_particles(h_grp, fileName, xmfFile, N, N_total, mpi_rank, offset,
-                        parts, us);
+    /* Write particle fields from the particle structure */
+    switch (ptype) {
 
-  /* Close particle group */
-  H5Gclose(h_grp);
+      case GAS:
+        hydro_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                              N[ptype], N_total[ptype], mpi_rank, offset[ptype],
+                              parts, us);
+
+        break;
+
+      case DM:
+        /* Allocate temporary array */
+        if (posix_memalign((void*)&dmparts, gpart_align,
+                           Ndm * sizeof(struct gpart)) != 0)
+          error("Error while allocating temporary memory for DM particles");
+        bzero(dmparts, Ndm * sizeof(struct gpart));
+
+        /* Collect the DM particles from gpart */
+        collect_dm_gparts(gparts, Ntot, dmparts, Ndm);
+
+        /* Write DM particles */
+        darkmatter_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                                   N[ptype], N_total[ptype], mpi_rank,
+                                   offset[ptype], dmparts, us);
+
+        /* Free temporary array */
+        free(dmparts);
+        break;
+
+      default:
+        error("Particle Type %d not yet supported. Aborting", ptype);
+    }
+
+    /* Close particle group */
+    H5Gclose(h_grp);
+
+    /* Close this particle group in the XMF file as well */
+    if (mpi_rank == 0) writeXMFgroupfooter(xmfFile, ptype);
+  }
 
   /* Write LXMF file descriptor */
-  if (mpi_rank == 0) writeXMFfooter(xmfFile);
+  if (mpi_rank == 0) writeXMFoutputfooter(xmfFile, outputCount, e->time);
 
   /* message("Done writing particles..."); */
 
diff --git a/src/parallel_io.h b/src/parallel_io.h
index a0589944ec845c712abde1e64e305980748db0e7..663f0aabac44c08682b964512839b925673ea5c5 100644
--- a/src/parallel_io.h
+++ b/src/parallel_io.h
@@ -32,8 +32,9 @@
 #if defined(HAVE_HDF5) && defined(WITH_MPI) && defined(HAVE_PARALLEL_HDF5)
 
 void read_ic_parallel(char* fileName, double dim[3], struct part** parts,
-                      size_t* N, int* periodic, int mpi_rank, int mpi_size,
-                      MPI_Comm comm, MPI_Info info);
+                      struct gpart** gparts, size_t* Ngas, size_t* Ngparts,
+                      int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm,
+                      MPI_Info info);
 
 void write_output_parallel(struct engine* e, struct UnitSystem* us,
                            int mpi_rank, int mpi_size, MPI_Comm comm,
diff --git a/src/part.c b/src/part.c
index 6a99325ef23a7062fafb387fa3f3bd6b2203d057..b89abdde40fe8c7a57d1e9ac9e18fece83ba1f21 100644
--- a/src/part.c
+++ b/src/part.c
@@ -26,33 +26,21 @@
 #endif
 
 /* This object's header. */
+#include "error.h"
 #include "part.h"
 
 #ifdef WITH_MPI
-/**
- * @brief Registers and returns an MPI type for the particles
- *
- * @param part_type The type container
- */
-void part_create_mpi_type(MPI_Datatype* part_type) {
-
-  /* This is not the recommended way of doing this.
-     One should define the structure field by field
-     But as long as we don't do serialization via MPI-IO
-     we don't really care.
-     Also we would have to modify this function everytime something
-     is added to the part structure. */
-  MPI_Type_contiguous(sizeof(struct part) / sizeof(unsigned char), MPI_BYTE,
-                      part_type);
-  MPI_Type_commit(part_type);
-}
+/* MPI data types for the particle transfers */
+MPI_Datatype part_mpi_type;
+MPI_Datatype xpart_mpi_type;
+MPI_Datatype gpart_mpi_type;
+#endif
 
+#ifdef WITH_MPI
 /**
- * @brief Registers and returns an MPI type for the xparticles
- *
- * @param xpart_type The type container
+ * @brief Registers MPI particle types.
  */
-void xpart_create_mpi_type(MPI_Datatype* xpart_type) {
+void part_create_mpi_types() {
 
   /* This is not the recommended way of doing this.
      One should define the structure field by field
@@ -60,9 +48,20 @@ void xpart_create_mpi_type(MPI_Datatype* xpart_type) {
      we don't really care.
      Also we would have to modify this function everytime something
      is added to the part structure. */
-  MPI_Type_contiguous(sizeof(struct xpart) / sizeof(unsigned char), MPI_BYTE,
-                      xpart_type);
-  MPI_Type_commit(xpart_type);
+  if (MPI_Type_contiguous(sizeof(struct part) / sizeof(unsigned char), MPI_BYTE,
+                          &part_mpi_type) != MPI_SUCCESS ||
+      MPI_Type_commit(&part_mpi_type) != MPI_SUCCESS) {
+    error("Failed to create MPI type for parts.");
+  }
+  if (MPI_Type_contiguous(sizeof(struct xpart) / sizeof(unsigned char),
+                          MPI_BYTE, &xpart_mpi_type) != MPI_SUCCESS ||
+      MPI_Type_commit(&xpart_mpi_type) != MPI_SUCCESS) {
+    error("Failed to create MPI type for xparts.");
+  }
+  if (MPI_Type_contiguous(sizeof(struct gpart) / sizeof(unsigned char),
+                          MPI_BYTE, &gpart_mpi_type) != MPI_SUCCESS ||
+      MPI_Type_commit(&gpart_mpi_type) != MPI_SUCCESS) {
+    error("Failed to create MPI type for gparts.");
+  }
 }
-
 #endif
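
part_create_mpi_types() now registers the three contiguous particle datatypes
once and exposes them as globals, replacing the per-caller
part/xpart_create_mpi_type() pair. A minimal usage sketch, assuming a
hypothetical standalone program compiled with WITH_MPI and linked against the
SWIFT sources:

    #include <mpi.h>
    #include <string.h>

    #include "part.h" /* declares part_mpi_type and part_create_mpi_types() */

    int main(int argc, char *argv[]) {
      MPI_Init(&argc, &argv);

      /* Register the part/xpart/gpart MPI types once, before any exchange. */
      part_create_mpi_types();

      /* The committed globals are then ordinary MPI datatypes, e.g. for a
       * broadcast of a single zero-initialised particle from rank 0. */
      struct part p;
      memset(&p, 0, sizeof(struct part));
      MPI_Bcast(&p, 1, part_mpi_type, 0, MPI_COMM_WORLD);

      MPI_Finalize();
      return 0;
    }
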
diff --git a/src/part.h b/src/part.h
index 865403e8c2c157dc5a8ff7a32bc41be676d7919b..5d4c9c88a1acadea3d23a3df618c04da389fb61d 100644
--- a/src/part.h
+++ b/src/part.h
@@ -35,8 +35,8 @@
 
 /* Some constants. */
 #define part_align 64
-#define gpart_align 32
 #define xpart_align 32
+#define gpart_align 32
 
 /* Import the right particle definition */
 #if defined(MINIMAL_SPH)
@@ -52,8 +52,12 @@
 #include "./gravity/Default/gravity_part.h"
 
 #ifdef WITH_MPI
-void part_create_mpi_type(MPI_Datatype* part_type);
-void xpart_create_mpi_type(MPI_Datatype* xpart_type);
+/* MPI data types for the particle transfers */
+extern MPI_Datatype part_mpi_type;
+extern MPI_Datatype xpart_mpi_type;
+extern MPI_Datatype gpart_mpi_type;
+
+void part_create_mpi_types();
 #endif
 
 #endif /* SWIFT_PART_H */
diff --git a/src/proxy.c b/src/proxy.c
index 7d2e546bf945ca18c2195ea2801d1b2058cb2f58..02263a5653bdcdd2d1bf0a86523ed1a599d4bf21 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -50,11 +50,9 @@ void proxy_cells_exch1(struct proxy *p) {
 
 #ifdef WITH_MPI
 
-  int k, ind;
-
   /* Get the number of pcells we will need to send. */
   p->size_pcells_out = 0;
-  for (k = 0; k < p->nr_cells_out; k++)
+  for (int k = 0; k < p->nr_cells_out; k++)
     p->size_pcells_out += p->cells_out[k]->pcell_size;
 
   /* Send the number of pcells. */
@@ -70,7 +68,7 @@ void proxy_cells_exch1(struct proxy *p) {
   if ((p->pcells_out = malloc(sizeof(struct pcell) * p->size_pcells_out)) ==
       NULL)
     error("Failed to allocate pcell_out buffer.");
-  for (ind = 0, k = 0; k < p->nr_cells_out; k++) {
+  for (int ind = 0, k = 0; k < p->nr_cells_out; k++) {
     memcpy(&p->pcells_out[ind], p->cells_out[k]->pcell,
            sizeof(struct pcell) * p->cells_out[k]->pcell_size);
     ind += p->cells_out[k]->pcell_size;
@@ -131,16 +129,14 @@ void proxy_cells_exch2(struct proxy *p) {
 
 void proxy_addcell_in(struct proxy *p, struct cell *c) {
 
-  int k;
-  struct cell **temp;
-
   /* Check if the cell is already registered with the proxy. */
-  for (k = 0; k < p->nr_cells_in; k++)
+  for (int k = 0; k < p->nr_cells_in; k++)
     if (p->cells_in[k] == c) return;
 
   /* Do we need to grow the number of in cells? */
   if (p->nr_cells_in == p->size_cells_in) {
     p->size_cells_in *= proxy_buffgrow;
+    struct cell **temp;
     if ((temp = malloc(sizeof(struct cell *) * p->size_cells_in)) == NULL)
       error("Failed to allocate incoming cell list.");
     memcpy(temp, p->cells_in, sizeof(struct cell *) * p->nr_cells_in);
@@ -162,16 +158,14 @@ void proxy_addcell_in(struct proxy *p, struct cell *c) {
 
 void proxy_addcell_out(struct proxy *p, struct cell *c) {
 
-  int k;
-  struct cell **temp;
-
   /* Check if the cell is already registered with the proxy. */
-  for (k = 0; k < p->nr_cells_out; k++)
+  for (int k = 0; k < p->nr_cells_out; k++)
     if (p->cells_out[k] == c) return;
 
   /* Do we need to grow the number of out cells? */
   if (p->nr_cells_out == p->size_cells_out) {
     p->size_cells_out *= proxy_buffgrow;
+    struct cell **temp;
     if ((temp = malloc(sizeof(struct cell *) * p->size_cells_out)) == NULL)
       error("Failed to allocate outgoing cell list.");
     memcpy(temp, p->cells_out, sizeof(struct cell *) * p->nr_cells_out);
@@ -195,20 +189,21 @@ void proxy_parts_exch1(struct proxy *p) {
 #ifdef WITH_MPI
 
   /* Send the number of particles. */
-  if (MPI_Isend(&p->nr_parts_out, 1, MPI_INT, p->nodeID,
+  p->buff_out[0] = p->nr_parts_out;
+  p->buff_out[1] = p->nr_gparts_out;
+  if (MPI_Isend(p->buff_out, 2, MPI_INT, p->nodeID,
                 p->mynodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD,
                 &p->req_parts_count_out) != MPI_SUCCESS)
     error("Failed to isend nr of parts.");
-  // message( "isent particle count (%i) from node %i to node %i." ,
-  // p->nr_parts_out , p->mynodeID , p->nodeID ); fflush(stdout);
+  /* message( "isent particle counts [%i, %i] from node %i to node %i." ,
+  p->buff_out[0], p->buff_out[1], p->mynodeID , p->nodeID ); fflush(stdout); */
 
   /* Send the particle buffers. */
   if (p->nr_parts_out > 0) {
-    if (MPI_Isend(p->parts_out, sizeof(struct part) * p->nr_parts_out, MPI_BYTE,
-                  p->nodeID, p->mynodeID * proxy_tag_shift + proxy_tag_parts,
+    if (MPI_Isend(p->parts_out, p->nr_parts_out, part_mpi_type, p->nodeID,
+                  p->mynodeID * proxy_tag_shift + proxy_tag_parts,
                   MPI_COMM_WORLD, &p->req_parts_out) != MPI_SUCCESS ||
-        MPI_Isend(p->xparts_out, sizeof(struct xpart) * p->nr_parts_out,
-                  MPI_BYTE, p->nodeID,
+        MPI_Isend(p->xparts_out, p->nr_parts_out, xpart_mpi_type, p->nodeID,
                   p->mynodeID * proxy_tag_shift + proxy_tag_xparts,
                   MPI_COMM_WORLD, &p->req_xparts_out) != MPI_SUCCESS)
       error("Failed to isend part data.");
@@ -219,14 +214,20 @@ void proxy_parts_exch1(struct proxy *p) {
               p->parts_out[k].id, p->parts_out[k].x[0], p->parts_out[k].x[1],
               p->parts_out[k].x[2], p->parts_out[k].h, p->nodeID);*/
   }
+  if (p->nr_gparts_out > 0) {
+    if (MPI_Isend(p->gparts_out, p->nr_gparts_out, gpart_mpi_type, p->nodeID,
+                  p->mynodeID * proxy_tag_shift + proxy_tag_gparts,
+                  MPI_COMM_WORLD, &p->req_gparts_out) != MPI_SUCCESS)
+      error("Failed to isend part data.");
+    // message( "isent gpart data (%i) to node %i." , p->nr_parts_out ,
+    // p->nodeID ); fflush(stdout);
+  }
 
   /* Receive the number of particles. */
-  if (MPI_Irecv(&p->nr_parts_in, 1, MPI_INT, p->nodeID,
+  if (MPI_Irecv(p->buff_in, 2, MPI_INT, p->nodeID,
                 p->nodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD,
                 &p->req_parts_count_in) != MPI_SUCCESS)
     error("Failed to irecv nr of parts.");
-// message( "irecv particle count on node %i from node %i." , p->mynodeID ,
-// p->nodeID ); fflush(stdout);
 
 #else
   error("SWIFT was not compiled with MPI support.");
@@ -237,6 +238,10 @@ void proxy_parts_exch2(struct proxy *p) {
 
 #ifdef WITH_MPI
 
+  /* Unpack the incoming particle counts. */
+  p->nr_parts_in = p->buff_in[0];
+  p->nr_gparts_in = p->buff_in[1];
+
   /* Is there enough space in the buffer? */
   if (p->nr_parts_in > p->size_parts_in) {
     do {
@@ -250,19 +255,36 @@ void proxy_parts_exch2(struct proxy *p) {
                                                p->size_parts_in)) == NULL)
       error("Failed to re-allocate parts_in buffers.");
   }
+  if (p->nr_gparts_in > p->size_gparts_in) {
+    do {
+      p->size_gparts_in *= proxy_buffgrow;
+    } while (p->nr_gparts_in > p->size_gparts_in);
+    free(p->gparts_in);
+    if ((p->gparts_in = (struct gpart *)malloc(sizeof(struct gpart) *
+                                               p->size_gparts_in)) == NULL)
+      error("Failed to re-allocate gparts_in buffers.");
+  }
 
   /* Receive the particle buffers. */
   if (p->nr_parts_in > 0) {
-    if (MPI_Irecv(p->parts_in, sizeof(struct part) * p->nr_parts_in, MPI_BYTE,
-                  p->nodeID, p->nodeID * proxy_tag_shift + proxy_tag_parts,
-                  MPI_COMM_WORLD, &p->req_parts_in) != MPI_SUCCESS ||
-        MPI_Irecv(p->xparts_in, sizeof(struct xpart) * p->nr_parts_in, MPI_BYTE,
-                  p->nodeID, p->nodeID * proxy_tag_shift + proxy_tag_xparts,
+    if (MPI_Irecv(p->parts_in, p->nr_parts_in, part_mpi_type, p->nodeID,
+                  p->nodeID * proxy_tag_shift + proxy_tag_parts, MPI_COMM_WORLD,
+                  &p->req_parts_in) != MPI_SUCCESS ||
+        MPI_Irecv(p->xparts_in, p->nr_parts_in, xpart_mpi_type, p->nodeID,
+                  p->nodeID * proxy_tag_shift + proxy_tag_xparts,
                   MPI_COMM_WORLD, &p->req_xparts_in) != MPI_SUCCESS)
       error("Failed to irecv part data.");
     // message( "irecv particle data (%i) from node %i." , p->nr_parts_in ,
     // p->nodeID ); fflush(stdout);
   }
+  if (p->nr_gparts_in > 0) {
+    if (MPI_Irecv(p->gparts_in, p->nr_gparts_in, gpart_mpi_type, p->nodeID,
+                  p->nodeID * proxy_tag_shift + proxy_tag_gparts,
+                  MPI_COMM_WORLD, &p->req_gparts_in) != MPI_SUCCESS)
+      error("Failed to irecv gpart data.");
+    // message( "irecv gpart data (%i) from node %i." , p->nr_gparts_in ,
+    // p->nodeID ); fflush(stdout);
+  }
 
 #else
   error("SWIFT was not compiled with MPI support.");
@@ -278,8 +300,8 @@ void proxy_parts_exch2(struct proxy *p) {
  * @param N The number of parts.
  */
 
-void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts,
-                      int N) {
+void proxy_parts_load(struct proxy *p, const struct part *parts,
+                      const struct xpart *xparts, int N) {
 
   /* Is there enough space in the buffer? */
   if (p->nr_parts_out + N > p->size_parts_out) {
@@ -309,6 +331,37 @@ void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts,
   p->nr_parts_out += N;
 }
 
+/**
+ * @brief Load gparts onto a proxy for exchange.
+ *
+ * @param p The #proxy.
+ * @param gparts Pointer to an array of #gpart to send.
+ * @param N The number of gparts.
+ */
+
+void proxy_gparts_load(struct proxy *p, const struct gpart *gparts, int N) {
+
+  /* Is there enough space in the buffer? */
+  if (p->nr_gparts_out + N > p->size_gparts_out) {
+    do {
+      p->size_gparts_out *= proxy_buffgrow;
+    } while (p->nr_gparts_out + N > p->size_gparts_out);
+    struct gpart *tp;
+    if ((tp = (struct gpart *)malloc(sizeof(struct gpart) *
+                                     p->size_gparts_out)) == NULL)
+      error("Failed to re-allocate gparts_out buffers.");
+    memcpy(tp, p->gparts_out, sizeof(struct gpart) * p->nr_gparts_out);
+    free(p->gparts_out);
+    p->gparts_out = tp;
+  }
+
+  /* Copy the gparts data to the buffer. */
+  memcpy(&p->gparts_out[p->nr_gparts_out], gparts, sizeof(struct gpart) * N);
+
+  /* Increase the counters. */
+  p->nr_gparts_out += N;
+}
+
 /**
  * @brief Initialize the given proxy.
  *
@@ -358,4 +411,20 @@ void proxy_init(struct proxy *p, int mynodeID, int nodeID) {
       error("Failed to allocate parts_out buffers.");
   }
   p->nr_parts_out = 0;
+
+  /* Allocate the gpart send and receive buffers, if needed. */
+  if (p->gparts_in == NULL) {
+    p->size_gparts_in = proxy_buffinit;
+    if ((p->gparts_in = (struct gpart *)malloc(sizeof(struct gpart) *
+                                               p->size_gparts_in)) == NULL)
+      error("Failed to allocate gparts_in buffers.");
+  }
+  p->nr_gparts_in = 0;
+  if (p->gparts_out == NULL) {
+    p->size_gparts_out = proxy_buffinit;
+    if ((p->gparts_out = (struct gpart *)malloc(sizeof(struct gpart) *
+                                                p->size_gparts_out)) == NULL)
+      error("Failed to allocate gparts_out buffers.");
+  }
+  p->nr_gparts_out = 0;
 }
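
With the new gpart buffers, a proxy exchange now moves three particle arrays
in the same two-phase pattern as before. A rough sketch of the calling
sequence (the driver lives in engine.c, outside this diff, so the exact
sequencing and the helper below are assumptions; only the proxy_* calls and
the req_parts_count_in request come from the code above):

    #include <mpi.h>

    #include "proxy.h"

    /* Hypothetical helper: stage local parts/gparts on a proxy and run the
     * two-phase exchange with its foreign node. */
    void exchange_with_neighbour(struct proxy *p, const struct part *parts,
                                 const struct xpart *xparts, int n_parts,
                                 const struct gpart *gparts, int n_gparts) {

      /* Stage the local particles destined for the foreign node. */
      proxy_parts_load(p, parts, xparts, n_parts);
      proxy_gparts_load(p, gparts, n_gparts);

      /* Phase 1: isend the counts [nr_parts_out, nr_gparts_out] and the data,
       * and post the irecv for the incoming counts. */
      proxy_parts_exch1(p);

      /* Wait for the incoming counts before sizing the receive buffers. */
      MPI_Wait(&p->req_parts_count_in, MPI_STATUS_IGNORE);

      /* Phase 2: unpack the counts, grow the buffers, post the receives. */
      proxy_parts_exch2(p);
    }
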
diff --git a/src/proxy.h b/src/proxy.h
index 3cd33e0f0819ee1ecac53213630445b39c809dea..5a747187e05a78a109ce4523ebb3c9d5fe2ad717 100644
--- a/src/proxy.h
+++ b/src/proxy.h
@@ -32,7 +32,8 @@
 #define proxy_tag_count 0
 #define proxy_tag_parts 1
 #define proxy_tag_xparts 2
-#define proxy_tag_cells 3
+#define proxy_tag_gparts 3
+#define proxy_tag_cells 4
 
 /* Data structure for the proxy. */
 struct proxy {
@@ -53,14 +54,21 @@ struct proxy {
   /* The parts and xparts buffers for input and output. */
   struct part *parts_in, *parts_out;
   struct xpart *xparts_in, *xparts_out;
+  struct gpart *gparts_in, *gparts_out;
   int size_parts_in, size_parts_out;
   int nr_parts_in, nr_parts_out;
+  int size_gparts_in, size_gparts_out;
+  int nr_gparts_in, nr_gparts_out;
+
+  /* Buffer to hold the incoming/outgoing particle counts. */
+  int buff_out[2], buff_in[2];
 
 /* MPI request handles. */
 #ifdef WITH_MPI
   MPI_Request req_parts_count_out, req_parts_count_in;
   MPI_Request req_parts_out, req_parts_in;
   MPI_Request req_xparts_out, req_xparts_in;
+  MPI_Request req_gparts_out, req_gparts_in;
   MPI_Request req_cells_count_out, req_cells_count_in;
   MPI_Request req_cells_out, req_cells_in;
 #endif
@@ -68,8 +76,9 @@ struct proxy {
 
 /* Function prototypes. */
 void proxy_init(struct proxy *p, int mynodeID, int nodeID);
-void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts,
-                      int N);
+void proxy_parts_load(struct proxy *p, const struct part *parts,
+                      const struct xpart *xparts, int N);
+void proxy_gparts_load(struct proxy *p, const struct gpart *gparts, int N);
 void proxy_parts_exch1(struct proxy *p);
 void proxy_parts_exch2(struct proxy *p);
 void proxy_addcell_in(struct proxy *p, struct cell *c);
diff --git a/src/queue.c b/src/queue.c
index a7321155100df9225526c2f19fac2b99531307e4..6b788d7376ba4bdc95f1b1d918ab52a9514e7b4a 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -136,9 +136,6 @@ struct task *queue_gettask(struct queue *q, const struct task *prev,
   lock_type *qlock = &q->lock;
   struct task *res = NULL;
 
-  /* If there are no tasks, leave immediately. */
-  if (q->count == 0) return NULL;
-
   /* Grab the task lock. */
   if (blocking) {
     if (lock_lock(qlock) != 0) error("Locking the qlock failed.\n");
@@ -146,6 +143,12 @@ struct task *queue_gettask(struct queue *q, const struct task *prev,
     if (lock_trylock(qlock) != 0) return NULL;
   }
 
+  /* If there are no tasks, leave immediately. */
+  if (q->count == 0) {
+    lock_unlock_blind(qlock);
+    return NULL;
+  }
+
   /* Set some pointers we will use often. */
   int *qtid = q->tid;
   struct task *qtasks = q->tasks;
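
The change above moves the empty-queue early-out inside the critical section,
presumably so the count is read under the lock and a concurrent producer or
consumer cannot invalidate it between the check and the lock. A minimal sketch
of the resulting pattern, with a plain pthread mutex standing in for SWIFT's
lock_type (toy names, not SWIFT's API):

    #include <pthread.h>

    struct toy_queue {
      pthread_mutex_t lock;
      int count;
      int tasks[64];
    };

    /* Pop a task index, or return -1 if the queue is empty. The emptiness
     * check is made while holding the lock, so it cannot act on a stale count. */
    int toy_queue_get(struct toy_queue *q) {
      pthread_mutex_lock(&q->lock);
      if (q->count == 0) {
        pthread_mutex_unlock(&q->lock);
        return -1;
      }
      const int res = q->tasks[--q->count];
      pthread_mutex_unlock(&q->lock);
      return res;
    }

    int main(void) {
      struct toy_queue q = {PTHREAD_MUTEX_INITIALIZER, 0, {0}};
      return toy_queue_get(&q) == -1 ? 0 : 1;
    }
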
diff --git a/src/runner.c b/src/runner.c
index 7eedb6adc72755ba12faed5429edad43d3849451..5e1b6497d757c8c25230823e7f8d60668f947618 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -664,7 +664,7 @@ void runner_dodrift(struct runner *r, struct cell *c, int timer) {
   const float ti_current = r->e->ti_current;
   struct part *restrict p, *restrict parts = c->parts;
   struct xpart *restrict xp, *restrict xparts = c->xparts;
-  float dx_max = 0.f, h_max = 0.f;
+  float dx_max = 0.f, dx2_max = 0.f, h_max = 0.f;
   float w;
 
   TIMER_TIC
@@ -709,16 +709,18 @@ void runner_dodrift(struct runner *r, struct cell *c, int timer) {
       /* Predict the values of the extra fields */
       hydro_predict_extra(p, xp, ti_old, ti_current, timeBase);
 
-      /* Compute motion since last cell construction */
-      const float dx =
-          sqrtf((p->x[0] - xp->x_old[0]) * (p->x[0] - xp->x_old[0]) +
-                (p->x[1] - xp->x_old[1]) * (p->x[1] - xp->x_old[1]) +
-                (p->x[2] - xp->x_old[2]) * (p->x[2] - xp->x_old[2]));
-      dx_max = fmaxf(dx_max, dx);
+      /* Compute (square of) motion since last cell construction */
+      const float dx2 = (p->x[0] - xp->x_old[0]) * (p->x[0] - xp->x_old[0]) +
+                        (p->x[1] - xp->x_old[1]) * (p->x[1] - xp->x_old[1]) +
+                        (p->x[2] - xp->x_old[2]) * (p->x[2] - xp->x_old[2]);
+      dx2_max = fmaxf(dx2_max, dx2);
 
       /* Maximal smoothing length */
       h_max = fmaxf(p->h, h_max);
     }
+
+    /* Now, get the maximal particle motion from its square */
+    dx_max = sqrtf(dx2_max);
   }
 
   /* Otherwise, aggregate data from children. */
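
The drift loop above now tracks the maximum squared displacement and takes a
single square root afterwards, which is valid because sqrtf is monotonically
increasing: max_i sqrt(d_i) == sqrt(max_i d_i). A tiny standalone check of
that identity, independent of SWIFT's data structures:

    #include <assert.h>
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      /* Squared displacements of a few hypothetical particles. */
      const float dx2[3] = {0.25f, 1.5f, 0.09f};

      float dx_max = 0.f, dx2_max = 0.f;
      for (int k = 0; k < 3; k++) {
        dx_max = fmaxf(dx_max, sqrtf(dx2[k])); /* old: one sqrt per particle */
        dx2_max = fmaxf(dx2_max, dx2[k]);      /* new: track the square */
      }

      /* One square root at the end gives the same maximum. */
      assert(dx_max == sqrtf(dx2_max));
      printf("dx_max = %f\n", dx_max);
      return 0;
    }
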
diff --git a/src/scheduler.c b/src/scheduler.c
index 386b0e27a234a938c72d1af55b35cdc63597e337..38a1cd8c663307e0c0378d8bec2e0cd3d8f37fa8 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -95,31 +95,29 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta,
 
 void scheduler_splittasks(struct scheduler *s) {
 
-  int j, k, ind, sid, tid = 0, redo;
-  struct cell *ci, *cj;
-  double hi, hj, shift[3];
-  struct task *t, *t_old;
-  int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0},
-                   {-1, -1, 11, 10, 5, 4, 2, 1},
-                   {-1, -1, -1, 12, 7, 6, 4, 3},
-                   {-1, -1, -1, -1, 8, 7, 5, 4},
-                   {-1, -1, -1, -1, -1, 12, 10, 9},
-                   {-1, -1, -1, -1, -1, -1, 11, 10},
-                   {-1, -1, -1, -1, -1, -1, -1, 12}};
-  float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.1897,
-                         0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.5788};
+  const int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0},
+                         {-1, -1, 11, 10, 5, 4, 2, 1},
+                         {-1, -1, -1, 12, 7, 6, 4, 3},
+                         {-1, -1, -1, -1, 8, 7, 5, 4},
+                         {-1, -1, -1, -1, -1, 12, 10, 9},
+                         {-1, -1, -1, -1, -1, -1, 11, 10},
+                         {-1, -1, -1, -1, -1, -1, -1, 12}};
+  const float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788,
+                               0.4025, 0.1897, 0.4025, 0.1897, 0.4025,
+                               0.5788, 0.4025, 0.5788};
 
   /* Loop through the tasks... */
-  redo = 0;
-  t_old = t = NULL;
+  int tid = 0, redo = 0;
+  struct task *t_old = NULL;
   while (1) {
 
     /* Get a pointer on the task. */
+    struct task *t = t_old;
     if (redo) {
       redo = 0;
-      t = t_old;
     } else {
-      if ((ind = atomic_inc(&tid)) < s->nr_tasks)
+      const int ind = atomic_inc(&tid);
+      if (ind < s->nr_tasks)
         t_old = t = &s->tasks[s->tasks_ind[ind]];
       else
         break;
@@ -160,7 +158,7 @@ void scheduler_splittasks(struct scheduler *s) {
     if (t->type == task_type_self) {
 
       /* Get a handle on the cell involved. */
-      ci = t->ci;
+      struct cell *ci = t->ci;
 
       /* Foreign task? */
       if (ci->nodeID != s->nodeID) {
@@ -186,18 +184,18 @@ void scheduler_splittasks(struct scheduler *s) {
           redo = 1;
 
           /* Add the self task. */
-          for (k = 0; ci->progeny[k] == NULL; k++)
-            ;
-          t->ci = ci->progeny[k];
-          for (k += 1; k < 8; k++)
+          int first_child = 0;
+          while (ci->progeny[first_child] == NULL) first_child++;
+          t->ci = ci->progeny[first_child];
+          for (int k = first_child + 1; k < 8; k++)
             if (ci->progeny[k] != NULL)
               scheduler_addtask(s, task_type_self, t->subtype, 0, 0,
                                 ci->progeny[k], NULL, 0);
 
           /* Make a task for each pair of progeny. */
-          for (j = 0; j < 8; j++)
+          for (int j = 0; j < 8; j++)
             if (ci->progeny[j] != NULL)
-              for (k = j + 1; k < 8; k++)
+              for (int k = j + 1; k < 8; k++)
                 if (ci->progeny[k] != NULL)
                   scheduler_addtask(s, task_type_pair, t->subtype, pts[j][k], 0,
                                     ci->progeny[j], ci->progeny[k], 0);
@@ -210,10 +208,10 @@ void scheduler_splittasks(struct scheduler *s) {
     else if (t->type == task_type_pair) {
 
       /* Get a handle on the cells involved. */
-      ci = t->ci;
-      cj = t->cj;
-      hi = ci->dmin;
-      hj = cj->dmin;
+      struct cell *ci = t->ci;
+      struct cell *cj = t->cj;
+      const double hi = ci->dmin;
+      const double hj = cj->dmin;
 
       /* Foreign task? */
       if (ci->nodeID != s->nodeID && cj->nodeID != s->nodeID) {
@@ -223,7 +221,8 @@ void scheduler_splittasks(struct scheduler *s) {
 
       /* Get the sort ID, use space_getsid and not t->flags
          to make sure we get ci and cj swapped if needed. */
-      sid = space_getsid(s->space, &ci, &cj, shift);
+      double shift[3];
+      int sid = space_getsid(s->space, &ci, &cj, shift);
 
       /* Should this task be split-up? */
       if (ci->split && cj->split &&
@@ -479,9 +478,9 @@ void scheduler_splittasks(struct scheduler *s) {
         /* Replace the current task. */
         t->type = task_type_none;
 
-        for (j = 0; j < 8; j++)
+        for (int j = 0; j < 8; j++)
           if (ci->progeny[j] != NULL)
-            for (k = 0; k < 8; k++)
+            for (int k = 0; k < 8; k++)
               if (cj->progeny[k] != NULL) {
                 t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0,
                                       ci->progeny[j], cj->progeny[k], 0);
@@ -520,8 +519,8 @@ void scheduler_splittasks(struct scheduler *s) {
     else if (t->type == task_type_grav_mm) {
 
       /* Get a handle on the cells involved. */
-      ci = t->ci;
-      cj = t->cj;
+      struct cell *ci = t->ci;
+      struct cell *cj = t->cj;
 
       /* Self-interaction? */
       if (cj == NULL) {
@@ -545,7 +544,7 @@ void scheduler_splittasks(struct scheduler *s) {
 
             /* Split this task into tasks on its progeny. */
             t->type = task_type_none;
-            for (j = 0; j < 8; j++)
+            for (int j = 0; j < 8; j++)
               if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) {
                 if (t->type == task_type_none) {
                   t->type = task_type_grav_mm;
@@ -554,7 +553,7 @@ void scheduler_splittasks(struct scheduler *s) {
                 } else
                   t = scheduler_addtask(s, task_type_grav_mm, task_subtype_none,
                                         0, 0, ci->progeny[j], NULL, 0);
-                for (k = j + 1; k < 8; k++)
+                for (int k = j + 1; k < 8; k++)
                   if (ci->progeny[k] != NULL && ci->progeny[k]->gcount > 0) {
                     if (t->type == task_type_none) {
                       t->type = task_type_grav_mm;
@@ -593,7 +592,7 @@ void scheduler_splittasks(struct scheduler *s) {
 
           /* Get the opening angle theta. */
           float dx[3], theta;
-          for (k = 0; k < 3; k++) {
+          for (int k = 0; k < 3; k++) {
             dx[k] = fabs(ci->loc[k] - cj->loc[k]);
             if (s->space->periodic && dx[k] > 0.5 * s->space->dim[k])
               dx[k] = -dx[k] + s->space->dim[k];
@@ -614,9 +613,9 @@ void scheduler_splittasks(struct scheduler *s) {
 
               /* Split this task into tasks on its progeny. */
               t->type = task_type_none;
-              for (j = 0; j < 8; j++)
+              for (int j = 0; j < 8; j++)
                 if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) {
-                  for (k = 0; k < 8; k++)
+                  for (int k = 0; k < 8; k++)
                     if (cj->progeny[k] != NULL && cj->progeny[k]->gcount > 0) {
                       if (t->type == task_type_none) {
                         t->type = task_type_grav_mm;
@@ -662,17 +661,14 @@ struct task *scheduler_addtask(struct scheduler *s, int type, int subtype,
                                int flags, int wait, struct cell *ci,
                                struct cell *cj, int tight) {
 
-  int ind;
-  struct task *t;
-
   /* Get the next free task. */
-  ind = atomic_inc(&s->tasks_next);
+  const int ind = atomic_inc(&s->tasks_next);
 
   /* Overflow? */
   if (ind >= s->size) error("Task list overflow.");
 
   /* Get a pointer to the new task. */
-  t = &s->tasks[ind];
+  struct task *t = &s->tasks[ind];
 
   /* Copy the data. */
   t->type = type;
@@ -767,24 +763,24 @@ void scheduler_set_unlocks(struct scheduler *s) {
 
 void scheduler_ranktasks(struct scheduler *s) {
 
-  int i, j = 0, k, temp, left = 0, rank;
-  struct task *t, *tasks = s->tasks;
-  int *tid = s->tasks_ind, nr_tasks = s->nr_tasks;
+  struct task *tasks = s->tasks;
+  int *tid = s->tasks_ind;
+  const int nr_tasks = s->nr_tasks;
 
   /* Run through the tasks and get all the waits right. */
-  for (i = 0, k = 0; k < nr_tasks; k++) {
+  for (int k = 0; k < nr_tasks; k++) {
     tid[k] = k;
-    for (j = 0; j < tasks[k].nr_unlock_tasks; j++)
+    for (int j = 0; j < tasks[k].nr_unlock_tasks; j++)
       tasks[k].unlock_tasks[j]->wait += 1;
   }
 
   /* Main loop. */
-  for (j = 0, rank = 0; left < nr_tasks; rank++) {
+  for (int j = 0, rank = 0, left = 0; left < nr_tasks; rank++) {
 
     /* Load the tids of tasks with no waits. */
-    for (k = left; k < nr_tasks; k++)
+    for (int k = left; k < nr_tasks; k++)
       if (tasks[tid[k]].wait == 0) {
-        temp = tid[j];
+        int temp = tid[j];
         tid[j] = tid[k];
         tid[k] = temp;
         j += 1;
@@ -794,15 +790,16 @@ void scheduler_ranktasks(struct scheduler *s) {
     if (j == left) error("Unsatisfiable task dependencies detected.");
 
     /* Unlock the next layer of tasks. */
-    for (i = left; i < j; i++) {
-      t = &tasks[tid[i]];
+    for (int i = left; i < j; i++) {
+      struct task *t = &tasks[tid[i]];
       t->rank = rank;
       tid[i] = t - tasks;
       if (tid[i] >= nr_tasks) error("Task index overshoot.");
       /* message( "task %i of type %s has rank %i." , i ,
           (t->type == task_type_self) ? "self" : (t->type == task_type_pair) ?
          "pair" : "sort" , rank ); */
-      for (k = 0; k < t->nr_unlock_tasks; k++) t->unlock_tasks[k]->wait -= 1;
+      for (int k = 0; k < t->nr_unlock_tasks; k++)
+        t->unlock_tasks[k]->wait -= 1;
     }
 
     /* The new left (no, not tony). */
@@ -824,8 +821,6 @@ void scheduler_ranktasks(struct scheduler *s) {
 
 void scheduler_reset(struct scheduler *s, int size) {
 
-  int k;
-
   /* Do we need to re-allocate? */
   if (size > s->size) {
 
@@ -852,7 +847,7 @@ void scheduler_reset(struct scheduler *s, int size) {
   s->nr_unlocks = 0;
 
   /* Set the task pointers in the queues. */
-  for (k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks;
+  for (int k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks;
 }
 
 /**
@@ -863,21 +858,23 @@ void scheduler_reset(struct scheduler *s, int size) {
 
 void scheduler_reweight(struct scheduler *s) {
 
-  int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
-  struct task *t, *tasks = s->tasks;
-  int nodeID = s->nodeID;
-  float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.1897,
-                         0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.5788};
-  float wscale = 0.001;
+  const int nr_tasks = s->nr_tasks;
+  int *tid = s->tasks_ind;
+  struct task *tasks = s->tasks;
+  const int nodeID = s->nodeID;
+  const float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788,
+                               0.4025, 0.1897, 0.4025, 0.1897, 0.4025,
+                               0.5788, 0.4025, 0.5788};
+  const float wscale = 0.001;
   // ticks tic;
 
   /* Run through the tasks backwards and set their waits and
      weights. */
   // tic = getticks();
-  for (k = nr_tasks - 1; k >= 0; k--) {
-    t = &tasks[tid[k]];
+  for (int k = nr_tasks - 1; k >= 0; k--) {
+    struct task *t = &tasks[tid[k]];
     t->weight = 0;
-    for (j = 0; j < t->nr_unlock_tasks; j++)
+    for (int j = 0; j < t->nr_unlock_tasks; j++)
       if (t->unlock_tasks[j]->weight > t->weight)
         t->weight = t->unlock_tasks[j]->weight;
     if (!t->implicit && t->tic > 0)
@@ -958,8 +955,9 @@ void scheduler_reweight(struct scheduler *s) {
 void scheduler_start(struct scheduler *s, unsigned int mask,
                      unsigned int submask) {
 
-  int nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
-  struct task *t, *tasks = s->tasks;
+  const int nr_tasks = s->nr_tasks;
+  int *tid = s->tasks_ind;
+  struct task *tasks = s->tasks;
   // ticks tic;
 
   /* Store the masks */
@@ -985,8 +983,7 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
   const int waiting_old = s->waiting;
 
   /* We are going to use the task structure in a modified way to pass
-     information
-     to the task. Don't do this at home !
+     information to the task. Don't do this at home !
      - ci and cj will give the range of tasks to which the waits will be applied
      - the flags will be used to transfer the mask
      - the rank will be used to transfer the submask
@@ -1011,6 +1008,7 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
 
   /* Wait for the rewait tasks to have executed. */
   pthread_mutex_lock(&s->sleep_mutex);
+  pthread_cond_broadcast(&s->sleep_cond);
   while (s->waiting > waiting_old) {
     pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
   }
@@ -1024,7 +1022,7 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
   /* Loop over the tasks and enqueue whoever is ready. */
   // tic = getticks();
   for (int k = 0; k < s->nr_tasks; k++) {
-    t = &tasks[tid[k]];
+    struct task *t = &tasks[tid[k]];
     if (atomic_dec(&t->wait) == 1 && ((1 << t->type) & s->mask) &&
         ((1 << t->subtype) & s->submask) && !t->skip) {
       scheduler_enqueue(s, t);
@@ -1032,6 +1030,11 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
     }
   }
 
+  /* To be safe, fire off one last sleep_cond broadcast. */
+  pthread_mutex_lock(&s->sleep_mutex);
+  pthread_cond_broadcast(&s->sleep_cond);
+  pthread_mutex_unlock(&s->sleep_mutex);
+
   // message( "enqueueing tasks took %.3f %s." ,
   // clocks_from_ticks( getticks() - tic ), clocks_getunit());
 }
@@ -1045,10 +1048,8 @@ void scheduler_start(struct scheduler *s, unsigned int mask,
 
 void scheduler_enqueue(struct scheduler *s, struct task *t) {
 
+  /* The target queue for this task. */
   int qid = -1;
-#ifdef WITH_MPI
-  int err;
-#endif
 
   /* Fail if this task has already been enqueued before. */
   if (t->rid >= 0) error("Task has already been enqueued.");
@@ -1070,6 +1071,9 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
 
   /* Otherwise, look for a suitable queue. */
   else {
+#ifdef WITH_MPI
+    int err;
+#endif
 
     /* Find the previous owner for each task type, and do
        any pre-processing needed. */
@@ -1092,13 +1096,10 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         break;
       case task_type_recv:
 #ifdef WITH_MPI
-        if ((err = MPI_Irecv(t->ci->parts, t->ci->count, s->part_mpi_type,
-                             t->ci->nodeID, t->flags, MPI_COMM_WORLD,
-                             &t->req)) != MPI_SUCCESS) {
-          char buff[MPI_MAX_ERROR_STRING];
-          int len;
-          MPI_Error_string(err, buff, &len);
-          error("Failed to emit irecv for particle data (%s).", buff);
+        err = MPI_Irecv(t->ci->parts, t->ci->count, part_mpi_type,
+                        t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+        if (err != MPI_SUCCESS) {
+          mpi_error(err, "Failed to emit irecv for particle data.");
         }
         // message( "receiving %i parts with tag=%i from %i to %i." ,
         //     t->ci->count , t->flags , t->ci->nodeID , s->nodeID );
@@ -1110,13 +1111,10 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
         break;
       case task_type_send:
 #ifdef WITH_MPI
-        if ((err = MPI_Isend(t->ci->parts, t->ci->count, s->part_mpi_type,
-                             t->cj->nodeID, t->flags, MPI_COMM_WORLD,
-                             &t->req)) != MPI_SUCCESS) {
-          char buff[MPI_MAX_ERROR_STRING];
-          int len;
-          MPI_Error_string(err, buff, &len);
-          error("Failed to emit isend for particle data (%s).", buff);
+        err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type,
+                        t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req);
+        if (err != MPI_SUCCESS) {
+          mpi_error(err, "Failed to emit isend for particle data.");
         }
         // message( "sending %i parts with tag=%i from %i to %i." ,
         //     t->ci->count , t->flags , s->nodeID , t->cj->nodeID );
@@ -1132,7 +1130,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
 
     if (qid >= s->nr_queues) error("Bad computed qid.");
 
-    /* If no previous owner, find the shortest queue. */
+    /* If no previous owner, pick a random queue. */
     if (qid < 0) qid = rand() % s->nr_queues;
 
     /* Increase the waiting counter. */
@@ -1163,7 +1161,7 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) {
   for (int k = 0; k < t->nr_unlock_tasks; k++) {
     struct task *t2 = t->unlock_tasks[k];
 
-    int res = atomic_dec(&t2->wait);
+    const int res = atomic_dec(&t2->wait);
     if (res < 1) {
       error("Negative wait!");
     } else if (res == 1) {
@@ -1202,7 +1200,7 @@ struct task *scheduler_unlock(struct scheduler *s, struct task *t) {
      they are ready. */
   for (int k = 0; k < t->nr_unlock_tasks; k++) {
     struct task *t2 = t->unlock_tasks[k];
-    int res = atomic_dec(&t2->wait);
+    const int res = atomic_dec(&t2->wait);
     if (res < 1) {
       error("Negative wait!");
     } else if (res == 1) {
@@ -1239,7 +1237,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
                                const struct task *prev) {
 
   struct task *res = NULL;
-  int k, nr_queues = s->nr_queues;
+  const int nr_queues = s->nr_queues;
   unsigned int seed = qid;
 
   /* Check qid. */
@@ -1263,10 +1261,10 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
       /* If unsuccessful, try stealing from the other queues. */
       if (s->flags & scheduler_flag_steal) {
         int count = 0, qids[nr_queues];
-        for (k = 0; k < nr_queues; k++)
+        for (int k = 0; k < nr_queues; k++)
           if (s->queues[k].count > 0) qids[count++] = k;
-        for (k = 0; k < scheduler_maxsteal && count > 0; k++) {
-          int ind = rand_r(&seed) % count;
+        for (int k = 0; k < scheduler_maxsteal && count > 0; k++) {
+          const int ind = rand_r(&seed) % count;
           TIMER_TIC
           res = queue_gettask(&s->queues[qids[ind]], prev, 0);
           TIMER_TOC(timer_qsteal);
@@ -1286,7 +1284,10 @@ struct task *scheduler_gettask(struct scheduler *s, int qid,
     if (res == NULL) {
 #endif
       pthread_mutex_lock(&s->sleep_mutex);
-      if (s->waiting > 0) pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
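+      /* Re-check the queue under the lock to avoid missing a wake-up. */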
+      res = queue_gettask(&s->queues[qid], prev, 1);
+      if (res == NULL && s->waiting > 0) {
+        pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex);
+      }
       pthread_mutex_unlock(&s->sleep_mutex);
     }
   }
@@ -1351,12 +1352,6 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
   s->tasks = NULL;
   s->tasks_ind = NULL;
   scheduler_reset(s, nr_tasks);
-
-/* Construct types for MPI communications */
-#ifdef WITH_MPI
-  part_create_mpi_type(&s->part_mpi_type);
-  xpart_create_mpi_type(&s->xpart_mpi_type);
-#endif
 }
 
 /**
@@ -1365,7 +1360,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
  * @param s The #scheduler
  * @param fileName Name of the file to write to
  */
-void scheduler_print_tasks(struct scheduler *s, char *fileName) {
+void scheduler_print_tasks(const struct scheduler *s, const char *fileName) {
 
   const int nr_tasks = s->nr_tasks, *tid = s->tasks_ind;
   struct task *t, *tasks = s->tasks;
diff --git a/src/scheduler.h b/src/scheduler.h
index 3f2d8c289d0d691d0d155b20ae0522c5830524aa..64c694aea295c13810a20b626055fc6c15eb0af8 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -100,12 +100,6 @@ struct scheduler {
 
   /* The node we are working on. */
   int nodeID;
-
-#ifdef WITH_MPI
-  /* MPI data type for the particle transfers */
-  MPI_Datatype part_mpi_type;
-  MPI_Datatype xpart_mpi_type;
-#endif
 };
 
 /* Function prototypes. */
@@ -128,7 +122,7 @@ struct task *scheduler_unlock(struct scheduler *s, struct task *t);
 void scheduler_addunlock(struct scheduler *s, struct task *ta, struct task *tb);
 void scheduler_set_unlocks(struct scheduler *s);
 void scheduler_dump_queue(struct scheduler *s);
-void scheduler_print_tasks(struct scheduler *s, char *fileName);
+void scheduler_print_tasks(const struct scheduler *s, const char *fileName);
 void scheduler_do_rewait(struct task *t_begin, struct task *t_end,
                          unsigned int mask, unsigned int submask);
 
diff --git a/src/serial_io.c b/src/serial_io.c
index 3fb1a1d1b743e48be4e8e6d81413c28e8034b870..40bd2b1c8921f4acbfa0950984d6915ebd3d241e 100644
--- a/src/serial_io.c
+++ b/src/serial_io.c
@@ -57,14 +57,13 @@
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
  *interest in the parts array
+ * @param partSize The size in bytes of the particle structure.
  * @param importance If COMPULSORY, the data must be present in the IC file. If
  *OPTIONAL, the array will be zeroed when the data is not present.
  *
  * @todo A better version using HDF5 hyper-slabs to read the file directly into
  *the part array
  * will be written once the structures have been stabilized.
- *
- * Calls #error() if an error occurs.
  */
 void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
                       int dim, long long N_total, long long offset,
@@ -172,9 +171,10 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
  * Routines writing an output file
  *-----------------------------------------------------------------------------*/
 
-void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                  enum DATA_TYPE type, long long N_total, int dim,
-                  struct UnitSystem* us, enum UnitConversionFactor convFactor) {
+void prepareArray(hid_t grp, char* fileName, FILE* xmfFile,
+                  char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                  long long N_total, int dim, struct UnitSystem* us,
+                  enum UnitConversionFactor convFactor) {
   hid_t h_data = 0, h_err = 0, h_space = 0, h_prop = 0;
   int rank = 0;
   hsize_t shape[2];
@@ -234,7 +234,7 @@ void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   }
 
   /* Write XMF description for this data set */
-  writeXMFline(xmfFile, fileName, name, N_total, dim, type);
+  writeXMFline(xmfFile, fileName, partTypeGroupName, name, N_total, dim, type);
 
   /* Write unit conversion factors for this data set */
   conversionString(buffer, us, convFactor);
@@ -255,22 +255,23 @@ void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param grp The group in which to write.
  * @param fileName The name of the file in which the data is written
  * @param xmfFile The FILE used to write the XMF description
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param name The name of the array to write.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
  *interest in the parts array
+ * @param partSize The size in bytes of the particle structure.
  * @param us The UnitSystem currently in use
- * @param convFactor The UnitConversionFactor for this array
- *
- *
- * Calls #error() if an error occurs.
+ * @param convFactor The UnitConversionFactor for this array
  */
-void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                       enum DATA_TYPE type, int N, int dim, long long N_total,
-                       int mpi_rank, long long offset, char* part_c,
-                       size_t partSize, struct UnitSystem* us,
+void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
+                       char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                       int N, int dim, long long N_total, int mpi_rank,
+                       long long offset, char* part_c, size_t partSize,
+                       struct UnitSystem* us,
                        enum UnitConversionFactor convFactor) {
 
   hid_t h_data = 0, h_err = 0, h_memspace = 0, h_filespace = 0;
@@ -285,8 +286,8 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
 
   /* Prepare the arrays in the file */
   if (mpi_rank == 0)
-    prepareArray(grp, fileName, xmfFile, name, type, N_total, dim, us,
-                 convFactor);
+    prepareArray(grp, fileName, xmfFile, partTypeGroupName, name, type, N_total,
+                 dim, us, convFactor);
 
   /* Allocate temporary buffer */
   temp = malloc(N * dim * sizeOfType(type));
@@ -370,21 +371,26 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param fileName Unused parameter in non-MPI mode
  * @param xmfFile Unused parameter in non-MPI mode
  * @param name The name of the array to write.
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part A (char*) pointer on the first occurrence of the field of
- *interest
- *in the parts array
+ *interest in the parts array
+ * @param N_total Unused parameter in non-MPI mode
+ * @param mpi_rank Unused parameter in non-MPI mode
+ * @param offset Unused parameter in non-MPI mode
  * @param field The name (code name) of the field to read from.
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, \
-                   mpi_rank, offset, field, us, convFactor)                   \
-  writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, N_total,      \
-                    mpi_rank, offset, (char*)(&(part[0]).field),              \
+#define writeArray(grp, fileName, xmfFile, partTypeGroupName, name, type, N,   \
+                   dim, part, N_total, mpi_rank, offset, field, us,            \
+                   convFactor)                                                 \
+  writeArrayBackEnd(grp, fileName, xmfFile, partTypeGroupName, name, type, N,  \
+                    dim, N_total, mpi_rank, offset, (char*)(&(part[0]).field), \
                     sizeof(part[0]), us, convFactor)
 
 /* Import the right hydro definition */
@@ -609,12 +615,12 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
   FILE* xmfFile = 0;
 
   /* Number of particles of each type */
-  //const size_t Ndm = Ntot - Ngas;
+  // const size_t Ndm = Ntot - Ngas;
 
   /* MATTHIEU: Temporary fix to preserve master */
-  const size_t Ndm = Ntot > 0 ? Ntot - Ngas: 0;
+  const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
   /* MATTHIEU: End temporary fix */
-  
+
   /* File name */
   char fileName[FILENAME_BUFFER_SIZE];
   snprintf(fileName, FILENAME_BUFFER_SIZE, "output_%03i.hdf5", outputCount);
@@ -643,7 +649,7 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
     xmfFile = prepareXMFfile();
 
     /* Write the part corresponding to this specific output */
-    // writeXMFheader(xmfFile, N_total, fileName, e->time);
+    writeXMFoutputheader(xmfFile, fileName, e->time);
 
     /* Open file */
     /* message("Opening file '%s'.", fileName); */
@@ -750,6 +756,11 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
         /* Don't do anything if no particle of this kind */
         if (N_total[ptype] == 0) continue;
 
+        /* Add the global information for that particle type to the XMF
+         * meta-file */
+        if (mpi_rank == 0)
+          writeXMFgroupheader(xmfFile, fileName, N_total[ptype], ptype);
+
         /* Open the particle group in the file */
         char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
         snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
@@ -763,9 +774,9 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
         switch (ptype) {
 
           case GAS:
-            hydro_write_particles(h_grp, fileName, xmfFile, N[ptype],
-                                  N_total[ptype], mpi_rank, offset[ptype],
-                                  parts, us);
+            hydro_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                                  N[ptype], N_total[ptype], mpi_rank,
+                                  offset[ptype], parts, us);
 
             break;
 
@@ -780,9 +791,9 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
             collect_dm_gparts(gparts, Ntot, dmparts, Ndm);
 
             /* Write DM particles */
-            darkmatter_write_particles(h_grp, fileName, xmfFile, N[ptype],
-                                       N_total[ptype], mpi_rank, offset[ptype],
-                                       dmparts, us);
+            darkmatter_write_particles(h_grp, fileName, partTypeGroupName,
+                                       xmfFile, N[ptype], N_total[ptype],
+                                       mpi_rank, offset[ptype], dmparts, us);
 
             /* Free temporary array */
             free(dmparts);
@@ -794,6 +805,9 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
 
         /* Close particle group */
         H5Gclose(h_grp);
+
+        /* Close this particle group in the XMF file as well */
+        if (mpi_rank == 0) writeXMFgroupfooter(xmfFile, ptype);
       }
 
       /* Close file */
@@ -805,7 +819,7 @@ void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank,
   }
 
   /* Write footer of LXMF file descriptor */
-  if (mpi_rank == 0) writeXMFfooter(xmfFile);
+  if (mpi_rank == 0) writeXMFoutputfooter(xmfFile, outputCount, e->time);
 
   /* message("Done writing particles..."); */
   ++outputCount;
diff --git a/src/single_io.c b/src/single_io.c
index 5a48406a79f671e7591bab5529712365906e90a4..801428433ef5170082b68dec425e52f845bb41ae 100644
--- a/src/single_io.c
+++ b/src/single_io.c
@@ -52,16 +52,14 @@
  * @param N The number of particles.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
- * @param partSize The size in bytes of the particle structure.
  *interest in the parts array
+ * @param partSize The size in bytes of the particle structure.
  * @param importance If COMPULSORY, the data must be present in the IC file. If
  *OPTIONAL, the array will be zeroed when the data is not present.
  *
  * @todo A better version using HDF5 hyper-slabs to read the file directly into
  *the part array
  * will be written once the structures have been stabilized.
- *
- * Calls #error() if an error occurs.
  */
 void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
                       int dim, char* part_c, size_t partSize,
@@ -139,12 +137,14 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
  * @param grp The group in which to write.
  * @param fileName The name of the file in which the data is written
  * @param xmfFile The FILE used to write the XMF description
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param name The name of the array to write.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
  * @param part_c A (char*) pointer on the first occurrence of the field of
- *interest in the parts array
+ *interest in the parts array.
  * @param partSize The size in bytes of the particle structure.
  * @param us The UnitSystem currently in use
  * @param convFactor The UnitConversionFactor for this array
@@ -152,12 +152,11 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N,
  * @todo A better version using HDF5 hyper-slabs to write the file directly from
  *the part array
  * will be written once the structures have been stabilized.
- *
- * Calls #error() if an error occurs.
  */
-void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
-                       enum DATA_TYPE type, int N, int dim, char* part_c,
-                       size_t partSize, struct UnitSystem* us,
+void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile,
+                       char* partTypeGroupName, char* name, enum DATA_TYPE type,
+                       int N, int dim, char* part_c, size_t partSize,
+                       struct UnitSystem* us,
                        enum UnitConversionFactor convFactor) {
   hid_t h_data = 0, h_err = 0, h_space = 0, h_prop = 0;
   void* temp = 0;
@@ -239,7 +238,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
   }
 
   /* Write XMF description for this data set */
-  writeXMFline(xmfFile, fileName, name, N, dim, type);
+  writeXMFline(xmfFile, fileName, partTypeGroupName, name, N, dim, type);
 
   /* Write unit conversion factors for this data set */
   conversionString(buffer, us, convFactor);
@@ -283,6 +282,8 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param fileName The name of the file in which the data is written
  * @param xmfFile The FILE used to write the XMF description
  * @param name The name of the array to write.
+ * @param partTypeGroupName The name of the group containing the particles in
+ *the HDF5 file.
  * @param type The #DATA_TYPE of the array.
  * @param N The number of particles to write.
  * @param dim The dimension of the data (1 for scalar, 3 for vector)
@@ -296,10 +297,11 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name,
  * @param convFactor The UnitConversionFactor for this array
  *
  */
-#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, \
-                   mpi_rank, offset, field, us, convFactor)                   \
-  writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim,               \
-                    (char*)(&(part[0]).field), sizeof(part[0]), us,           \
+#define writeArray(grp, fileName, xmfFile, partTypeGroupName, name, type, N,  \
+                   dim, part, N_total, mpi_rank, offset, field, us,           \
+                   convFactor)                                                \
+  writeArrayBackEnd(grp, fileName, xmfFile, partTypeGroupName, name, type, N, \
+                    dim, (char*)(&(part[0]).field), sizeof(part[0]), us,      \
                     convFactor)
 
 /* Import the right hydro definition */
@@ -478,10 +480,10 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
   static int outputCount = 0;
 
   /* Number of particles of each type */
-  //const size_t Ndm = Ntot - Ngas;
+  // const size_t Ndm = Ntot - Ngas;
 
   /* MATTHIEU: Temporary fix to preserve master */
-  const size_t Ndm = Ntot > 0 ? Ntot - Ngas: 0;
+  const size_t Ndm = Ntot > 0 ? Ntot - Ngas : 0;
   /* MATTHIEU: End temporary fix */
 
   long long N_total[NUM_PARTICLE_TYPES] = {Ngas, Ndm, 0};
@@ -498,7 +500,7 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
   xmfFile = prepareXMFfile();
 
   /* Write the part corresponding to this specific output */
-  writeXMFheader(xmfFile, Ngas, fileName, e->time);
+  writeXMFoutputheader(xmfFile, fileName, e->time);
 
   /* Open file */
   /* message("Opening file '%s'.", fileName); */
@@ -571,6 +573,9 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
     /* Don't do anything if no particle of this kind */
     if (numParticles[ptype] == 0) continue;
 
+    /* Add the global information for that particle type to the XMF meta-file */
+    writeXMFgroupheader(xmfFile, fileName, numParticles[ptype], ptype);
+
     /* Open the particle group in the file */
     char partTypeGroupName[PARTICLE_GROUP_BUFFER_SIZE];
     snprintf(partTypeGroupName, PARTICLE_GROUP_BUFFER_SIZE, "/PartType%d",
@@ -587,8 +592,8 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
     switch (ptype) {
 
       case GAS:
-        hydro_write_particles(h_grp, fileName, xmfFile, Ngas, Ngas, 0, 0, parts,
-                              us);
+        hydro_write_particles(h_grp, fileName, partTypeGroupName, xmfFile, Ngas,
+                              Ngas, 0, 0, parts, us);
         break;
 
       case DM:
@@ -602,8 +607,8 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
         collect_dm_gparts(gparts, Ntot, dmparts, Ndm);
 
         /* Write DM particles */
-        darkmatter_write_particles(h_grp, fileName, xmfFile, Ndm, Ndm, 0, 0,
-                                   dmparts, us);
+        darkmatter_write_particles(h_grp, fileName, partTypeGroupName, xmfFile,
+                                   Ndm, Ndm, 0, 0, dmparts, us);
 
         /* Free temporary array */
         free(dmparts);
@@ -615,10 +620,13 @@ void write_output_single(struct engine* e, struct UnitSystem* us) {
 
     /* Close particle group */
     H5Gclose(h_grp);
+
+    /* Close this particle group in the XMF file as well */
+    writeXMFgroupfooter(xmfFile, ptype);
   }
 
   /* Write LXMF file descriptor */
-  writeXMFfooter(xmfFile);
+  writeXMFoutputfooter(xmfFile, outputCount, e->time);
 
   /* message("Done writing particles..."); */
 
diff --git a/src/space.c b/src/space.c
index fa2541c2872c38f5000e78ef2802d9a4f719f9fc..0cce068b5d8a8060b6ca4cde71aeff9dc6b69e8d 100644
--- a/src/space.c
+++ b/src/space.c
@@ -97,12 +97,10 @@ const int sortlistID[27] = {
 int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
                  double *shift) {
 
-  int k, sid = 0, periodic = s->periodic;
-  struct cell *temp;
-  double dx[3];
-
   /* Get the relative distance between the pairs, wrapping. */
-  for (k = 0; k < 3; k++) {
+  const int periodic = s->periodic;
+  double dx[3];
+  for (int k = 0; k < 3; k++) {
     dx[k] = (*cj)->loc[k] - (*ci)->loc[k];
     if (periodic && dx[k] < -s->dim[k] / 2)
       shift[k] = s->dim[k];
@@ -114,15 +112,16 @@ int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
   }
 
   /* Get the sorting index. */
-  for (k = 0; k < 3; k++)
+  int sid = 0;
+  for (int k = 0; k < 3; k++)
     sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1));
 
   /* Switch the cells around? */
   if (runner_flip[sid]) {
-    temp = *ci;
+    struct cell *temp = *ci;
     *ci = *cj;
     *cj = temp;
-    for (k = 0; k < 3; k++) shift[k] = -shift[k];
+    for (int k = 0; k < 3; k++) shift[k] = -shift[k];
   }
   sid = sortlistID[sid];
 
@@ -137,10 +136,8 @@ int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
 
 void space_rebuild_recycle(struct space *s, struct cell *c) {
 
-  int k;
-
   if (c->split)
-    for (k = 0; k < 8; k++)
+    for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL) {
         space_rebuild_recycle(s, c->progeny[k]);
         space_recycle(s, c->progeny[k]);
@@ -158,19 +155,19 @@ void space_rebuild_recycle(struct space *s, struct cell *c) {
 
 void space_regrid(struct space *s, double cell_max, int verbose) {
 
-  float h_max = s->cell_min / kernel_gamma / space_stretch, dmin;
-  int i, j, k, cdim[3], nr_parts = s->nr_parts;
+  float h_max = s->cell_min / kernel_gamma / space_stretch;
+  const size_t nr_parts = s->nr_parts;
   struct cell *restrict c;
   ticks tic = getticks();
 
   /* Run through the parts and get the current h_max. */
   // tic = getticks();
   if (s->cells != NULL) {
-    for (k = 0; k < s->nr_cells; k++) {
+    for (int k = 0; k < s->nr_cells; k++) {
       if (s->cells[k].h_max > h_max) h_max = s->cells[k].h_max;
     }
   } else {
-    for (k = 0; k < nr_parts; k++) {
+    for (int k = 0; k < nr_parts; k++) {
       if (s->parts[k].h > h_max) h_max = s->parts[k].h;
     }
     s->h_max = h_max;
@@ -190,7 +187,8 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
   if (verbose) message("h_max is %.3e (cell_max=%.3e).", h_max, cell_max);
 
   /* Get the new putative cell dimensions. */
-  for (k = 0; k < 3; k++)
+  int cdim[3];
+  for (int k = 0; k < 3; k++)
     cdim[k] =
         floor(s->dim[k] / fmax(h_max * kernel_gamma * space_stretch, cell_max));
 
@@ -213,7 +211,7 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
 
     /* Free the old cells, if they were allocated. */
     if (s->cells != NULL) {
-      for (k = 0; k < s->nr_cells; k++) {
+      for (int k = 0; k < s->nr_cells; k++) {
         space_rebuild_recycle(s, &s->cells[k]);
         if (s->cells[k].sort != NULL) free(s->cells[k].sort);
       }
@@ -222,12 +220,12 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
     }
 
     /* Set the new cell dimensions only if smaller. */
-    for (k = 0; k < 3; k++) {
+    for (int k = 0; k < 3; k++) {
       s->cdim[k] = cdim[k];
       s->h[k] = s->dim[k] / cdim[k];
       s->ih[k] = 1.0 / s->h[k];
     }
-    dmin = fminf(s->h[0], fminf(s->h[1], s->h[2]));
+    const float dmin = fminf(s->h[0], fminf(s->h[1], s->h[2]));
 
     /* Allocate the highest level of cells. */
     s->tot_cells = s->nr_cells = cdim[0] * cdim[1] * cdim[2];
@@ -235,13 +233,13 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
                        s->nr_cells * sizeof(struct cell)) != 0)
       error("Failed to allocate cells.");
     bzero(s->cells, s->nr_cells * sizeof(struct cell));
-    for (k = 0; k < s->nr_cells; k++)
+    for (int k = 0; k < s->nr_cells; k++)
       if (lock_init(&s->cells[k].lock) != 0) error("Failed to init spinlock.");
 
     /* Set the cell location and sizes. */
-    for (i = 0; i < cdim[0]; i++)
-      for (j = 0; j < cdim[1]; j++)
-        for (k = 0; k < cdim[2]; k++) {
+    for (int i = 0; i < cdim[0]; i++)
+      for (int j = 0; j < cdim[1]; j++)
+        for (int k = 0; k < cdim[2]; k++) {
           c = &s->cells[cell_getid(cdim, i, j, k)];
           c->loc[0] = i * s->h[0];
           c->loc[1] = j * s->h[1];
@@ -271,7 +269,7 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
   else {
 
     /* Free the old cells, if they were allocated. */
-    for (k = 0; k < s->nr_cells; k++) {
+    for (int k = 0; k < s->nr_cells; k++) {
       space_rebuild_recycle(s, &s->cells[k]);
       s->cells[k].sorts = NULL;
       s->cells[k].nr_tasks = 0;
@@ -308,7 +306,7 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
 
 void space_rebuild(struct space *s, double cell_max, int verbose) {
 
-  ticks tic = getticks();
+  const ticks tic = getticks();
 
   /* Be verbose about this. */
   // message( "re)building space..." ); fflush(stdout);
@@ -320,23 +318,15 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   int nr_gparts = s->nr_gparts;
   struct cell *restrict cells = s->cells;
 
-  double ih[3], dim[3];
-  int cdim[3];
-  ih[0] = s->ih[0];
-  ih[1] = s->ih[1];
-  ih[2] = s->ih[2];
-  dim[0] = s->dim[0];
-  dim[1] = s->dim[1];
-  dim[2] = s->dim[2];
-  cdim[0] = s->cdim[0];
-  cdim[1] = s->cdim[1];
-  cdim[2] = s->cdim[2];
+  const double ih[3] = {s->ih[0], s->ih[1], s->ih[2]};
+  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
+  const int cdim[3] = {s->cdim[0], s->cdim[1], s->cdim[2]};
 
   /* Run through the particles and get their cell index. */
   // tic = getticks();
   const size_t ind_size = s->size_parts;
-  size_t *ind;
-  if ((ind = (size_t *)malloc(sizeof(size_t) * ind_size)) == NULL)
+  int *ind;
+  if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL)
     error("Failed to allocate temporary particle indices.");
   for (int k = 0; k < nr_parts; k++) {
     struct part *restrict p = &s->parts[k];
@@ -349,37 +339,91 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
     cells[ind[k]].count++;
   }
+  // message( "getting particle indices took %.3f %s." ,
+  // clocks_from_ticks(getticks() - tic), clocks_getunit());
+
+  /* Run through the gravity particles and get their cell index. */
+  // tic = getticks();
+  const size_t gind_size = s->size_gparts;
+  int *gind;
+  if ((gind = (int *)malloc(sizeof(int) * gind_size)) == NULL)
+    error("Failed to allocate temporary g-particle indices.");
+  for (int k = 0; k < nr_gparts; k++) {
+    struct gpart *restrict gp = &s->gparts[k];
+    for (int j = 0; j < 3; j++)
+      if (gp->x[j] < 0.0)
+        gp->x[j] += dim[j];
+      else if (gp->x[j] >= dim[j])
+        gp->x[j] -= dim[j];
+    gind[k] =
+        cell_getid(cdim, gp->x[0] * ih[0], gp->x[1] * ih[1], gp->x[2] * ih[2]);
+    cells[gind[k]].gcount++;
+  }
 // message( "getting particle indices took %.3f %s." ,
-// clocks_from_ticks(getticks() - tic), clocks_getunit()):
+// clocks_from_ticks(getticks() - tic), clocks_getunit());
 
 #ifdef WITH_MPI
   /* Move non-local parts to the end of the list. */
-  const int nodeID = s->e->nodeID;
+  const int local_nodeID = s->e->nodeID;
   for (int k = 0; k < nr_parts; k++)
-    if (cells[ind[k]].nodeID != nodeID) {
+    if (cells[ind[k]].nodeID != local_nodeID) {
       cells[ind[k]].count -= 1;
       nr_parts -= 1;
-      struct part tp = s->parts[k];
+      const struct part tp = s->parts[k];
       s->parts[k] = s->parts[nr_parts];
       s->parts[nr_parts] = tp;
-      struct xpart txp = s->xparts[k];
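+      /* Keep the part<->gpart back-pointers consistent after the swap. */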
+      if (s->parts[k].gpart != NULL) {
+        s->parts[k].gpart->part = &s->parts[k];
+      }
+      if (s->parts[nr_parts].gpart != NULL) {
+        s->parts[nr_parts].gpart->part = &s->parts[nr_parts];
+      }
+      const struct xpart txp = s->xparts[k];
       s->xparts[k] = s->xparts[nr_parts];
       s->xparts[nr_parts] = txp;
-      int t = ind[k];
+      const int t = ind[k];
       ind[k] = ind[nr_parts];
       ind[nr_parts] = t;
     }
 
+  /* Move non-local gparts to the end of the list. */
+  for (int k = 0; k < nr_gparts; k++)
+    if (cells[gind[k]].nodeID != local_nodeID) {
+      cells[gind[k]].gcount -= 1;
+      nr_gparts -= 1;
+      const struct gpart tp = s->gparts[k];
+      s->gparts[k] = s->gparts[nr_gparts];
+      s->gparts[nr_gparts] = tp;
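+      /* Update the back-pointer of any gas part linked to a swapped gpart. */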
+      if (s->gparts[k].id > 0) {
+        s->gparts[k].part->gpart = &s->gparts[k];
+      }
+      if (s->gparts[nr_gparts].id > 0) {
+        s->gparts[nr_gparts].part->gpart = &s->gparts[nr_gparts];
+      }
+      const int t = gind[k];
+      gind[k] = gind[nr_gparts];
+      gind[nr_gparts] = t;
+    }
+
   /* Exchange the strays, note that this potentially re-allocates
      the parts arrays. */
-  s->nr_parts =
-      nr_parts + engine_exchange_strays(s->e, nr_parts, &ind[nr_parts],
-                                        s->nr_parts - nr_parts);
+  /* TODO: This function also exchanges gparts, but the gpart exchange is
+     disabled until gparts are fully implemented. */
+  size_t nr_parts_exchanged = s->nr_parts - nr_parts;
+  size_t nr_gparts_exchanged = s->nr_gparts - nr_gparts;
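+  /* On return, these counts hold the numbers of parts/gparts received. */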
+  engine_exchange_strays(s->e, nr_parts, &ind[nr_parts], &nr_parts_exchanged,
+                         nr_gparts, &gind[nr_gparts], &nr_gparts_exchanged);
+
+  /* Add post-processing, i.e. re-linking/creating of gparts here. */
+
+  /* Set the new particle counts. */
+  s->nr_parts = nr_parts + nr_parts_exchanged;
+  s->nr_gparts = nr_gparts + nr_gparts_exchanged;
 
   /* Re-allocate the index array if needed.. */
   if (s->nr_parts > ind_size) {
-    size_t *ind_new;
-    if ((ind_new = (size_t *)malloc(sizeof(size_t) * s->nr_parts)) == NULL)
+    int *ind_new;
+    if ((ind_new = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
       error("Failed to allocate temporary particle indices.");
-    memcpy(ind_new, ind, sizeof(size_t) * nr_parts);
+    memcpy(ind_new, ind, sizeof(int) * nr_parts);
     free(ind);
@@ -388,7 +432,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
 
   /* Assign each particle to its cell. */
   for (int k = nr_parts; k < s->nr_parts; k++) {
-    struct part *p = &s->parts[k];
+    const struct part *const p = &s->parts[k];
     ind[k] =
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
     cells[ind[k]].count += 1;
@@ -419,65 +463,24 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
   /* We no longer need the indices as of here. */
   free(ind);
 
-  /* Run through the gravity particles and get their cell index. */
-  // tic = getticks();
-  const size_t gind_size = s->size_gparts;
-  size_t *gind;
-  if ((gind = (size_t *)malloc(sizeof(size_t) * gind_size)) == NULL)
-    error("Failed to allocate temporary g-particle indices.");
-  for (int k = 0; k < nr_gparts; k++) {
-    struct gpart *gp = &s->gparts[k];
-    for (int j = 0; j < 3; j++)
-      if (gp->x[j] < 0.0)
-        gp->x[j] += dim[j];
-      else if (gp->x[j] >= dim[j])
-        gp->x[j] -= dim[j];
-    gind[k] =
-        cell_getid(cdim, gp->x[0] * ih[0], gp->x[1] * ih[1], gp->x[2] * ih[2]);
-    cells[gind[k]].gcount++;
-  }
-// message( "getting particle indices took %.3f %s." ,
-// clocks_from_ticks(getticks() - tic), clocks_getunit());
-
 #ifdef WITH_MPI
 
-  /* Move non-local gparts to the end of the list. */
-  for (int k = 0; k < nr_gparts; k++)
-    if (cells[ind[k]].nodeID != nodeID) {
-      cells[ind[k]].gcount -= 1;
-      nr_gparts -= 1;
-      struct gpart tp = s->gparts[k];
-      s->gparts[k] = s->gparts[nr_gparts];
-      s->gparts[nr_gparts] = tp;
-      int t = ind[k];
-      ind[k] = ind[nr_gparts];
-      ind[nr_gparts] = t;
-    }
-
-  /* Exchange the strays, note that this potentially re-allocates
-     the parts arrays. */
-  // s->nr_gparts =
-  //    nr_gparts + engine_exchange_strays(s->e, nr_gparts, &ind[nr_gparts],
-  //                                        s->nr_gparts - nr_gparts);
-  if (nr_gparts > 0)
-    error("Need to implement the exchange of strays for the gparts");
-
   /* Re-allocate the index array if needed.. */
   if (s->nr_gparts > gind_size) {
-    size_t *gind_new;
-    if ((gind_new = (size_t *)malloc(sizeof(size_t) * s->nr_gparts)) == NULL)
+    int *gind_new;
+    if ((gind_new = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
       error("Failed to allocate temporary g-particle indices.");
-    memcpy(gind_new, gind, sizeof(size_t) * nr_gparts);
+    memcpy(gind_new, gind, sizeof(int) * nr_gparts);
     free(gind);
     gind = gind_new;
   }
 
   /* Assign each particle to its cell. */
   for (int k = nr_gparts; k < s->nr_gparts; k++) {
-    struct gpart *p = &s->gparts[k];
+    const struct gpart *const p = &s->gparts[k];
     gind[k] =
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
-    cells[gind[k]].count += 1;
+    cells[gind[k]].gcount += 1;
     /* if ( cells[ ind[k] ].nodeID != nodeID )
         error( "Received part that does not belong to me (nodeID=%i)." , cells[
        ind[k] ].nodeID ); */
@@ -531,7 +534,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
  */
 void space_split(struct space *s, struct cell *cells, int verbose) {
 
-  ticks tic = getticks();
+  const ticks tic = getticks();
 
   for (int k = 0; k < s->nr_cells; k++)
     scheduler_addtask(&s->e->sched, task_type_split_cell, task_subtype_none, k,
@@ -555,7 +558,7 @@ void space_split(struct space *s, struct cell *cells, int verbose) {
  * @param verbose Are we talkative ?
  */
 
-void space_parts_sort(struct space *s, size_t *ind, size_t N, int min, int max,
+void space_parts_sort(struct space *s, int *ind, size_t N, int min, int max,
                       int verbose) {
 
   ticks tic = getticks();
@@ -603,7 +606,7 @@ void space_parts_sort(struct space *s, size_t *ind, size_t N, int min, int max,
 void space_do_parts_sort() {
 
   /* Pointers to the sorting data. */
-  size_t *ind = space_sort_struct.ind;
+  int *ind = space_sort_struct.ind;
   struct part *parts = space_sort_struct.parts;
   struct xpart *xparts = space_sort_struct.xparts;
 
@@ -725,7 +728,7 @@ void space_do_parts_sort() {
   } /* main loop. */
 }
 
-void space_gparts_sort(struct gpart *gparts, size_t *ind, size_t N, int min,
+void space_gparts_sort(struct gpart *gparts, int *ind, size_t N, int min,
                        int max) {
 
   struct qstack {
diff --git a/src/space.h b/src/space.h
index 20f86562010f59f487c9783e83d540c196ddee83..e761595838ae78b0d8a67cca676cfa59f3f700f6 100644
--- a/src/space.h
+++ b/src/space.h
@@ -103,6 +103,8 @@ struct space {
   /* Buffers for parts that we will receive from foreign cells. */
   struct part *parts_foreign;
   size_t nr_parts_foreign, size_parts_foreign;
+  struct gpart *gparts_foreign;
+  size_t nr_gparts_foreign, size_gparts_foreign;
 };
 
 /* Interval stack necessary for parallel particle sorting. */
@@ -114,7 +116,7 @@ struct qstack {
 struct parallel_sort {
   struct part *parts;
   struct xpart *xparts;
-  size_t *ind;
+  int *ind;
   struct qstack *stack;
   unsigned int stack_size;
   volatile unsigned int first, last, waiting;
@@ -122,9 +124,9 @@ struct parallel_sort {
 extern struct parallel_sort space_sort_struct;
 
 /* function prototypes. */
-void space_parts_sort(struct space *s, size_t *ind, size_t N, int min, int max,
+void space_parts_sort(struct space *s, int *ind, size_t N, int min, int max,
                       int verbose);
-void space_gparts_sort(struct gpart *gparts, size_t *ind, size_t N, int min,
+void space_gparts_sort(struct gpart *gparts, int *ind, size_t N, int min,
                        int max);
 struct cell *space_getcell(struct space *s);
 int space_getsid(struct space *s, struct cell **ci, struct cell **cj,