diff --git a/examples/process_cells b/examples/process_cells
index b57ed9e73c76807707d158485e31a5b643d8dec0..a18148f0890285d1aebd72e5f13f6c327b202619 100755
--- a/examples/process_cells
+++ b/examples/process_cells
@@ -38,7 +38,7 @@ ranks=$(ls -v cells_*.dat | sed 's,cells_\(.*\)_.*.dat,\1,' | sort | uniq | wc -
 echo "Number of ranks = $ranks"
 
 #  Now construct a list of files ordered by rank, not step.
-files=$(ls cells_*.dat | sort -t "_" -k 3,3 -n | xargs -n 4)
+files=$(ls cells_*.dat | sort -t "_" -k 3,3 -n | xargs -n $ranks)
 
 #  Need number of steps.
 nfiles=$(echo $files| wc -w)
@@ -48,10 +48,10 @@ echo "Number of steps = $steps"
 
 #  And process them,
 echo "Processing cell dumps files..."
-echo $files | xargs -P $NPROCS -n 4 /bin/bash -c "${SCRIPTHOME}/process_cells_helper $NX $NY $NZ \$0 \$1 \$2 \$3"
+echo $files | xargs -P $NPROCS -n $ranks ${SCRIPTHOME}/process_cells_helper $NX $NY $NZ
 
 #  Create summary.
-grep "top cells" step*-active-cells.dat | sort -h > active_cells.log
+grep "top cells" step*-active-cells.dat | sort -n -k 1.5 > active_cells.log
 
 #  And plot of active cells to edge cells.
 stilts plot2plane ifmt=ascii in=active_cells.log xmin=-0.1 xmax=1.1 ymin=0 ymax=$steps grid=1 \
diff --git a/src/cell.c b/src/cell.c
index a1f57edcca3ff1fb42a93cdf6e222d48a00797ac..118cf656cf7defd6f987b666ea8f65571c023d9a 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -835,11 +835,15 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset,
           memswap(&parts[j], &part, sizeof(struct part));
           memswap(&xparts[j], &xpart, sizeof(struct xpart));
           memswap(&buff[j], &temp_buff, sizeof(struct cell_buff));
+          if (parts[j].gpart)
+            parts[j].gpart->id_or_neg_offset = -(j + parts_offset);
           bid = temp_buff.ind;
         }
         parts[k] = part;
         xparts[k] = xpart;
         buff[k] = temp_buff;
+        if (parts[k].gpart)
+          parts[k].gpart->id_or_neg_offset = -(k + parts_offset);
       }
       bucket_count[bid]++;
     }
@@ -852,10 +856,6 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset,
     c->progeny[k]->xparts = &c->xparts[bucket_offset[k]];
   }
 
-  /* Re-link the gparts. */
-  if (count > 0 && gcount > 0)
-    part_relink_gparts_to_parts(parts, count, parts_offset);
-
 #ifdef SWIFT_DEBUG_CHECKS
   /* Check that the buffs are OK. */
   for (int k = 1; k < count; k++) {
@@ -952,10 +952,14 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset,
           }
           memswap(&sparts[j], &spart, sizeof(struct spart));
           memswap(&sbuff[j], &temp_buff, sizeof(struct cell_buff));
+          if (sparts[j].gpart)
+            sparts[j].gpart->id_or_neg_offset = -(j + sparts_offset);
           bid = temp_buff.ind;
         }
         sparts[k] = spart;
         sbuff[k] = temp_buff;
+        if (sparts[k].gpart)
+          sparts[k].gpart->id_or_neg_offset = -(k + sparts_offset);
       }
       bucket_count[bid]++;
     }
@@ -967,10 +971,6 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset,
     c->progeny[k]->sparts = &c->sparts[bucket_offset[k]];
   }
 
-  /* Re-link the gparts. */
-  if (scount > 0 && gcount > 0)
-    part_relink_gparts_to_sparts(sparts, scount, sparts_offset);
-
   /* Finally, do the same song and dance for the gparts. */
   for (int k = 0; k < 8; k++) bucket_count[k] = 0;
 
@@ -1005,10 +1005,23 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset,
           }
           memswap(&gparts[j], &gpart, sizeof(struct gpart));
           memswap(&gbuff[j], &temp_buff, sizeof(struct cell_buff));
+          if (gparts[j].type == swift_type_gas) {
+            parts[-gparts[j].id_or_neg_offset - parts_offset].gpart =
+                &gparts[j];
+          } else if (gparts[j].type == swift_type_star) {
+            sparts[-gparts[j].id_or_neg_offset - sparts_offset].gpart =
+                &gparts[j];
+          }
           bid = temp_buff.ind;
         }
         gparts[k] = gpart;
         gbuff[k] = temp_buff;
+        if (gparts[k].type == swift_type_gas) {
+          parts[-gparts[k].id_or_neg_offset - parts_offset].gpart = &gparts[k];
+        } else if (gparts[k].type == swift_type_star) {
+          sparts[-gparts[k].id_or_neg_offset - sparts_offset].gpart =
+              &gparts[k];
+        }
       }
       bucket_count[bid]++;
     }
@@ -1019,14 +1032,6 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset,
     c->progeny[k]->gcount = bucket_count[k];
     c->progeny[k]->gparts = &c->gparts[bucket_offset[k]];
   }
-
-  /* Re-link the parts. */
-  if (count > 0 && gcount > 0)
-    part_relink_parts_to_gparts(gparts, gcount, parts - parts_offset);
-
-  /* Re-link the sparts. */
-  if (scount > 0 && gcount > 0)
-    part_relink_sparts_to_gparts(gparts, gcount, sparts - sparts_offset);
 }
 
 /**
diff --git a/src/debug.c b/src/debug.c
index b1e2cb08bc7fa99330da3d9c9382dbef81b3215a..25301ea5caefe69b7daef2c048b2e4b68d04c289 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -326,8 +326,7 @@ static void dumpCells_map(struct cell *c, void *data) {
 
     /* So output local super cells that are active and have MPI
      * tasks as requested. */
-    if (c->nodeID == e->nodeID && (!super || (super && c->super == c)) &&
-        active && mpiactive) {
+    if (c->nodeID == e->nodeID && (!super ||((super && c->super == c) || (c->parent == NULL))) && active && mpiactive) {
 
       /* If requested we work out how many particles are active in this cell. */
       int pactcount = 0;
diff --git a/src/engine.c b/src/engine.c
index d74a44b5cf041ea9c4f400e077bcfd057f45e0b7..f5d8cca878de2088dc1a7be322d2a64fd7e0122a 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -565,7 +565,8 @@ void engine_redistribute(struct engine *e) {
 
   /* Sort the particles according to their cell index. */
   if (s->nr_parts > 0)
-    space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1, e->verbose);
+    space_parts_sort(s->parts, s->xparts, dest, &counts[nodeID * nr_nodes],
+                     nr_nodes, 0);
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the part have been sorted correctly. */
@@ -656,7 +657,8 @@ void engine_redistribute(struct engine *e) {
 
   /* Sort the particles according to their cell index. */
   if (s->nr_sparts > 0)
-    space_sparts_sort(s, s_dest, s->nr_sparts, 0, nr_nodes - 1, e->verbose);
+    space_sparts_sort(s->sparts, s_dest, &s_counts[nodeID * nr_nodes], nr_nodes,
+                      0);
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the spart have been sorted correctly. */
@@ -748,7 +750,8 @@ void engine_redistribute(struct engine *e) {
 
   /* Sort the gparticles according to their cell index. */
   if (s->nr_gparts > 0)
-    space_gparts_sort(s, g_dest, s->nr_gparts, 0, nr_nodes - 1, e->verbose);
+    space_gparts_sort(s->gparts, s->parts, s->sparts, g_dest,
+                      &g_counts[nodeID * nr_nodes], nr_nodes);
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the gpart have been sorted correctly. */
@@ -3678,6 +3681,11 @@ void engine_rebuild(struct engine *e, int clean_smoothing_length_values) {
   /* Re-build the space. */
   space_rebuild(e->s, e->verbose);
 
+#ifdef SWIFT_DEBUG_CHECKS
+  part_verify_links(e->s->parts, e->s->gparts, e->s->sparts, e->s->nr_parts,
+                    e->s->nr_gparts, e->s->nr_sparts, e->verbose);
+#endif
+
   /* Initial cleaning up session ? */
   if (clean_smoothing_length_values) space_sanitize(e->s);
 
@@ -4151,7 +4159,7 @@ void engine_first_init_particles(struct engine *e) {
 
   const ticks tic = getticks();
 
-  /* Set the particles in a state where they are ready for a run */
+  /* Set the particles in a state where they are ready for a run. */
   space_first_init_parts(e->s, e->chemistry, e->cooling_func);
   space_first_init_gparts(e->s, e->gravity_properties);
   space_first_init_sparts(e->s);
diff --git a/src/part.c b/src/part.c
index c43ffa4820504282b4fb02e63b5cfc4196f3be77..1b696a8cbc135fd2c128b5ad705a0e6e24a2d5c8 100644
--- a/src/part.c
+++ b/src/part.c
@@ -94,6 +94,26 @@ void part_relink_sparts_to_gparts(struct gpart *gparts, size_t N,
   }
 }
 
+/**
+ * @brief Re-link both the #part%s and #spart%s associated with the list of
+ * #gpart%s.
+ *
+ * @param gparts The list of #gpart.
+ * @param N The number of particles to re-link;
+ * @param parts The global #part array in which to find the #gpart offsets.
+ * @param sparts The global #spart array in which to find the #gpart offsets.
+ */
+void part_relink_all_parts_to_gparts(struct gpart *gparts, size_t N,
+                                     struct part *parts, struct spart *sparts) {
+  for (size_t k = 0; k < N; k++) {
+    if (gparts[k].type == swift_type_gas) {
+      parts[-gparts[k].id_or_neg_offset].gpart = &gparts[k];
+    } else if (gparts[k].type == swift_type_star) {
+      sparts[-gparts[k].id_or_neg_offset].gpart = &gparts[k];
+    }
+  }
+}
+
 /**
  * @brief Verifies that the #gpart, #part and #spart are correctly linked
  * together
@@ -128,19 +148,19 @@ void part_verify_links(struct part *parts, struct gpart *gparts,
 
       /* Check that it is linked */
       if (gparts[k].id_or_neg_offset > 0)
-        error("Gas gpart not linked to anything !");
+        error("Gas gpart not linked to anything!");
 
       /* Find its link */
       const struct part *part = &parts[-gparts[k].id_or_neg_offset];
 
       /* Check the reverse link */
-      if (part->gpart != &gparts[k]) error("Linking problem !");
+      if (part->gpart != &gparts[k]) error("Linking problem!");
 
       /* Check that the particles are at the same place */
       if (gparts[k].x[0] != part->x[0] || gparts[k].x[1] != part->x[1] ||
           gparts[k].x[2] != part->x[2])
         error(
-            "Linked particles are not at the same position !\n"
+            "Linked particles are not at the same position!\n"
             "gp->x=[%e %e %e] p->x=[%e %e %e] diff=[%e %e %e]",
             gparts[k].x[0], gparts[k].x[1], gparts[k].x[2], part->x[0],
             part->x[1], part->x[2], gparts[k].x[0] - part->x[0],
diff --git a/src/part.h b/src/part.h
index 1b40aee0db3deb4790e07e3da9807060900d0c55..3e9ab5180e7cbfb4622b4a05d396a7f785f08b0b 100644
--- a/src/part.h
+++ b/src/part.h
@@ -80,6 +80,8 @@ void part_relink_parts_to_gparts(struct gpart *gparts, size_t N,
                                  struct part *parts);
 void part_relink_sparts_to_gparts(struct gpart *gparts, size_t N,
                                   struct spart *sparts);
+void part_relink_all_parts_to_gparts(struct gpart *gparts, size_t N,
+                                     struct part *parts, struct spart *sparts);
 void part_verify_links(struct part *parts, struct gpart *gparts,
                        struct spart *sparts, size_t nr_parts, size_t nr_gparts,
                        size_t nr_sparts, int verbose);
diff --git a/src/space.c b/src/space.c
index b1d25fb85fd8d412d76a6900d136966b58a44552..b6aef6ea7db1b7bfe0ecbe6a39157f9be5994b50 100644
--- a/src/space.c
+++ b/src/space.c
@@ -101,6 +101,7 @@ struct index_data {
   struct space *s;
   struct cell *cells;
   int *ind;
+  int *cell_counts;
 };
 
 /**
@@ -580,26 +581,33 @@ void space_rebuild(struct space *s, int verbose) {
      an index that is larger than the number of particles to avoid
      re-allocating after shuffling. */
   const size_t ind_size = s->size_parts + 100;
-  int *ind;
-  if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL)
-    error("Failed to allocate temporary particle indices.");
-  if (s->size_parts > 0) space_parts_get_cell_index(s, ind, cells_top, verbose);
+  int *ind = (int *)malloc(sizeof(int) * ind_size);
+  if (ind == NULL) error("Failed to allocate temporary particle indices.");
+  int *cell_part_counts = (int *)calloc(sizeof(int), s->nr_cells);
+  if (cell_part_counts == NULL)
+    error("Failed to allocate cell part count buffer.");
+  if (s->size_parts > 0)
+    space_parts_get_cell_index(s, ind, cell_part_counts, cells_top, verbose);
 
   /* Run through the gravity particles and get their cell index. */
   const size_t gind_size = s->size_gparts + 100;
-  int *gind;
-  if ((gind = (int *)malloc(sizeof(int) * gind_size)) == NULL)
-    error("Failed to allocate temporary g-particle indices.");
+  int *gind = (int *)malloc(sizeof(int) * gind_size);
+  if (gind == NULL) error("Failed to allocate temporary g-particle indices.");
+  int *cell_gpart_counts = (int *)calloc(sizeof(int), s->nr_cells);
+  if (cell_gpart_counts == NULL)
+    error("Failed to allocate cell gpart count buffer.");
   if (s->size_gparts > 0)
-    space_gparts_get_cell_index(s, gind, cells_top, verbose);
+    space_gparts_get_cell_index(s, gind, cell_gpart_counts, cells_top, verbose);
 
   /* Run through the star particles and get their cell index. */
   const size_t sind_size = s->size_sparts + 100;
-  int *sind;
-  if ((sind = (int *)malloc(sizeof(int) * sind_size)) == NULL)
-    error("Failed to allocate temporary s-particle indices.");
+  int *sind = (int *)malloc(sizeof(int) * sind_size);
+  if (sind == NULL) error("Failed to allocate temporary s-particle indices.");
+  int *cell_spart_counts = (int *)calloc(sizeof(int), s->nr_cells);
+  if (cell_spart_counts == NULL)
+    error("Failed to allocate cell gpart count buffer.");
   if (s->size_sparts > 0)
-    space_sparts_get_cell_index(s, sind, cells_top, verbose);
+    space_sparts_get_cell_index(s, sind, cell_spart_counts, cells_top, verbose);
 
 #ifdef WITH_MPI
   const int local_nodeID = s->e->nodeID;
@@ -609,9 +617,7 @@ void space_rebuild(struct space *s, int verbose) {
     if (cells_top[ind[k]].nodeID != local_nodeID) {
       nr_parts -= 1;
       /* Swap the particle */
-      const struct part tp = s->parts[k];
-      s->parts[k] = s->parts[nr_parts];
-      s->parts[nr_parts] = tp;
+      memswap(&s->parts[k], &s->parts[nr_parts], sizeof(struct part));
       /* Swap the link with the gpart */
       if (s->parts[k].gpart != NULL) {
         s->parts[k].gpart->id_or_neg_offset = -k;
@@ -620,13 +626,9 @@ void space_rebuild(struct space *s, int verbose) {
         s->parts[nr_parts].gpart->id_or_neg_offset = -nr_parts;
       }
       /* Swap the xpart */
-      const struct xpart txp = s->xparts[k];
-      s->xparts[k] = s->xparts[nr_parts];
-      s->xparts[nr_parts] = txp;
+      memswap(&s->xparts[k], &s->xparts[nr_parts], sizeof(struct xpart));
       /* Swap the index */
-      const int t = ind[k];
-      ind[k] = ind[nr_parts];
-      ind[nr_parts] = t;
+      memswap(&ind[k], &ind[nr_parts], sizeof(int));
     } else {
       /* Increment when not exchanging otherwise we need to retest "k".*/
       k++;
@@ -652,9 +654,7 @@ void space_rebuild(struct space *s, int verbose) {
     if (cells_top[sind[k]].nodeID != local_nodeID) {
       nr_sparts -= 1;
       /* Swap the particle */
-      const struct spart tp = s->sparts[k];
-      s->sparts[k] = s->sparts[nr_sparts];
-      s->sparts[nr_sparts] = tp;
+      memswap(&s->sparts[k], &s->sparts[nr_sparts], sizeof(struct spart));
       /* Swap the link with the gpart */
       if (s->sparts[k].gpart != NULL) {
         s->sparts[k].gpart->id_or_neg_offset = -k;
@@ -663,9 +663,7 @@ void space_rebuild(struct space *s, int verbose) {
         s->sparts[nr_sparts].gpart->id_or_neg_offset = -nr_sparts;
       }
       /* Swap the index */
-      const int t = sind[k];
-      sind[k] = sind[nr_sparts];
-      sind[nr_sparts] = t;
+      memswap(&sind[k], &sind[nr_sparts], sizeof(int));
     } else {
       /* Increment when not exchanging otherwise we need to retest "k".*/
       k++;
@@ -691,9 +689,7 @@ void space_rebuild(struct space *s, int verbose) {
     if (cells_top[gind[k]].nodeID != local_nodeID) {
       nr_gparts -= 1;
       /* Swap the particle */
-      const struct gpart tp = s->gparts[k];
-      s->gparts[k] = s->gparts[nr_gparts];
-      s->gparts[nr_gparts] = tp;
+      memswap(&s->gparts[k], &s->gparts[nr_gparts], sizeof(struct gpart));
       /* Swap the link with part/spart */
       if (s->gparts[k].type == swift_type_gas) {
         s->parts[-s->gparts[k].id_or_neg_offset].gpart = &s->gparts[k];
@@ -708,9 +704,7 @@ void space_rebuild(struct space *s, int verbose) {
             &s->gparts[nr_gparts];
       }
       /* Swap the index */
-      const int t = gind[k];
-      gind[k] = gind[nr_gparts];
-      gind[nr_gparts] = t;
+      memswap(&gind[k], &gind[nr_gparts], sizeof(int));
     } else {
       /* Increment when not exchanging otherwise we need to retest "k".*/
       k++;
@@ -745,6 +739,15 @@ void space_rebuild(struct space *s, int verbose) {
   s->nr_gparts = nr_gparts + nr_gparts_exchanged;
   s->nr_sparts = nr_sparts + nr_sparts_exchanged;
 
+  /* Clear non-local cell counts. */
+  for (int k = 0; k < s->nr_cells; k++) {
+    if (s->cells_top[k].nodeID != local_nodeID) {
+      cell_part_counts[k] = 0;
+      cell_spart_counts[k] = 0;
+      cell_gpart_counts[k] = 0;
+    }
+  }
+
   /* Re-allocate the index array for the parts if needed.. */
   if (s->nr_parts + 1 > ind_size) {
     int *ind_new;
@@ -773,6 +776,7 @@ void space_rebuild(struct space *s, int verbose) {
     const struct part *const p = &s->parts[k];
     ind[k] =
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
+    cell_part_counts[ind[k]]++;
 #ifdef SWIFT_DEBUG_CHECKS
     if (cells_top[ind[k]].nodeID != local_nodeID)
       error("Received part that does not belong to me (nodeID=%i).",
@@ -786,6 +790,7 @@ void space_rebuild(struct space *s, int verbose) {
     const struct spart *const sp = &s->sparts[k];
     sind[k] =
         cell_getid(cdim, sp->x[0] * ih[0], sp->x[1] * ih[1], sp->x[2] * ih[2]);
+    cell_spart_counts[sind[k]]++;
 #ifdef SWIFT_DEBUG_CHECKS
     if (cells_top[sind[k]].nodeID != local_nodeID)
       error("Received s-part that does not belong to me (nodeID=%i).",
@@ -794,11 +799,12 @@ void space_rebuild(struct space *s, int verbose) {
   }
   nr_sparts = s->nr_sparts;
 
-#endif /* WITH_MPI */
+#endif  // WITH_MPI
 
   /* Sort the parts according to their cells. */
   if (nr_parts > 0)
-    space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose);
+    space_parts_sort(s->parts, s->xparts, ind, cell_part_counts, s->nr_cells,
+                     0);
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the part have been sorted correctly. */
@@ -825,7 +831,7 @@ void space_rebuild(struct space *s, int verbose) {
 
   /* Sort the sparts according to their cells. */
   if (nr_sparts > 0)
-    space_sparts_sort(s, sind, nr_sparts, 0, s->nr_cells - 1, verbose);
+    space_sparts_sort(s->sparts, sind, cell_spart_counts, s->nr_cells, 0);
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the spart have been sorted correctly. */
@@ -850,12 +856,6 @@ void space_rebuild(struct space *s, int verbose) {
   }
 #endif
 
-  /* Re-link the gparts to their (s-)particles. */
-  if (nr_parts > 0 && nr_gparts > 0)
-    part_relink_gparts_to_parts(s->parts, nr_parts, 0);
-  if (nr_sparts > 0 && nr_gparts > 0)
-    part_relink_gparts_to_sparts(s->sparts, nr_sparts, 0);
-
   /* Extract the cell counts from the sorted indices. */
   size_t last_index = 0;
   ind[nr_parts] = s->nr_cells;  // sentinel.
@@ -878,7 +878,9 @@ void space_rebuild(struct space *s, int verbose) {
 
   /* We no longer need the indices as of here. */
   free(ind);
+  free(cell_part_counts);
   free(sind);
+  free(cell_spart_counts);
 
 #ifdef WITH_MPI
 
@@ -897,7 +899,7 @@ void space_rebuild(struct space *s, int verbose) {
     const struct gpart *const p = &s->gparts[k];
     gind[k] =
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
-
+    cell_gpart_counts[gind[k]]++;
 #ifdef SWIFT_DEBUG_CHECKS
     if (cells_top[gind[k]].nodeID != s->e->nodeID)
       error("Received g-part that does not belong to me (nodeID=%i).",
@@ -906,11 +908,12 @@ void space_rebuild(struct space *s, int verbose) {
   }
   nr_gparts = s->nr_gparts;
 
-#endif /* WITH_MPI */
+#endif  // WITH_MPI
 
   /* Sort the gparts according to their cells. */
   if (nr_gparts > 0)
-    space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose);
+    space_gparts_sort(s->gparts, s->parts, s->sparts, gind, cell_gpart_counts,
+                      s->nr_cells);
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the gpart have been sorted correctly. */
@@ -935,14 +938,6 @@ void space_rebuild(struct space *s, int verbose) {
   }
 #endif
 
-  /* Re-link the parts. */
-  if (nr_parts > 0 && nr_gparts > 0)
-    part_relink_parts_to_gparts(s->gparts, nr_gparts, s->parts);
-
-  /* Re-link the sparts. */
-  if (nr_sparts > 0 && nr_gparts > 0)
-    part_relink_sparts_to_gparts(s->gparts, nr_gparts, s->sparts);
-
   /* Extract the cell counts from the sorted indices. */
   size_t last_gindex = 0;
   gind[nr_gparts] = s->nr_cells;
@@ -955,6 +950,7 @@ void space_rebuild(struct space *s, int verbose) {
 
   /* We no longer need the indices as of here. */
   free(gind);
+  free(cell_gpart_counts);
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify that the links are correct */
@@ -999,6 +995,9 @@ void space_rebuild(struct space *s, int verbose) {
       cell_check_multipole(&s->cells_top[k], NULL);
 #endif
 
+  /* Clean up any stray sort indices in the cell buffer. */
+  space_free_buff_sort_indices(s);
+
   if (verbose)
     message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
             clocks_getunit());
@@ -1082,6 +1081,12 @@ void space_parts_get_cell_index_mapper(void *map_data, int nr_parts,
   const double ih_y = s->iwidth[1];
   const double ih_z = s->iwidth[2];
 
+  /* Init the local count buffer. */
+  int *cell_counts = (int *)calloc(sizeof(int), s->nr_cells);
+  if (cell_counts == NULL)
+    error("Failed to allocate temporary cell count buffer.");
+
+  /* Loop over the parts. */
   for (int k = 0; k < nr_parts; k++) {
 
     /* Get the particle */
@@ -1100,6 +1105,7 @@ void space_parts_get_cell_index_mapper(void *map_data, int nr_parts,
     const int index =
         cell_getid(cdim, pos_x * ih_x, pos_y * ih_y, pos_z * ih_z);
     ind[k] = index;
+    cell_counts[index]++;
 
 #ifdef SWIFT_DEBUG_CHECKS
     if (index < 0 || index >= cdim[0] * cdim[1] * cdim[2])
@@ -1117,6 +1123,11 @@ void space_parts_get_cell_index_mapper(void *map_data, int nr_parts,
     p->x[1] = pos_y;
     p->x[2] = pos_z;
   }
+
+  /* Write the counts back to the global array. */
+  for (int k = 0; k < s->nr_cells; k++)
+    if (cell_counts[k]) atomic_add(&data->cell_counts[k], cell_counts[k]);
+  free(cell_counts);
 }
 
 /**
@@ -1144,6 +1155,11 @@ void space_gparts_get_cell_index_mapper(void *map_data, int nr_gparts,
   const double ih_y = s->iwidth[1];
   const double ih_z = s->iwidth[2];
 
+  /* Init the local count buffer. */
+  int *cell_counts = (int *)calloc(sizeof(int), s->nr_cells);
+  if (cell_counts == NULL)
+    error("Failed to allocate temporary cell count buffer.");
+
   for (int k = 0; k < nr_gparts; k++) {
 
     /* Get the particle */
@@ -1162,6 +1178,7 @@ void space_gparts_get_cell_index_mapper(void *map_data, int nr_gparts,
     const int index =
         cell_getid(cdim, pos_x * ih_x, pos_y * ih_y, pos_z * ih_z);
     ind[k] = index;
+    cell_counts[index]++;
 
 #ifdef SWIFT_DEBUG_CHECKS
     if (index < 0 || index >= cdim[0] * cdim[1] * cdim[2])
@@ -1179,6 +1196,11 @@ void space_gparts_get_cell_index_mapper(void *map_data, int nr_gparts,
     gp->x[1] = pos_y;
     gp->x[2] = pos_z;
   }
+
+  /* Write the counts back to the global array. */
+  for (int k = 0; k < s->nr_cells; k++)
+    if (cell_counts[k]) atomic_add(&data->cell_counts[k], cell_counts[k]);
+  free(cell_counts);
 }
 
 /**
@@ -1206,6 +1228,11 @@ void space_sparts_get_cell_index_mapper(void *map_data, int nr_sparts,
   const double ih_y = s->iwidth[1];
   const double ih_z = s->iwidth[2];
 
+  /* Init the local count buffer. */
+  int *cell_counts = (int *)calloc(sizeof(int), s->nr_cells);
+  if (cell_counts == NULL)
+    error("Failed to allocate temporary cell count buffer.");
+
   for (int k = 0; k < nr_sparts; k++) {
 
     /* Get the particle */
@@ -1224,6 +1251,7 @@ void space_sparts_get_cell_index_mapper(void *map_data, int nr_sparts,
     const int index =
         cell_getid(cdim, pos_x * ih_x, pos_y * ih_y, pos_z * ih_z);
     ind[k] = index;
+    cell_counts[index]++;
 
 #ifdef SWIFT_DEBUG_CHECKS
     if (index < 0 || index >= cdim[0] * cdim[1] * cdim[2])
@@ -1241,6 +1269,11 @@ void space_sparts_get_cell_index_mapper(void *map_data, int nr_sparts,
     sp->x[1] = pos_y;
     sp->x[2] = pos_z;
   }
+
+  /* Write the counts back to the global array. */
+  for (int k = 0; k < s->nr_cells; k++)
+    if (cell_counts[k]) atomic_add(&data->cell_counts[k], cell_counts[k]);
+  free(cell_counts);
 }
 
 /**
@@ -1248,11 +1281,12 @@ void space_sparts_get_cell_index_mapper(void *map_data, int nr_sparts,
  *
  * @param s The #space.
  * @param ind The array of indices to fill.
+ * @param cell_counts The cell counters to update.
  * @param cells The array of #cell to update.
  * @param verbose Are we talkative ?
  */
-void space_parts_get_cell_index(struct space *s, int *ind, struct cell *cells,
-                                int verbose) {
+void space_parts_get_cell_index(struct space *s, int *ind, int *cell_counts,
+                                struct cell *cells, int verbose) {
 
   const ticks tic = getticks();
 
@@ -1261,6 +1295,7 @@ void space_parts_get_cell_index(struct space *s, int *ind, struct cell *cells,
   data.s = s;
   data.cells = cells;
   data.ind = ind;
+  data.cell_counts = cell_counts;
 
   threadpool_map(&s->e->threadpool, space_parts_get_cell_index_mapper, s->parts,
                  s->nr_parts, sizeof(struct part), 0, &data);
@@ -1275,11 +1310,12 @@ void space_parts_get_cell_index(struct space *s, int *ind, struct cell *cells,
  *
  * @param s The #space.
  * @param gind The array of indices to fill.
+ * @param cell_counts The cell counters to update.
  * @param cells The array of #cell to update.
  * @param verbose Are we talkative ?
  */
-void space_gparts_get_cell_index(struct space *s, int *gind, struct cell *cells,
-                                 int verbose) {
+void space_gparts_get_cell_index(struct space *s, int *gind, int *cell_counts,
+                                 struct cell *cells, int verbose) {
 
   const ticks tic = getticks();
 
@@ -1288,6 +1324,7 @@ void space_gparts_get_cell_index(struct space *s, int *gind, struct cell *cells,
   data.s = s;
   data.cells = cells;
   data.ind = gind;
+  data.cell_counts = cell_counts;
 
   threadpool_map(&s->e->threadpool, space_gparts_get_cell_index_mapper,
                  s->gparts, s->nr_gparts, sizeof(struct gpart), 0, &data);
@@ -1302,11 +1339,12 @@ void space_gparts_get_cell_index(struct space *s, int *gind, struct cell *cells,
  *
  * @param s The #space.
  * @param sind The array of indices to fill.
+ * @param cell_counts The cell counters to update.
  * @param cells The array of #cell to update.
  * @param verbose Are we talkative ?
  */
-void space_sparts_get_cell_index(struct space *s, int *sind, struct cell *cells,
-                                 int verbose) {
+void space_sparts_get_cell_index(struct space *s, int *sind, int *cell_counts,
+                                 struct cell *cells, int verbose) {
 
   const ticks tic = getticks();
 
@@ -1315,6 +1353,7 @@ void space_sparts_get_cell_index(struct space *s, int *sind, struct cell *cells,
   data.s = s;
   data.cells = cells;
   data.ind = sind;
+  data.cell_counts = cell_counts;
 
   threadpool_map(&s->e->threadpool, space_sparts_get_cell_index_mapper,
                  s->sparts, s->nr_sparts, sizeof(struct spart), 0, &data);
@@ -1328,551 +1367,176 @@ void space_sparts_get_cell_index(struct space *s, int *sind, struct cell *cells,
  * @brief Sort the particles and condensed particles according to the given
  * indices.
  *
- * @param s The #space.
+ * @param parts The array of #part to sort.
+ * @param xparts The corresponding #xpart array to sort as well.
  * @param ind The indices with respect to which the parts are sorted.
- * @param N The number of parts
- * @param min Lowest index.
- * @param max highest index.
- * @param verbose Are we talkative ?
+ * @param counts Number of particles per index.
+ * @param num_bins Total number of bins (length of count).
+ * @param parts_offset Offset of the #part array from the global #part array.
  */
-void space_parts_sort(struct space *s, int *ind, size_t N, int min, int max,
-                      int verbose) {
-
-  const ticks tic = getticks();
-
-  /* Populate a parallel_sort structure with the input data */
-  struct parallel_sort sort_struct;
-  sort_struct.parts = s->parts;
-  sort_struct.xparts = s->xparts;
-  sort_struct.ind = ind;
-  sort_struct.stack_size = 2 * (max - min + 1) + 10 + s->e->nr_threads;
-  if ((sort_struct.stack = (struct qstack *)malloc(
-           sizeof(struct qstack) * sort_struct.stack_size)) == NULL)
-    error("Failed to allocate sorting stack.");
-  for (unsigned int i = 0; i < sort_struct.stack_size; i++)
-    sort_struct.stack[i].ready = 0;
-
-  /* Add the first interval. */
-  sort_struct.stack[0].i = 0;
-  sort_struct.stack[0].j = N - 1;
-  sort_struct.stack[0].min = min;
-  sort_struct.stack[0].max = max;
-  sort_struct.stack[0].ready = 1;
-  sort_struct.first = 0;
-  sort_struct.last = 1;
-  sort_struct.waiting = 1;
-
-  /* Launch the sorting tasks with a stride of zero such that the same
-     map data is passed to each thread. */
-  threadpool_map(&s->e->threadpool, space_parts_sort_mapper, &sort_struct,
-                 s->e->threadpool.num_threads, 0, 1, NULL);
-
-#ifdef SWIFT_DEBUG_CHECKS
-  /* Verify space_sort_struct. */
-  for (size_t i = 1; i < N; i++)
-    if (ind[i - 1] > ind[i])
-      error("Sorting failed (ind[%zu]=%i,ind[%zu]=%i), min=%i, max=%i.", i - 1,
-            ind[i - 1], i, ind[i], min, max);
-  if (s->e->nodeID == 0 || verbose) message("Sorting succeeded.");
-#endif
-
-  /* Clean up. */
-  free(sort_struct.stack);
-
-  if (verbose)
-    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
-            clocks_getunit());
-}
-
-void space_parts_sort_mapper(void *map_data, int num_elements,
-                             void *extra_data) {
-
-  /* Unpack the mapping data. */
-  struct parallel_sort *sort_struct = (struct parallel_sort *)map_data;
-
-  /* Pointers to the sorting data. */
-  int *ind = sort_struct->ind;
-  struct part *parts = sort_struct->parts;
-  struct xpart *xparts = sort_struct->xparts;
-
-  /* Main loop. */
-  while (sort_struct->waiting) {
-
-    /* Grab an interval off the queue. */
-    int qid = atomic_inc(&sort_struct->first) % sort_struct->stack_size;
-
-    /* Wait for the entry to be ready, or for the sorting do be done. */
-    while (!sort_struct->stack[qid].ready)
-      if (!sort_struct->waiting) return;
-
-    /* Get the stack entry. */
-    ptrdiff_t i = sort_struct->stack[qid].i;
-    ptrdiff_t j = sort_struct->stack[qid].j;
-    int min = sort_struct->stack[qid].min;
-    int max = sort_struct->stack[qid].max;
-    sort_struct->stack[qid].ready = 0;
-
-    /* Loop over sub-intervals. */
-    while (1) {
-
-      /* Bring beer. */
-      const int pivot = (min + max) / 2;
-      /* message("Working on interval [%i,%i] with min=%i, max=%i, pivot=%i.",
-              i, j, min, max, pivot); */
-
-      /* One pass of QuickSort's partitioning. */
-      ptrdiff_t ii = i;
-      ptrdiff_t jj = j;
-      while (ii < jj) {
-        while (ii <= j && ind[ii] <= pivot) ii++;
-        while (jj >= i && ind[jj] > pivot) jj--;
-        if (ii < jj) {
-          memswap(&ind[ii], &ind[jj], sizeof(int));
-          memswap(&parts[ii], &parts[jj], sizeof(struct part));
-          memswap(&xparts[ii], &xparts[jj], sizeof(struct xpart));
-        }
+void space_parts_sort(struct part *parts, struct xpart *xparts, int *ind,
+                      int *counts, int num_bins, ptrdiff_t parts_offset) {
+  /* Create the offsets array. */
+  size_t *offsets = (size_t *)malloc(sizeof(size_t) * (num_bins + 1));
+  if (offsets == NULL)
+    error("Failed to allocate temporary cell offsets array.");
+  offsets[0] = 0;
+  for (int k = 1; k <= num_bins; k++) {
+    offsets[k] = offsets[k - 1] + counts[k - 1];
+    counts[k - 1] = 0;
+  }
+
+  /* Loop over local cells. */
+  for (int cid = 0; cid < num_bins; cid++) {
+    for (size_t k = offsets[cid] + counts[cid]; k < offsets[cid + 1]; k++) {
+      counts[cid]++;
+      int target_cid = ind[k];
+      if (target_cid == cid) {
+        continue;
       }
-
-#ifdef SWIFT_DEBUG_CHECKS
-      /* Verify space_sort_struct. */
-      if (i != j) {
-        for (int k = i; k <= jj; k++) {
-          if (ind[k] > pivot) {
-            message(
-                "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.", k,
-                ind[k], pivot, i, j);
-            error("Partition failed (<=pivot).");
-          }
-        }
-        for (int k = jj + 1; k <= j; k++) {
-          if (ind[k] <= pivot) {
-            message(
-                "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.", k,
-                ind[k], pivot, i, j);
-            error("Partition failed (>pivot).");
-          }
-        }
-      }
-#endif
-
-      /* Split-off largest interval. */
-      if (jj - i > j - jj + 1) {
-
-        /* Recurse on the left? */
-        if (jj > i && pivot > min) {
-          qid = atomic_inc(&sort_struct->last) % sort_struct->stack_size;
-          while (sort_struct->stack[qid].ready)
-            ;
-          sort_struct->stack[qid].i = i;
-          sort_struct->stack[qid].j = jj;
-          sort_struct->stack[qid].min = min;
-          sort_struct->stack[qid].max = pivot;
-          if (atomic_inc(&sort_struct->waiting) >= sort_struct->stack_size)
-            error("Qstack overflow.");
-          sort_struct->stack[qid].ready = 1;
+      struct part temp_part = parts[k];
+      struct xpart temp_xpart = xparts[k];
+      while (target_cid != cid) {
+        size_t j = offsets[target_cid] + counts[target_cid]++;
+        while (ind[j] == target_cid) {
+          j = offsets[target_cid] + counts[target_cid]++;
         }
-
-        /* Recurse on the right? */
-        if (jj + 1 < j && pivot + 1 < max) {
-          i = jj + 1;
-          min = pivot + 1;
-        } else
-          break;
-
-      } else {
-
-        /* Recurse on the right? */
-        if (pivot + 1 < max) {
-          qid = atomic_inc(&sort_struct->last) % sort_struct->stack_size;
-          while (sort_struct->stack[qid].ready)
-            ;
-          sort_struct->stack[qid].i = jj + 1;
-          sort_struct->stack[qid].j = j;
-          sort_struct->stack[qid].min = pivot + 1;
-          sort_struct->stack[qid].max = max;
-          if (atomic_inc(&sort_struct->waiting) >= sort_struct->stack_size)
-            error("Qstack overflow.");
-          sort_struct->stack[qid].ready = 1;
-        }
-
-        /* Recurse on the left? */
-        if (jj > i && pivot > min) {
-          j = jj;
-          max = pivot;
-        } else
-          break;
+        memswap(&parts[j], &temp_part, sizeof(struct part));
+        memswap(&xparts[j], &temp_xpart, sizeof(struct xpart));
+        memswap(&ind[j], &target_cid, sizeof(int));
+        if (parts[j].gpart)
+          parts[j].gpart->id_or_neg_offset = -(j + parts_offset);
       }
+      parts[k] = temp_part;
+      xparts[k] = temp_xpart;
+      ind[k] = target_cid;
+      if (parts[k].gpart)
+        parts[k].gpart->id_or_neg_offset = -(k + parts_offset);
+    }
+  }
 
-    } /* loop over sub-intervals. */
-
-    atomic_dec(&sort_struct->waiting);
-
-  } /* main loop. */
+#ifdef SWIFT_DEBUG_CHECKS
+  for (int k = 0; k < num_bins; k++)
+    if (offsets[k + 1] != offsets[k] + counts[k])
+      error("Bad offsets after shuffle.");
+#endif  // SWIFT_DEBUG_CHECKS
 }
 
 /**
  * @brief Sort the s-particles according to the given indices.
  *
- * @param s The #space.
+ * @param sparts The array of #spart to sort.
  * @param ind The indices with respect to which the #spart are sorted.
- * @param N The number of parts
- * @param min Lowest index.
- * @param max highest index.
- * @param verbose Are we talkative ?
+ * @param counts Number of particles per index.
+ * @param num_bins Total number of bins (length of counts).
+ * @param sparts_offset Offset of the #spart array from the global #spart.
+ * array.
  */
-void space_sparts_sort(struct space *s, int *ind, size_t N, int min, int max,
-                       int verbose) {
-
-  const ticks tic = getticks();
-
-  /* Populate a parallel_sort structure with the input data */
-  struct parallel_sort sort_struct;
-  sort_struct.sparts = s->sparts;
-  sort_struct.ind = ind;
-  sort_struct.stack_size = 2 * (max - min + 1) + 10 + s->e->nr_threads;
-  if ((sort_struct.stack = (struct qstack *)malloc(
-           sizeof(struct qstack) * sort_struct.stack_size)) == NULL)
-    error("Failed to allocate sorting stack.");
-  for (unsigned int i = 0; i < sort_struct.stack_size; i++)
-    sort_struct.stack[i].ready = 0;
-
-  /* Add the first interval. */
-  sort_struct.stack[0].i = 0;
-  sort_struct.stack[0].j = N - 1;
-  sort_struct.stack[0].min = min;
-  sort_struct.stack[0].max = max;
-  sort_struct.stack[0].ready = 1;
-  sort_struct.first = 0;
-  sort_struct.last = 1;
-  sort_struct.waiting = 1;
-
-  /* Launch the sorting tasks with a stride of zero such that the same
-     map data is passed to each thread. */
-  threadpool_map(&s->e->threadpool, space_sparts_sort_mapper, &sort_struct,
-                 s->e->threadpool.num_threads, 0, 1, NULL);
-
-#ifdef SWIFT_DEBUG_CHECKS
-  /* Verify space_sort_struct. */
-  for (size_t i = 1; i < N; i++)
-    if (ind[i - 1] > ind[i])
-      error("Sorting failed (ind[%zu]=%i,ind[%zu]=%i), min=%i, max=%i.", i - 1,
-            ind[i - 1], i, ind[i], min, max);
-  if (s->e->nodeID == 0 || verbose) message("Sorting succeeded.");
-#endif
-
-  /* Clean up. */
-  free(sort_struct.stack);
-
-  if (verbose)
-    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
-            clocks_getunit());
-}
-
-void space_sparts_sort_mapper(void *map_data, int num_elements,
-                              void *extra_data) {
-
-  /* Unpack the mapping data. */
-  struct parallel_sort *sort_struct = (struct parallel_sort *)map_data;
-
-  /* Pointers to the sorting data. */
-  int *ind = sort_struct->ind;
-  struct spart *sparts = sort_struct->sparts;
-
-  /* Main loop. */
-  while (sort_struct->waiting) {
-
-    /* Grab an interval off the queue. */
-    int qid = atomic_inc(&sort_struct->first) % sort_struct->stack_size;
-
-    /* Wait for the entry to be ready, or for the sorting do be done. */
-    while (!sort_struct->stack[qid].ready)
-      if (!sort_struct->waiting) return;
-
-    /* Get the stack entry. */
-    ptrdiff_t i = sort_struct->stack[qid].i;
-    ptrdiff_t j = sort_struct->stack[qid].j;
-    int min = sort_struct->stack[qid].min;
-    int max = sort_struct->stack[qid].max;
-    sort_struct->stack[qid].ready = 0;
-
-    /* Loop over sub-intervals. */
-    while (1) {
-
-      /* Bring beer. */
-      const int pivot = (min + max) / 2;
-      /* message("Working on interval [%i,%i] with min=%i, max=%i, pivot=%i.",
-              i, j, min, max, pivot); */
-
-      /* One pass of QuickSort's partitioning. */
-      ptrdiff_t ii = i;
-      ptrdiff_t jj = j;
-      while (ii < jj) {
-        while (ii <= j && ind[ii] <= pivot) ii++;
-        while (jj >= i && ind[jj] > pivot) jj--;
-        if (ii < jj) {
-          memswap(&ind[ii], &ind[jj], sizeof(int));
-          memswap(&sparts[ii], &sparts[jj], sizeof(struct spart));
-        }
-      }
-
-#ifdef SWIFT_DEBUG_CHECKS
-      /* Verify space_sort_struct. */
-      if (i != j) {
-        for (int k = i; k <= jj; k++) {
-          if (ind[k] > pivot) {
-            message(
-                "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li "
-                "min=%i max=%i.",
-                k, ind[k], pivot, i, j, min, max);
-            error("Partition failed (<=pivot).");
-          }
-        }
-        for (int k = jj + 1; k <= j; k++) {
-          if (ind[k] <= pivot) {
-            message(
-                "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.", k,
-                ind[k], pivot, i, j);
-            error("Partition failed (>pivot).");
-          }
-        }
+void space_sparts_sort(struct spart *sparts, int *ind, int *counts,
+                       int num_bins, ptrdiff_t sparts_offset) {
+  /* Create the offsets array. */
+  size_t *offsets = (size_t *)malloc(sizeof(size_t) * (num_bins + 1));
+  if (offsets == NULL)
+    error("Failed to allocate temporary cell offsets array.");
+  offsets[0] = 0;
+  for (int k = 1; k <= num_bins; k++) {
+    offsets[k] = offsets[k - 1] + counts[k - 1];
+    counts[k - 1] = 0;
+  }
+
+  /* Loop over local cells. */
+  for (int cid = 0; cid < num_bins; cid++) {
+    for (size_t k = offsets[cid] + counts[cid]; k < offsets[cid + 1]; k++) {
+      counts[cid]++;
+      int target_cid = ind[k];
+      if (target_cid == cid) {
+        continue;
       }
-#endif
-
-      /* Split-off largest interval. */
-      if (jj - i > j - jj + 1) {
-
-        /* Recurse on the left? */
-        if (jj > i && pivot > min) {
-          qid = atomic_inc(&sort_struct->last) % sort_struct->stack_size;
-          while (sort_struct->stack[qid].ready)
-            ;
-          sort_struct->stack[qid].i = i;
-          sort_struct->stack[qid].j = jj;
-          sort_struct->stack[qid].min = min;
-          sort_struct->stack[qid].max = pivot;
-          if (atomic_inc(&sort_struct->waiting) >= sort_struct->stack_size)
-            error("Qstack overflow.");
-          sort_struct->stack[qid].ready = 1;
+      struct spart temp_spart = sparts[k];
+      while (target_cid != cid) {
+        size_t j = offsets[target_cid] + counts[target_cid]++;
+        while (ind[j] == target_cid) {
+          j = offsets[target_cid] + counts[target_cid]++;
         }
-
-        /* Recurse on the right? */
-        if (jj + 1 < j && pivot + 1 < max) {
-          i = jj + 1;
-          min = pivot + 1;
-        } else
-          break;
-
-      } else {
-
-        /* Recurse on the right? */
-        if (pivot + 1 < max) {
-          qid = atomic_inc(&sort_struct->last) % sort_struct->stack_size;
-          while (sort_struct->stack[qid].ready)
-            ;
-          sort_struct->stack[qid].i = jj + 1;
-          sort_struct->stack[qid].j = j;
-          sort_struct->stack[qid].min = pivot + 1;
-          sort_struct->stack[qid].max = max;
-          if (atomic_inc(&sort_struct->waiting) >= sort_struct->stack_size)
-            error("Qstack overflow.");
-          sort_struct->stack[qid].ready = 1;
-        }
-
-        /* Recurse on the left? */
-        if (jj > i && pivot > min) {
-          j = jj;
-          max = pivot;
-        } else
-          break;
+        memswap(&sparts[j], &temp_spart, sizeof(struct spart));
+        memswap(&ind[j], &target_cid, sizeof(int));
+        if (sparts[j].gpart)
+          sparts[j].gpart->id_or_neg_offset = -(j + sparts_offset);
       }
+      sparts[k] = temp_spart;
+      ind[k] = target_cid;
+      if (sparts[k].gpart)
+        sparts[k].gpart->id_or_neg_offset = -(k + sparts_offset);
+    }
+  }
 
-    } /* loop over sub-intervals. */
-
-    atomic_dec(&sort_struct->waiting);
-
-  } /* main loop. */
+#ifdef SWIFT_DEBUG_CHECKS
+  for (int k = 0; k < num_bins; k++)
+    if (offsets[k + 1] != offsets[k] + counts[k])
+      error("Bad offsets after shuffle.");
+#endif  // SWIFT_DEBUG_CHECKS
 }
 
 /**
  * @brief Sort the g-particles according to the given indices.
  *
- * @param s The #space.
+ * @param gparts The array of #gpart to sort.
+ * @param parts Global #part array for re-linking.
+ * @param sparts Global #spart array for re-linking.
  * @param ind The indices with respect to which the gparts are sorted.
- * @param N The number of gparts
- * @param min Lowest index.
- * @param max highest index.
- * @param verbose Are we talkative ?
+ * @param counts Number of particles per index.
+ * @param num_bins Total number of bins (length of counts).
  */
-void space_gparts_sort(struct space *s, int *ind, size_t N, int min, int max,
-                       int verbose) {
-
-  const ticks tic = getticks();
-
-  /*Populate a global parallel_sort structure with the input data */
-  struct parallel_sort sort_struct;
-  sort_struct.gparts = s->gparts;
-  sort_struct.ind = ind;
-  sort_struct.stack_size = 2 * (max - min + 1) + 10 + s->e->nr_threads;
-  if ((sort_struct.stack = (struct qstack *)malloc(
-           sizeof(struct qstack) * sort_struct.stack_size)) == NULL)
-    error("Failed to allocate sorting stack.");
-  for (unsigned int i = 0; i < sort_struct.stack_size; i++)
-    sort_struct.stack[i].ready = 0;
-
-  /* Add the first interval. */
-  sort_struct.stack[0].i = 0;
-  sort_struct.stack[0].j = N - 1;
-  sort_struct.stack[0].min = min;
-  sort_struct.stack[0].max = max;
-  sort_struct.stack[0].ready = 1;
-  sort_struct.first = 0;
-  sort_struct.last = 1;
-  sort_struct.waiting = 1;
-
-  /* Launch the sorting tasks with a stride of zero such that the same
-     map data is passed to each thread. */
-  threadpool_map(&s->e->threadpool, space_gparts_sort_mapper, &sort_struct,
-                 s->e->threadpool.num_threads, 0, 1, NULL);
-
-#ifdef SWIFT_DEBUG_CHECKS
-  /* Verify space_sort_struct. */
-  for (size_t i = 1; i < N; i++)
-    if (ind[i - 1] > ind[i])
-      error("Sorting failed (ind[%zu]=%i,ind[%zu]=%i), min=%i, max=%i.", i - 1,
-            ind[i - 1], i, ind[i], min, max);
-  if (s->e->nodeID == 0 || verbose) message("Sorting succeeded.");
-#endif
-
-  /* Clean up. */
-  free(sort_struct.stack);
-
-  if (verbose)
-    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
-            clocks_getunit());
-}
-
-void space_gparts_sort_mapper(void *map_data, int num_elements,
-                              void *extra_data) {
-
-  /* Unpack the mapping data. */
-  struct parallel_sort *sort_struct = (struct parallel_sort *)map_data;
-
-  /* Pointers to the sorting data. */
-  int *ind = sort_struct->ind;
-  struct gpart *gparts = sort_struct->gparts;
-
-  /* Main loop. */
-  while (sort_struct->waiting) {
-
-    /* Grab an interval off the queue. */
-    int qid = atomic_inc(&sort_struct->first) % sort_struct->stack_size;
-
-    /* Wait for the entry to be ready, or for the sorting do be done. */
-    while (!sort_struct->stack[qid].ready)
-      if (!sort_struct->waiting) return;
-
-    /* Get the stack entry. */
-    ptrdiff_t i = sort_struct->stack[qid].i;
-    ptrdiff_t j = sort_struct->stack[qid].j;
-    int min = sort_struct->stack[qid].min;
-    int max = sort_struct->stack[qid].max;
-    sort_struct->stack[qid].ready = 0;
-
-    /* Loop over sub-intervals. */
-    while (1) {
-
-      /* Bring beer. */
-      const int pivot = (min + max) / 2;
-      /* message("Working on interval [%i,%i] with min=%i, max=%i, pivot=%i.",
-              i, j, min, max, pivot); */
-
-      /* One pass of QuickSort's partitioning. */
-      ptrdiff_t ii = i;
-      ptrdiff_t jj = j;
-      while (ii < jj) {
-        while (ii <= j && ind[ii] <= pivot) ii++;
-        while (jj >= i && ind[jj] > pivot) jj--;
-        if (ii < jj) {
-          memswap(&ind[ii], &ind[jj], sizeof(int));
-          memswap(&gparts[ii], &gparts[jj], sizeof(struct gpart));
-        }
+void space_gparts_sort(struct gpart *gparts, struct part *parts,
+                       struct spart *sparts, int *ind, int *counts,
+                       int num_bins) {
+  /* Create the offsets array. */
+  size_t *offsets = (size_t *)malloc(sizeof(size_t) * (num_bins + 1));
+  if (offsets == NULL)
+    error("Failed to allocate temporary cell offsets array.");
+  offsets[0] = 0;
+  for (int k = 1; k <= num_bins; k++) {
+    offsets[k] = offsets[k - 1] + counts[k - 1];
+    counts[k - 1] = 0;
+  }
+
+  /* Loop over local cells. */
+  for (int cid = 0; cid < num_bins; cid++) {
+    for (size_t k = offsets[cid] + counts[cid]; k < offsets[cid + 1]; k++) {
+      counts[cid]++;
+      int target_cid = ind[k];
+      if (target_cid == cid) {
+        continue;
       }
-
-#ifdef SWIFT_DEBUG_CHECKS
-      /* Verify space_sort_struct. */
-      if (i != j) {
-        for (int k = i; k <= jj; k++) {
-          if (ind[k] > pivot) {
-            message(
-                "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.", k,
-                ind[k], pivot, i, j);
-            error("Partition failed (<=pivot).");
-          }
+      struct gpart temp_gpart = gparts[k];
+      while (target_cid != cid) {
+        size_t j = offsets[target_cid] + counts[target_cid]++;
+        while (ind[j] == target_cid) {
+          j = offsets[target_cid] + counts[target_cid]++;
         }
-        for (int k = jj + 1; k <= j; k++) {
-          if (ind[k] <= pivot) {
-            message(
-                "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.", k,
-                ind[k], pivot, i, j);
-            error("Partition failed (>pivot).");
-          }
+        memswap(&gparts[j], &temp_gpart, sizeof(struct gpart));
+        memswap(&ind[j], &target_cid, sizeof(int));
+        if (gparts[j].type == swift_type_gas) {
+          parts[-gparts[j].id_or_neg_offset].gpart = &gparts[j];
+        } else if (gparts[j].type == swift_type_star) {
+          sparts[-gparts[j].id_or_neg_offset].gpart = &gparts[j];
         }
       }
-#endif
-
-      /* Split-off largest interval. */
-      if (jj - i > j - jj + 1) {
-
-        /* Recurse on the left? */
-        if (jj > i && pivot > min) {
-          qid = atomic_inc(&sort_struct->last) % sort_struct->stack_size;
-          while (sort_struct->stack[qid].ready)
-            ;
-          sort_struct->stack[qid].i = i;
-          sort_struct->stack[qid].j = jj;
-          sort_struct->stack[qid].min = min;
-          sort_struct->stack[qid].max = pivot;
-          if (atomic_inc(&sort_struct->waiting) >= sort_struct->stack_size)
-            error("Qstack overflow.");
-          sort_struct->stack[qid].ready = 1;
-        }
-
-        /* Recurse on the right? */
-        if (jj + 1 < j && pivot + 1 < max) {
-          i = jj + 1;
-          min = pivot + 1;
-        } else
-          break;
-
-      } else {
-
-        /* Recurse on the right? */
-        if (pivot + 1 < max) {
-          qid = atomic_inc(&sort_struct->last) % sort_struct->stack_size;
-          while (sort_struct->stack[qid].ready)
-            ;
-          sort_struct->stack[qid].i = jj + 1;
-          sort_struct->stack[qid].j = j;
-          sort_struct->stack[qid].min = pivot + 1;
-          sort_struct->stack[qid].max = max;
-          if (atomic_inc(&sort_struct->waiting) >= sort_struct->stack_size)
-            error("Qstack overflow.");
-          sort_struct->stack[qid].ready = 1;
-        }
-
-        /* Recurse on the left? */
-        if (jj > i && pivot > min) {
-          j = jj;
-          max = pivot;
-        } else
-          break;
+      gparts[k] = temp_gpart;
+      ind[k] = target_cid;
+      if (gparts[k].type == swift_type_gas) {
+        parts[-gparts[k].id_or_neg_offset].gpart = &gparts[k];
+      } else if (gparts[k].type == swift_type_star) {
+        sparts[-gparts[k].id_or_neg_offset].gpart = &gparts[k];
       }
+    }
+  }
 
-    } /* loop over sub-intervals. */
-
-    atomic_dec(&sort_struct->waiting);
-
-  } /* main loop. */
+#ifdef SWIFT_DEBUG_CHECKS
+  for (int k = 0; k < num_bins; k++)
+    if (offsets[k + 1] != offsets[k] + counts[k])
+      error("Bad offsets after shuffle.");
+#endif  // SWIFT_DEBUG_CHECKS
 }
 
 /**
@@ -2410,10 +2074,6 @@ void space_recycle(struct space *s, struct cell *c) {
       lock_destroy(&c->mlock) != 0 || lock_destroy(&c->slock) != 0)
     error("Failed to destroy spinlocks.");
 
-  /* Clear this cell's sort arrays. */
-  for (int i = 0; i < 13; i++)
-    if (c->sort[i] != NULL) free(c->sort[i]);
-
   /* Lock the space. */
   lock_lock(&s->lock);
 
@@ -2463,10 +2123,6 @@ void space_recycle_list(struct space *s, struct cell *cell_list_begin,
         lock_destroy(&c->mlock) != 0 || lock_destroy(&c->slock) != 0)
       error("Failed to destroy spinlocks.");
 
-    /* Clear this cell's sort arrays. */
-    for (int i = 0; i < 13; i++)
-      if (c->sort[i] != NULL) free(c->sort[i]);
-
     /* Count this cell. */
     count += 1;
   }
@@ -2514,6 +2170,9 @@ void space_getcells(struct space *s, int nr_cells, struct cell **cells) {
                          space_cellallocchunk * sizeof(struct cell)) != 0)
         error("Failed to allocate more cells.");
 
+      /* Clear the newly-allocated cells. */
+      bzero(s->cells_sub, sizeof(struct cell) * space_cellallocchunk);
+
       /* Constructed a linked list */
       for (int k = 0; k < space_cellallocchunk - 1; k++)
         s->cells_sub[k].next = &s->cells_sub[k + 1];
@@ -2550,6 +2209,8 @@ void space_getcells(struct space *s, int nr_cells, struct cell **cells) {
 
   /* Init some things in the cell we just got. */
   for (int j = 0; j < nr_cells; j++) {
+    for (int k = 0; k < 13; k++)
+      if (cells[j]->sort[k] != NULL) free(cells[j]->sort[k]);
     struct gravity_tensors *temp = cells[j]->multipole;
     bzero(cells[j], sizeof(struct cell));
     cells[j]->multipole = temp;
@@ -2560,6 +2221,22 @@ void space_getcells(struct space *s, int nr_cells, struct cell **cells) {
   }
 }
 
+/**
+ * @brief Free sort arrays in any cells in the cell buffer.
+ *
+ * @param s The #space.
+ */
+void space_free_buff_sort_indices(struct space *s) {
+  for (struct cell *finger = s->cells_sub; finger != NULL;
+       finger = finger->next) {
+    for (int k = 0; k < 13; k++)
+      if (finger->sort[k] != NULL) {
+        free(finger->sort[k]);
+        finger->sort[k] = NULL;
+      }
+  }
+}
+
 /**
  * @brief Construct the list of top-level cells that have any tasks in
  * their hierarchy.
@@ -2685,6 +2362,9 @@ void space_first_init_parts(struct space *s,
     /* And the cooling */
     cooling_first_init_part(phys_const, us, cosmo, cool_func, &p[i], &xp[i]);
 
+    /* Check part->gpart->part linkeage. */
+    if (p[i].gpart) p[i].gpart->id_or_neg_offset = -i;
+
 #ifdef SWIFT_DEBUG_CHECKS
     p[i].ti_drift = 0;
     p[i].ti_kick = 0;
@@ -2762,6 +2442,9 @@ void space_first_init_sparts(struct space *s) {
 
     star_first_init_spart(&sp[i]);
 
+    /* Check spart->gpart->spart linkeage. */
+    if (sp[i].gpart) sp[i].gpart->id_or_neg_offset = -i;
+
 #ifdef SWIFT_DEBUG_CHECKS
     sp[i].ti_drift = 0;
     sp[i].ti_kick = 0;
diff --git a/src/space.h b/src/space.h
index b3f7f839651a81f525f7071c25a0aab5ebc067aa..db9c9edad3d7435ea619e72814e2c97094344090 100644
--- a/src/space.h
+++ b/src/space.h
@@ -173,12 +173,14 @@ struct space {
 };
 
 /* function prototypes. */
-void space_parts_sort(struct space *s, int *ind, size_t N, int min, int max,
-                      int verbose);
-void space_gparts_sort(struct space *s, int *ind, size_t N, int min, int max,
-                       int verbose);
-void space_sparts_sort(struct space *s, int *ind, size_t N, int min, int max,
-                       int verbose);
+void space_free_buff_sort_indices(struct space *s);
+void space_parts_sort(struct part *parts, struct xpart *xparts, int *ind,
+                      int *counts, int num_bins, ptrdiff_t parts_offset);
+void space_gparts_sort(struct gpart *gparts, struct part *parts,
+                       struct spart *sparts, int *ind, int *counts,
+                       int num_bins);
+void space_sparts_sort(struct spart *sparts, int *ind, int *counts,
+                       int num_bins, ptrdiff_t sparts_offset);
 void space_getcells(struct space *s, int nr_cells, struct cell **cells);
 int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
                  double *shift);
@@ -199,12 +201,6 @@ void space_map_parts_xparts(struct space *s,
                                         struct cell *c));
 void space_map_cells_post(struct space *s, int full,
                           void (*fun)(struct cell *c, void *data), void *data);
-void space_parts_sort_mapper(void *map_data, int num_elements,
-                             void *extra_data);
-void space_gparts_sort_mapper(void *map_data, int num_elements,
-                              void *extra_data);
-void space_sparts_sort_mapper(void *map_data, int num_elements,
-                              void *extra_data);
 void space_rebuild(struct space *s, int verbose);
 void space_recycle(struct space *s, struct cell *c);
 void space_recycle_list(struct space *s, struct cell *cell_list_begin,
@@ -215,12 +211,12 @@ void space_split(struct space *s, struct cell *cells, int nr_cells,
                  int verbose);
 void space_split_mapper(void *map_data, int num_elements, void *extra_data);
 void space_list_cells_with_tasks(struct space *s);
-void space_parts_get_cell_index(struct space *s, int *ind, struct cell *cells,
-                                int verbose);
-void space_gparts_get_cell_index(struct space *s, int *gind, struct cell *cells,
-                                 int verbose);
-void space_sparts_get_cell_index(struct space *s, int *sind, struct cell *cells,
-                                 int verbose);
+void space_parts_get_cell_index(struct space *s, int *ind, int *cell_counts,
+                                struct cell *cells, int verbose);
+void space_gparts_get_cell_index(struct space *s, int *gind, int *cell_counts,
+                                 struct cell *cells, int verbose);
+void space_sparts_get_cell_index(struct space *s, int *sind, int *cell_counts,
+                                 struct cell *cells, int verbose);
 void space_synchronize_particle_positions(struct space *s);
 void space_do_parts_sort();
 void space_do_gparts_sort();