diff --git a/configure.ac b/configure.ac
index be5d4652e46ce15657336219c1ba0dce2c1f25bc..9fa9a1de591d63794dde5db6a8dd733cfcaada09 100644
--- a/configure.ac
+++ b/configure.ac
@@ -296,7 +296,6 @@ if test "$enable_san" = "yes"; then
    fi
 fi
 
-
 # Autoconf stuff.
 AC_PROG_INSTALL
 AC_PROG_MAKE_SET
@@ -473,8 +472,9 @@ if test "$ac_cv_header_fftw3_h" = "yes"; then
 fi
 AC_SUBST([FFTW_LIBS])
 
-# Check for Intel intrinsics header optionally used by vector.h.
+# Check for Intel and PowerPC intrinsics header optionally used by vector.h.
 AC_CHECK_HEADERS([immintrin.h])
+AC_CHECK_HEADERS([altivec.h])
 
 # Check for timing functions needed by cycle.h.
 AC_HEADER_TIME
diff --git a/src/Makefile.am b/src/Makefile.am
index 9c841db1176df09ee5bcaa0e051f4f0ae97df518..18da59c2b04f50cac29d948bdbe502bc2f70bec2 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -44,7 +44,7 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
     common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h \
     physical_constants.h physical_constants_cgs.h potential.h version.h \
     hydro_properties.h riemann.h threadpool.h cooling.h cooling_struct.h sourceterms.h \
-    sourceterms_struct.h statistics.h
+    sourceterms_struct.h statistics.h memswap.h
 
 
 # Common source files
@@ -83,7 +83,8 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h
 		 potential/softened_isothermal/potential.h \
 		 cooling/none/cooling.h cooling/none/cooling_struct.h \
 	         cooling/const_du/cooling.h cooling/const_du/cooling_struct.h \
-                 cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h 
+                 cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h \
+                 memswap.h
 
 
 # Sources and flags for regular library
diff --git a/src/cell.c b/src/cell.c
index 878401d770ca4a7787056a25da6d3b60d4313f2f..e2767cdaa9e1189ec87b5ef51cc578c91f8cfe4c 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -53,6 +53,8 @@
 #include "gravity.h"
 #include "hydro.h"
 #include "hydro_properties.h"
+#include "memswap.h"
+#include "minmax.h"
 #include "scheduler.h"
 #include "space.h"
 #include "timers.h"
@@ -463,122 +465,73 @@ void cell_gunlocktree(struct cell *c) {
  * @param c The #cell array to be sorted.
  * @param parts_offset Offset of the cell parts array relative to the
  *        space's parts array, i.e. c->parts - s->parts.
+ * @param buff A buffer with at least max(c->count, c->gcount) entries,
+ *        used for sorting indices.
  */
-void cell_split(struct cell *c, ptrdiff_t parts_offset) {
+void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
 
-  int i, j;
   const int count = c->count, gcount = c->gcount;
   struct part *parts = c->parts;
   struct xpart *xparts = c->xparts;
   struct gpart *gparts = c->gparts;
-  int left[8], right[8];
-  double pivot[3];
-
-  /* Init the pivots. */
-  for (int k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->width[k] / 2;
-
-  /* Split along the x-axis. */
-  i = 0;
-  j = count - 1;
-  while (i <= j) {
-    while (i <= count - 1 && parts[i].x[0] <= pivot[0]) i += 1;
-    while (j >= 0 && parts[j].x[0] > pivot[0]) j -= 1;
-    if (i < j) {
-      struct part temp = parts[i];
-      parts[i] = parts[j];
-      parts[j] = temp;
-      struct xpart xtemp = xparts[i];
-      xparts[i] = xparts[j];
-      xparts[j] = xtemp;
-    }
+  const double pivot[3] = {c->loc[0] + c->width[0] / 2,
+                           c->loc[1] + c->width[1] / 2,
+                           c->loc[2] + c->width[2] / 2};
+  int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+  int bucket_offset[9];
+
+  /* If the buff is NULL, allocate it, and remember to free it. */
+  const int allocate_buffer = (buff == NULL);
+  if (allocate_buffer &&
+      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
+    error("Failed to allocate temporary indices.");
+
+  /* Fill the buffer with the indices. */
+  for (int k = 0; k < count; k++) {
+    const int bid = (parts[k].x[0] > pivot[0]) * 4 +
+                    (parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]);
+    bucket_count[bid]++;
+    buff[k] = bid;
   }
 
-#ifdef SWIFT_DEBUG_CHECKS
-  for (int k = 0; k <= j; k++)
-    if (parts[k].x[0] > pivot[0]) error("cell_split: sorting failed.");
-  for (int k = i; k < count; k++)
-    if (parts[k].x[0] < pivot[0]) error("cell_split: sorting failed.");
-#endif
-
-  left[1] = i;
-  right[1] = count - 1;
-  left[0] = 0;
-  right[0] = j;
-
-  /* Split along the y axis, twice. */
-  for (int k = 1; k >= 0; k--) {
-    i = left[k];
-    j = right[k];
-    while (i <= j) {
-      while (i <= right[k] && parts[i].x[1] <= pivot[1]) i += 1;
-      while (j >= left[k] && parts[j].x[1] > pivot[1]) j -= 1;
-      if (i < j) {
-        struct part temp = parts[i];
-        parts[i] = parts[j];
-        parts[j] = temp;
-        struct xpart xtemp = xparts[i];
-        xparts[i] = xparts[j];
-        xparts[j] = xtemp;
-      }
-    }
-
-#ifdef SWIFT_DEBUG_CHECKS
-    for (int kk = left[k]; kk <= j; kk++)
-      if (parts[kk].x[1] > pivot[1]) {
-        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
-        error("sorting failed (left).");
-      }
-    for (int kk = i; kk <= right[k]; kk++)
-      if (parts[kk].x[1] < pivot[1]) error("sorting failed (right).");
-#endif
-
-    left[2 * k + 1] = i;
-    right[2 * k + 1] = right[k];
-    left[2 * k] = left[k];
-    right[2 * k] = j;
+  /* Set the buffer offsets. */
+  bucket_offset[0] = 0;
+  for (int k = 1; k <= 8; k++) {
+    bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1];
+    bucket_count[k - 1] = 0;
   }
 
-  /* Split along the z axis, four times. */
-  for (int k = 3; k >= 0; k--) {
-    i = left[k];
-    j = right[k];
-    while (i <= j) {
-      while (i <= right[k] && parts[i].x[2] <= pivot[2]) i += 1;
-      while (j >= left[k] && parts[j].x[2] > pivot[2]) j -= 1;
-      if (i < j) {
-        struct part temp = parts[i];
-        parts[i] = parts[j];
-        parts[j] = temp;
-        struct xpart xtemp = xparts[i];
-        xparts[i] = xparts[j];
-        xparts[j] = xtemp;
+  /* Run through the buckets, and swap particles to their correct spot. */
+  for (int bucket = 0; bucket < 8; bucket++) {
+    for (int k = bucket_offset[bucket] + bucket_count[bucket];
+         k < bucket_offset[bucket + 1]; k++) {
+      int bid = buff[k];
+      if (bid != bucket) {
+        struct part part = parts[k];
+        struct xpart xpart = xparts[k];
+        while (bid != bucket) {
+          int j = bucket_offset[bid] + bucket_count[bid]++;
+          while (buff[j] == bid) {
+            j++;
+            bucket_count[bid]++;
+          }
+          memswap(&parts[j], &part, sizeof(struct part));
+          memswap(&xparts[j], &xpart, sizeof(struct xpart));
+          memswap(&buff[j], &bid, sizeof(int));
+        }
+        parts[k] = part;
+        xparts[k] = xpart;
+        buff[k] = bid;
       }
+      bucket_count[bid]++;
     }
-
-#ifdef SWIFT_DEBUG_CHECKS
-    for (int kk = left[k]; kk <= j; kk++)
-      if (parts[kk].x[2] > pivot[2]) {
-        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
-        error("sorting failed (left).");
-      }
-    for (int kk = i; kk <= right[k]; kk++)
-      if (parts[kk].x[2] < pivot[2]) {
-        message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
-        error("sorting failed (right).");
-      }
-#endif
-
-    left[2 * k + 1] = i;
-    right[2 * k + 1] = right[k];
-    left[2 * k] = left[k];
-    right[2 * k] = j;
   }
 
   /* Store the counts and offsets. */
   for (int k = 0; k < 8; k++) {
-    c->progeny[k]->count = right[k] - left[k] + 1;
-    c->progeny[k]->parts = &c->parts[left[k]];
-    c->progeny[k]->xparts = &c->xparts[left[k]];
+    c->progeny[k]->count = bucket_count[k];
+    c->progeny[k]->parts = &c->parts[bucket_offset[k]];
+    c->progeny[k]->xparts = &c->xparts[bucket_offset[k]];
   }
 
   /* Re-link the gparts. */
@@ -614,66 +567,51 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset) {
 #endif
 
   /* Now do the same song and dance for the gparts. */
-
-  /* Split along the x-axis. */
-  i = 0;
-  j = gcount - 1;
-  while (i <= j) {
-    while (i <= gcount - 1 && gparts[i].x[0] <= pivot[0]) i += 1;
-    while (j >= 0 && gparts[j].x[0] > pivot[0]) j -= 1;
-    if (i < j) {
-      struct gpart gtemp = gparts[i];
-      gparts[i] = gparts[j];
-      gparts[j] = gtemp;
-    }
+  for (int k = 0; k < 8; k++) bucket_count[k] = 0;
+
+  /* Fill the buffer with the indices. */
+  for (int k = 0; k < gcount; k++) {
+    const int bid = (gparts[k].x[0] > pivot[0]) * 4 +
+                    (gparts[k].x[1] > pivot[1]) * 2 +
+                    (gparts[k].x[2] > pivot[2]);
+    bucket_count[bid]++;
+    buff[k] = bid;
   }
-  left[1] = i;
-  right[1] = gcount - 1;
-  left[0] = 0;
-  right[0] = j;
-
-  /* Split along the y axis, twice. */
-  for (int k = 1; k >= 0; k--) {
-    i = left[k];
-    j = right[k];
-    while (i <= j) {
-      while (i <= right[k] && gparts[i].x[1] <= pivot[1]) i += 1;
-      while (j >= left[k] && gparts[j].x[1] > pivot[1]) j -= 1;
-      if (i < j) {
-        struct gpart gtemp = gparts[i];
-        gparts[i] = gparts[j];
-        gparts[j] = gtemp;
-      }
-    }
-    left[2 * k + 1] = i;
-    right[2 * k + 1] = right[k];
-    left[2 * k] = left[k];
-    right[2 * k] = j;
+
+  /* Set the buffer offsets. */
+  bucket_offset[0] = 0;
+  for (int k = 1; k <= 8; k++) {
+    bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1];
+    bucket_count[k - 1] = 0;
   }
 
-  /* Split along the z axis, four times. */
-  for (int k = 3; k >= 0; k--) {
-    i = left[k];
-    j = right[k];
-    while (i <= j) {
-      while (i <= right[k] && gparts[i].x[2] <= pivot[2]) i += 1;
-      while (j >= left[k] && gparts[j].x[2] > pivot[2]) j -= 1;
-      if (i < j) {
-        struct gpart gtemp = gparts[i];
-        gparts[i] = gparts[j];
-        gparts[j] = gtemp;
+  /* Run through the buckets, and swap particles to their correct spot. */
+  for (int bucket = 0; bucket < 8; bucket++) {
+    for (int k = bucket_offset[bucket] + bucket_count[bucket];
+         k < bucket_offset[bucket + 1]; k++) {
+      int bid = buff[k];
+      if (bid != bucket) {
+        struct gpart gpart = gparts[k];
+        while (bid != bucket) {
+          int j = bucket_offset[bid] + bucket_count[bid]++;
+          while (buff[j] == bid) {
+            j++;
+            bucket_count[bid]++;
+          }
+          memswap(&gparts[j], &gpart, sizeof(struct gpart));
+          memswap(&buff[j], &bid, sizeof(int));
+        }
+        gparts[k] = gpart;
+        buff[k] = bid;
       }
+      bucket_count[bid]++;
     }
-    left[2 * k + 1] = i;
-    right[2 * k + 1] = right[k];
-    left[2 * k] = left[k];
-    right[2 * k] = j;
   }
 
   /* Store the counts and offsets. */
   for (int k = 0; k < 8; k++) {
-    c->progeny[k]->gcount = right[k] - left[k] + 1;
-    c->progeny[k]->gparts = &c->gparts[left[k]];
+    c->progeny[k]->gcount = bucket_count[k];
+    c->progeny[k]->gparts = &c->gparts[bucket_offset[k]];
   }
 
   /* Re-link the parts. */
diff --git a/src/cell.h b/src/cell.h
index eefb2114cde1705ab4e4c59f4b837bddeb093e2b..2cd13cf2ab6b934f6aab84bcbacf510270892866 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -272,7 +272,7 @@ struct cell {
   ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
 
 /* Function prototypes. */
-void cell_split(struct cell *c, ptrdiff_t parts_offset);
+void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff);
 void cell_sanitize(struct cell *c);
 int cell_locktree(struct cell *c);
 void cell_unlocktree(struct cell *c);
diff --git a/src/memswap.h b/src/memswap.h
new file mode 100644
index 0000000000000000000000000000000000000000..4643725535917952d12927d52187bc7306ced5ef
--- /dev/null
+++ b/src/memswap.h
@@ -0,0 +1,79 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_MEMSWAP_H
+#define SWIFT_MEMSWAP_H
+
+/* Config parameters. */
+#include "../config.h"
+
+#ifdef HAVE_IMMINTRIN_H
+/* Include the header file with the intrinsics for Intel architecture. */
+#include <immintrin.h>
+#endif
+
+#ifdef HAVE_ALTIVEC_H
+/* Include the header file with the intrinsics for Intel architecture. */
+#include <altivec.h>
+#endif
+
+/* Macro for in-place swap of two values a and b of type t. */
+#define swap_loop(t, a, b, c)  \
+  while (c >= sizeof(t)) {     \
+    register t temp = *(t *)a; \
+    *(t *)a = *(t *)b;         \
+    *(t *)b = temp;            \
+    a += sizeof(t);            \
+    b += sizeof(t);            \
+    bytes -= sizeof(t);        \
+  }
+
+/**
+ * @brief Swap the contents of two elements in-place.
+ *
+ * Keep in mind that this function works best when the underlying data
+ * is aligned to the vector length, e.g. with the @c
+ * __attribute__((aligned(32)))
+ * syntax, and the code is compiled with @c -funroll-loops.
+ *
+ * @param void_a Pointer to the first element.
+ * @param void_b Pointer to the second element.
+ * @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
+ */
+__attribute__((always_inline)) inline void memswap(void *void_a, void *void_b,
+                                                   size_t bytes) {
+  char *a = (char *)void_a, *b = (char *)void_b;
+#ifdef __AVX512F__
+  swap_loop(__m512i, a, b, bytes);
+#endif
+#ifdef __AVX__
+  swap_loop(__m256i, a, b, bytes);
+#endif
+#ifdef __SSE2__
+  swap_loop(__m128i, a, b, bytes);
+#endif
+#ifdef __ALTIVEC__
+  swap_loop(vector int, a, b, bytes);
+#endif
+  swap_loop(size_t, a, b, bytes);
+  swap_loop(int, a, b, bytes);
+  swap_loop(short, a, b, bytes);
+  swap_loop(char, a, b, bytes);
+}
+
+#endif /* SWIFT_MEMSWAP_H */
diff --git a/src/space.c b/src/space.c
index e04aad2207e736ac8fe4da28b7d22b7c7999765b..935677a9ebed97acfde8341ec1545ef4f33a56c0 100644
--- a/src/space.c
+++ b/src/space.c
@@ -49,6 +49,7 @@
 #include "hydro.h"
 #include "kernel_hydro.h"
 #include "lock.h"
+#include "memswap.h"
 #include "minmax.h"
 #include "runner.h"
 #include "threadpool.h"
@@ -382,7 +383,7 @@ void space_regrid(struct space *s, int verbose) {
       /* Finished with these. */
       free(oldnodeIDs);
     }
-#endif
+#endif /* WITH_MPI */
 
     // message( "rebuilding upper-level cells took %.3f %s." ,
     // clocks_from_ticks(double)(getticks() - tic), clocks_getunit());
@@ -453,21 +454,22 @@ void space_rebuild(struct space *s, int verbose) {
   struct cell *restrict cells_top = s->cells_top;
   const int ti_current = (s->e != NULL) ? s->e->ti_current : 0;
 
-  /* Run through the particles and get their cell index. */
-  const size_t ind_size = s->size_parts;
+  /* Run through the particles and get their cell index. Allocates
+     an index that is larger than the number of particles to avoid
+     re-allocating after shuffling. */
+  const size_t ind_size = s->size_parts + 100;
   int *ind;
   if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL)
     error("Failed to allocate temporary particle indices.");
-  if (ind_size > 0) space_parts_get_cell_index(s, ind, cells_top, verbose);
-  for (size_t i = 0; i < s->nr_parts; ++i) cells_top[ind[i]].count++;
+  if (s->size_parts > 0) space_parts_get_cell_index(s, ind, cells_top, verbose);
 
   /* Run through the gravity particles and get their cell index. */
-  const size_t gind_size = s->size_gparts;
+  const size_t gind_size = s->size_gparts + 100;
   int *gind;
   if ((gind = (int *)malloc(sizeof(int) * gind_size)) == NULL)
     error("Failed to allocate temporary g-particle indices.");
-  if (gind_size > 0) space_gparts_get_cell_index(s, gind, cells_top, verbose);
-  for (size_t i = 0; i < s->nr_gparts; ++i) cells_top[gind[i]].gcount++;
+  if (s->size_gparts > 0)
+    space_gparts_get_cell_index(s, gind, cells_top, verbose);
 
 #ifdef WITH_MPI
 
@@ -475,7 +477,6 @@ void space_rebuild(struct space *s, int verbose) {
   const int local_nodeID = s->e->nodeID;
   for (size_t k = 0; k < nr_parts;) {
     if (cells_top[ind[k]].nodeID != local_nodeID) {
-      cells_top[ind[k]].count -= 1;
       nr_parts -= 1;
       const struct part tp = s->parts[k];
       s->parts[k] = s->parts[nr_parts];
@@ -515,7 +516,6 @@ void space_rebuild(struct space *s, int verbose) {
   /* Move non-local gparts to the end of the list. */
   for (size_t k = 0; k < nr_gparts;) {
     if (cells_top[gind[k]].nodeID != local_nodeID) {
-      cells_top[gind[k]].gcount -= 1;
       nr_gparts -= 1;
       const struct gpart tp = s->gparts[k];
       s->gparts[k] = s->gparts[nr_gparts];
@@ -562,9 +562,9 @@ void space_rebuild(struct space *s, int verbose) {
   s->nr_gparts = nr_gparts + nr_gparts_exchanged;
 
   /* Re-allocate the index array if needed.. */
-  if (s->nr_parts > ind_size) {
+  if (s->nr_parts + 1 > ind_size) {
     int *ind_new;
-    if ((ind_new = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
+    if ((ind_new = (int *)malloc(sizeof(int) * (s->nr_parts + 1))) == NULL)
       error("Failed to allocate temporary particle indices.");
     memcpy(ind_new, ind, sizeof(int) * nr_parts);
     free(ind);
@@ -579,7 +579,6 @@ void space_rebuild(struct space *s, int verbose) {
     const struct part *const p = &s->parts[k];
     ind[k] =
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
-    cells_top[ind[k]].count += 1;
 #ifdef SWIFT_DEBUG_CHECKS
     if (cells_top[ind[k]].nodeID != local_nodeID)
       error("Received part that does not belong to me (nodeID=%i).",
@@ -591,7 +590,8 @@ void space_rebuild(struct space *s, int verbose) {
 #endif /* WITH_MPI */
 
   /* Sort the parts according to their cells. */
-  space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose);
+  if (nr_parts > 0)
+    space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose);
 
   /* Re-link the gparts. */
   if (nr_parts > 0 && nr_gparts > 0) part_relink_gparts(s->parts, nr_parts, 0);
@@ -609,15 +609,25 @@ void space_rebuild(struct space *s, int verbose) {
   }
 #endif
 
+  /* Extract the cell counts from the sorted indices. */
+  size_t last_index = 0;
+  ind[nr_parts] = s->nr_cells;  // sentinel.
+  for (size_t k = 0; k < nr_parts; k++) {
+    if (ind[k] < ind[k + 1]) {
+      cells_top[ind[k]].count = k - last_index + 1;
+      last_index = k + 1;
+    }
+  }
+
   /* We no longer need the indices as of here. */
   free(ind);
 
 #ifdef WITH_MPI
 
   /* Re-allocate the index array if needed.. */
-  if (s->nr_gparts > gind_size) {
+  if (s->nr_gparts + 1 > gind_size) {
     int *gind_new;
-    if ((gind_new = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
+    if ((gind_new = (int *)malloc(sizeof(int) * (s->nr_gparts + 1))) == NULL)
       error("Failed to allocate temporary g-particle indices.");
     memcpy(gind_new, gind, sizeof(int) * nr_gparts);
     free(gind);
@@ -629,12 +639,11 @@ void space_rebuild(struct space *s, int verbose) {
     const struct gpart *const p = &s->gparts[k];
     gind[k] =
         cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
-    cells_top[gind[k]].gcount += 1;
 
 #ifdef SWIFT_DEBUG_CHECKS
-    if (cells_top[ind[k]].nodeID != s->e->nodeID)
+    if (cells_top[gind[k]].nodeID != s->e->nodeID)
       error("Received part that does not belong to me (nodeID=%i).",
-            cells_top[ind[k]].nodeID);
+            cells_top[gind[k]].nodeID);
 #endif
   }
   nr_gparts = s->nr_gparts;
@@ -642,12 +651,23 @@ void space_rebuild(struct space *s, int verbose) {
 #endif
 
   /* Sort the gparts according to their cells. */
-  space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose);
+  if (nr_gparts > 0)
+    space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose);
 
   /* Re-link the parts. */
   if (nr_parts > 0 && nr_gparts > 0)
     part_relink_parts(s->gparts, nr_gparts, s->parts);
 
+  /* Extract the cell counts from the sorted indices. */
+  size_t last_gindex = 0;
+  gind[nr_gparts] = s->nr_cells;
+  for (size_t k = 0; k < nr_gparts; k++) {
+    if (gind[k] < gind[k + 1]) {
+      cells_top[gind[k]].gcount = k - last_gindex + 1;
+      last_gindex = k + 1;
+    }
+  }
+
   /* We no longer need the indices as of here. */
   free(gind);
 
@@ -1011,15 +1031,9 @@ void space_parts_sort_mapper(void *map_data, int num_elements,
         while (ii <= j && ind[ii] <= pivot) ii++;
         while (jj >= i && ind[jj] > pivot) jj--;
         if (ii < jj) {
-          size_t temp_i = ind[ii];
-          ind[ii] = ind[jj];
-          ind[jj] = temp_i;
-          struct part temp_p = parts[ii];
-          parts[ii] = parts[jj];
-          parts[jj] = temp_p;
-          struct xpart temp_xp = xparts[ii];
-          xparts[ii] = xparts[jj];
-          xparts[jj] = temp_xp;
+          memswap(&ind[ii], &ind[jj], sizeof(int));
+          memswap(&parts[ii], &parts[jj], sizeof(struct part));
+          memswap(&xparts[ii], &xparts[jj], sizeof(struct xpart));
         }
       }
 
@@ -1194,12 +1208,8 @@ void space_gparts_sort_mapper(void *map_data, int num_elements,
         while (ii <= j && ind[ii] <= pivot) ii++;
         while (jj >= i && ind[jj] > pivot) jj--;
         if (ii < jj) {
-          size_t temp_i = ind[ii];
-          ind[ii] = ind[jj];
-          ind[jj] = temp_i;
-          struct gpart temp_p = gparts[ii];
-          gparts[ii] = gparts[jj];
-          gparts[jj] = temp_p;
+          memswap(&ind[ii], &ind[jj], sizeof(int));
+          memswap(&gparts[ii], &gparts[jj], sizeof(struct gpart));
         }
       }
 
@@ -1425,144 +1435,164 @@ void space_map_cells_pre(struct space *s, int full,
 }
 
 /**
- * @brief #threadpool mapper function to split cells if they contain
- *        too many particles.
+ * @brief Recursively split a cell.
  *
- * @param map_data Pointer towards the top-cells.
- * @param num_cells The number of cells to treat.
- * @param extra_data Pointers to the #space.
+ * @param s The #space in which the cell lives.
+ * @param c The #cell to split recursively.
+ * @param buff A buffer for particle sorting, should be of size at least
+ *        max(c->count, c->gount) or @c NULL.
  */
-void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
-
-  /* Unpack the inputs. */
-  struct space *s = (struct space *)extra_data;
-  struct cell *restrict cells_top = (struct cell *)map_data;
+void space_split_recursive(struct space *s, struct cell *c, int *buff) {
+
+  const int count = c->count;
+  const int gcount = c->gcount;
+  const int depth = c->depth;
+  int maxdepth = 0;
+  float h_max = 0.0f;
+  int ti_end_min = max_nr_timesteps, ti_end_max = 0;
+  struct cell *temp;
+  struct part *parts = c->parts;
+  struct gpart *gparts = c->gparts;
+  struct xpart *xparts = c->xparts;
   struct engine *e = s->e;
 
-  for (int ind = 0; ind < num_cells; ind++) {
+  /* If the buff is NULL, allocate it, and remember to free it. */
+  const int allocate_buffer = (buff == NULL);
+  if (allocate_buffer &&
+      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
+    error("Failed to allocate temporary indices.");
 
-    struct cell *c = &cells_top[ind];
+  /* Check the depth. */
+  while (depth > (maxdepth = s->maxdepth)) {
+    atomic_cas(&s->maxdepth, maxdepth, depth);
+  }
 
-    const int count = c->count;
-    const int gcount = c->gcount;
-    const int depth = c->depth;
-    int maxdepth = 0;
-    float h_max = 0.0f;
-    int ti_end_min = max_nr_timesteps, ti_end_max = 0;
-    struct cell *temp;
-    struct part *parts = c->parts;
-    struct gpart *gparts = c->gparts;
-    struct xpart *xparts = c->xparts;
-
-    /* Check the depth. */
-    while (depth > (maxdepth = s->maxdepth)) {
-      atomic_cas(&s->maxdepth, maxdepth, depth);
-    }
+  /* If the depth is too large, we have a problem and should stop. */
+  if (maxdepth > space_cell_maxdepth) {
+    error("Exceeded maximum depth (%d) when splitting cells, aborting",
+          space_cell_maxdepth);
+  }
 
-    /* If the depth is too large, we have a problem and should stop. */
-    if (maxdepth > space_cell_maxdepth) {
-      error("Exceeded maximum depth (%d) when splitting cells, aborting",
-            space_cell_maxdepth);
+  /* Split or let it be? */
+  if (count > space_splitsize || gcount > space_splitsize) {
+
+    /* No longer just a leaf. */
+    c->split = 1;
+
+    /* Create the cell's progeny. */
+    for (int k = 0; k < 8; k++) {
+      temp = space_getcell(s);
+      temp->count = 0;
+      temp->gcount = 0;
+      temp->ti_old = e->ti_current;
+      temp->loc[0] = c->loc[0];
+      temp->loc[1] = c->loc[1];
+      temp->loc[2] = c->loc[2];
+      temp->width[0] = c->width[0] / 2;
+      temp->width[1] = c->width[1] / 2;
+      temp->width[2] = c->width[2] / 2;
+      temp->dmin = c->dmin / 2;
+      if (k & 4) temp->loc[0] += temp->width[0];
+      if (k & 2) temp->loc[1] += temp->width[1];
+      if (k & 1) temp->loc[2] += temp->width[2];
+      temp->depth = c->depth + 1;
+      temp->split = 0;
+      temp->h_max = 0.0;
+      temp->dx_max = 0.f;
+      temp->nodeID = c->nodeID;
+      temp->parent = c;
+      temp->super = NULL;
+      c->progeny[k] = temp;
     }
 
-    /* Split or let it be? */
-    if (count > space_splitsize || gcount > space_splitsize) {
-
-      /* No longer just a leaf. */
-      c->split = 1;
-
-      /* Create the cell's progeny. */
-      for (int k = 0; k < 8; k++) {
-        temp = space_getcell(s);
-        temp->count = 0;
-        temp->gcount = 0;
-        temp->ti_old = e->ti_current;
-        temp->loc[0] = c->loc[0];
-        temp->loc[1] = c->loc[1];
-        temp->loc[2] = c->loc[2];
-        temp->width[0] = c->width[0] / 2;
-        temp->width[1] = c->width[1] / 2;
-        temp->width[2] = c->width[2] / 2;
-        temp->dmin = c->dmin / 2;
-        if (k & 4) temp->loc[0] += temp->width[0];
-        if (k & 2) temp->loc[1] += temp->width[1];
-        if (k & 1) temp->loc[2] += temp->width[2];
-        temp->depth = c->depth + 1;
-        temp->split = 0;
-        temp->h_max = 0.0;
-        temp->dx_max = 0.f;
-        temp->nodeID = c->nodeID;
-        temp->parent = c;
-        temp->super = NULL;
-        c->progeny[k] = temp;
+    /* Split the cell data. */
+    cell_split(c, c->parts - s->parts, buff);
+
+    /* Remove any progeny with zero parts. */
+    for (int k = 0; k < 8; k++)
+      if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) {
+        space_recycle(s, c->progeny[k]);
+        c->progeny[k] = NULL;
+      } else {
+        space_split_recursive(s, c->progeny[k], buff);
+        h_max = max(h_max, c->progeny[k]->h_max);
+        ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
+        ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
+        if (c->progeny[k]->maxdepth > maxdepth)
+          maxdepth = c->progeny[k]->maxdepth;
       }
 
-      /* Split the cell data. */
-      cell_split(c, c->parts - s->parts);
-
-      /* Remove any progeny with zero parts. */
-      for (int k = 0; k < 8; k++)
-        if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) {
-          space_recycle(s, c->progeny[k]);
-          c->progeny[k] = NULL;
-        } else {
-          space_split_mapper(c->progeny[k], 1, s);
-          h_max = max(h_max, c->progeny[k]->h_max);
-          ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
-          ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
-          if (c->progeny[k]->maxdepth > maxdepth)
-            maxdepth = c->progeny[k]->maxdepth;
-        }
+  }
 
+  /* Otherwise, collect the data for this cell. */
+  else {
+
+    /* Clear the progeny. */
+    bzero(c->progeny, sizeof(struct cell *) * 8);
+    c->split = 0;
+    maxdepth = c->depth;
+
+    /* Get dt_min/dt_max. */
+    for (int k = 0; k < count; k++) {
+      struct part *p = &parts[k];
+      struct xpart *xp = &xparts[k];
+      const float h = p->h;
+      const int ti_end = p->ti_end;
+      xp->x_diff[0] = 0.f;
+      xp->x_diff[1] = 0.f;
+      xp->x_diff[2] = 0.f;
+      if (h > h_max) h_max = h;
+      if (ti_end < ti_end_min) ti_end_min = ti_end;
+      if (ti_end > ti_end_max) ti_end_max = ti_end;
     }
-
-    /* Otherwise, collect the data for this cell. */
-    else {
-
-      /* Clear the progeny. */
-      bzero(c->progeny, sizeof(struct cell *) * 8);
-      c->split = 0;
-      maxdepth = c->depth;
-
-      /* Get dt_min/dt_max. */
-      for (int k = 0; k < count; k++) {
-        struct part *p = &parts[k];
-        struct xpart *xp = &xparts[k];
-        const float h = p->h;
-        const int ti_end = p->ti_end;
-        xp->x_diff[0] = 0.f;
-        xp->x_diff[1] = 0.f;
-        xp->x_diff[2] = 0.f;
-        if (h > h_max) h_max = h;
-        if (ti_end < ti_end_min) ti_end_min = ti_end;
-        if (ti_end > ti_end_max) ti_end_max = ti_end;
-      }
-      for (int k = 0; k < gcount; k++) {
-        struct gpart *gp = &gparts[k];
-        const int ti_end = gp->ti_end;
-        gp->x_diff[0] = 0.f;
-        gp->x_diff[1] = 0.f;
-        gp->x_diff[2] = 0.f;
-        if (ti_end < ti_end_min) ti_end_min = ti_end;
-        if (ti_end > ti_end_max) ti_end_max = ti_end;
-      }
+    for (int k = 0; k < gcount; k++) {
+      struct gpart *gp = &gparts[k];
+      const int ti_end = gp->ti_end;
+      gp->x_diff[0] = 0.f;
+      gp->x_diff[1] = 0.f;
+      gp->x_diff[2] = 0.f;
+      if (ti_end < ti_end_min) ti_end_min = ti_end;
+      if (ti_end > ti_end_max) ti_end_max = ti_end;
     }
+  }
 
-    /* Set the values for this cell. */
-    c->h_max = h_max;
-    c->ti_end_min = ti_end_min;
-    c->ti_end_max = ti_end_max;
-    c->maxdepth = maxdepth;
-
-    /* Set ownership according to the start of the parts array. */
-    if (s->nr_parts > 0)
-      c->owner =
-          ((c->parts - s->parts) % s->nr_parts) * s->nr_queues / s->nr_parts;
-    else if (s->nr_gparts > 0)
-      c->owner = ((c->gparts - s->gparts) % s->nr_gparts) * s->nr_queues /
-                 s->nr_gparts;
-    else
-      c->owner = 0; /* Ok, there is really nothing on this rank... */
+  /* Set the values for this cell. */
+  c->h_max = h_max;
+  c->ti_end_min = ti_end_min;
+  c->ti_end_max = ti_end_max;
+  c->maxdepth = maxdepth;
+
+  /* Set ownership according to the start of the parts array. */
+  if (s->nr_parts > 0)
+    c->owner =
+        ((c->parts - s->parts) % s->nr_parts) * s->nr_queues / s->nr_parts;
+  else if (s->nr_gparts > 0)
+    c->owner =
+        ((c->gparts - s->gparts) % s->nr_gparts) * s->nr_queues / s->nr_gparts;
+  else
+    c->owner = 0; /* Ok, there is really nothing on this rank... */
+
+  /* Clean up. */
+  if (allocate_buffer) free(buff);
+}
+
+/**
+ * @brief #threadpool mapper function to split cells if they contain
+ *        too many particles.
+ *
+ * @param map_data Pointer towards the top-cells.
+ * @param num_cells The number of cells to treat.
+ * @param extra_data Pointers to the #space.
+ */
+void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
+
+  /* Unpack the inputs. */
+  struct space *s = (struct space *)extra_data;
+  struct cell *restrict cells_top = (struct cell *)map_data;
+
+  for (int ind = 0; ind < num_cells; ind++) {
+    struct cell *c = &cells_top[ind];
+    space_split_recursive(s, c, NULL);
   }
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -1603,7 +1633,6 @@ void space_recycle(struct space *s, struct cell *c) {
   /* Unlock the space. */
   lock_unlock_blind(&s->lock);
 }
-
 /**
  * @brief Get a new empty (sub-)#cell.
  *