diff --git a/configure.ac b/configure.ac
index 9fa9a1de591d63794dde5db6a8dd733cfcaada09..a02dcc57c720f1a9a792b160485caee728a91b98 100644
--- a/configure.ac
+++ b/configure.ac
@@ -351,7 +351,7 @@ AC_ARG_WITH([tcmalloc],
    [with_tcmalloc="no"]
 )
 if test "x$with_tcmalloc" != "xno"; then
-   if test "x$with_tcmalloc" != "xyes" && test "x$with_tcmalloc" != "x"; then
+   if test "x$with_tcmalloc" != "xyes" -a "x$with_tcmalloc" != "x"; then
       tclibs="-L$with_tcmalloc -ltcmalloc"
    else
       tclibs="-ltcmalloc"
@@ -361,7 +361,7 @@ if test "x$with_tcmalloc" != "xno"; then
 
    #  Could just have the minimal version.
    if test "$have_tcmalloc" = "no"; then
-      if test "x$with_tcmalloc" != "xyes" && test "x$with_tcmalloc" != "x"; then
+      if test "x$with_tcmalloc" != "xyes" -a "x$with_tcmalloc" != "x"; then
          tclibs="-L$with_tcmalloc -ltcmalloc_minimal"
       else
          tclibs="-ltcmalloc_minimal"
@@ -394,7 +394,7 @@ AC_ARG_WITH([profiler],
    [with_profiler="yes"]
 )
 if test "x$with_profiler" != "xno"; then
-   if test "x$with_profiler" != "xyes" && test "x$with_profiler" != "x"; then
+   if test "x$with_profiler" != "xyes" -a "x$with_profiler" != "x"; then
       proflibs="-L$with_profiler -lprofiler"
    else
       proflibs="-lprofiler"
@@ -411,6 +411,38 @@ fi
 AC_SUBST([PROFILER_LIBS])
 AM_CONDITIONAL([HAVEPROFILER],[test -n "$PROFILER_LIBS"])
 
+#  Check for jemalloc another fast malloc that is good with contention.
+have_jemalloc="no"
+AC_ARG_WITH([jemalloc],
+   [AS_HELP_STRING([--with-jemalloc],
+      [use jemalloc library or specify the directory with lib @<:@yes/no@:>@]
+   )],
+   [with_jemalloc="$withval"],
+   [with_jemalloc="no"]
+)
+if test "x$with_jemalloc" != "xno"; then
+   if test "x$with_jemalloc" != "xyes" -a "x$with_jemalloc" != "x"; then
+      jelibs="-L$with_jemalloc -ljemalloc"
+   else
+      jelibs="-ljemalloc"
+   fi
+   AC_CHECK_LIB([jemalloc],[malloc_usable_size],[have_jemalloc="yes"],[have_jemalloc="no"],
+                $jelibs)
+
+   if test "$have_jemalloc" = "yes"; then
+      JEMALLOC_LIBS="$jelibs"
+   else
+      JEMALLOC_LIBS=""
+   fi
+fi
+AC_SUBST([JEMALLOC_LIBS])
+AM_CONDITIONAL([HAVEJEMALLOC],[test -n "$JEMALLOC_LIBS"])
+
+#  Don't allow both tcmalloc and jemalloc.
+if test "x$have_tcmalloc" != "xno" -a "x$have_jemalloc" != "xno"; then
+   AC_MSG_ERROR([Cannot use tcmalloc at same time as jemalloc])
+fi
+
 # Check for HDF5. This is required.
 AX_LIB_HDF5
 
@@ -781,6 +813,7 @@ AC_MSG_RESULT([
    FFTW3 enabled   : $have_fftw3
    libNUMA enabled : $have_numa
    Using tcmalloc  : $have_tcmalloc
+   Using jemalloc  : $have_jemalloc
    CPU profiler    : $have_profiler
 
    Hydro scheme       : $with_hydro
diff --git a/examples/Makefile.am b/examples/Makefile.am
index 4da84788a485dacd2103fe85ad3e729ade6b582a..28a4629bdb401c0736379a2fe14a3a5f19caf650 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -24,7 +24,7 @@ AM_CFLAGS = -I$(top_srcdir)/src $(HDF5_CPPFLAGS)
 AM_LDFLAGS = $(HDF5_LDFLAGS)
 
 # Extra libraries.
-EXTRA_LIBS = $(HDF5_LIBS) $(FFTW_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS)
+EXTRA_LIBS = $(HDF5_LIBS) $(FFTW_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS) $(JEMALLOC_LIBS)
 
 # MPI libraries.
 MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS)
diff --git a/src/Makefile.am b/src/Makefile.am
index a336c635091694916955d89d9327d5c5c672ec63..1b9d3b4ae9770b9081ba384851fe6cbf1f2ad688 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -25,7 +25,7 @@ AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0
 GIT_CMD = @GIT_CMD@
 
 # Additional dependencies for shared libraries.
-EXTRA_LIBS = $(HDF5_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS)
+EXTRA_LIBS = $(HDF5_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS) $(JEMALLOC_LIBS)
 
 # MPI libraries.
 MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS)
diff --git a/src/align.h b/src/align.h
index 84e2909c0866c18f0f8378df9d0efc8d0f6545b5..915af33e6e2ba59be1a0849c4de0e2f1bd5b0d96 100644
--- a/src/align.h
+++ b/src/align.h
@@ -19,9 +19,13 @@
 #ifndef SWIFT_ALIGN_H
 #define SWIFT_ALIGN_H
 
+/**
+ * @brief The default struct alignment in SWIFT.
+ */
+#define SWIFT_STRUCT_ALIGNMENT 32
 /**
  * @brief Defines alignment of structures
  */
-#define SWIFT_STRUCT_ALIGN __attribute__((aligned(32)))
+#define SWIFT_STRUCT_ALIGN __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT)))
 
 #endif /* SWIFT_ALIGN_H */
diff --git a/src/cell.c b/src/cell.c
index e2767cdaa9e1189ec87b5ef51cc578c91f8cfe4c..b08d9ce8b10b01e2f107ecaae5fb56936a4688e6 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -467,8 +467,11 @@ void cell_gunlocktree(struct cell *c) {
  *        space's parts array, i.e. c->parts - s->parts.
  * @param buff A buffer with at least max(c->count, c->gcount) entries,
  *        used for sorting indices.
+ * @param gbuff A buffer with at least max(c->count, c->gcount) entries,
+ *        used for sorting indices for the gparts.
  */
-void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
+void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
+                struct cell_buff *gbuff) {
 
   const int count = c->count, gcount = c->gcount;
   struct part *parts = c->parts;
@@ -480,18 +483,21 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0};
   int bucket_offset[9];
 
-  /* If the buff is NULL, allocate it, and remember to free it. */
-  const int allocate_buffer = (buff == NULL);
-  if (allocate_buffer &&
-      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
-    error("Failed to allocate temporary indices.");
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the buffs are OK. */
+  for (int k = 0; k < count; k++) {
+    if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
+        buff[k].x[2] != parts[k].x[2])
+      error("Inconsistent buff contents.");
+  }
+#endif /* SWIFT_DEBUG_CHECKS */
 
   /* Fill the buffer with the indices. */
   for (int k = 0; k < count; k++) {
-    const int bid = (parts[k].x[0] > pivot[0]) * 4 +
-                    (parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]);
+    const int bid = (buff[k].x[0] > pivot[0]) * 4 +
+                    (buff[k].x[1] > pivot[1]) * 2 + (buff[k].x[2] > pivot[2]);
     bucket_count[bid]++;
-    buff[k] = bid;
+    buff[k].ind = bid;
   }
 
   /* Set the buffer offsets. */
@@ -505,23 +511,25 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   for (int bucket = 0; bucket < 8; bucket++) {
     for (int k = bucket_offset[bucket] + bucket_count[bucket];
          k < bucket_offset[bucket + 1]; k++) {
-      int bid = buff[k];
+      int bid = buff[k].ind;
       if (bid != bucket) {
         struct part part = parts[k];
         struct xpart xpart = xparts[k];
+        struct cell_buff temp_buff = buff[k];
         while (bid != bucket) {
           int j = bucket_offset[bid] + bucket_count[bid]++;
-          while (buff[j] == bid) {
+          while (buff[j].ind == bid) {
             j++;
             bucket_count[bid]++;
           }
           memswap(&parts[j], &part, sizeof(struct part));
           memswap(&xparts[j], &xpart, sizeof(struct xpart));
-          memswap(&buff[j], &bid, sizeof(int));
+          memswap(&buff[j], &temp_buff, sizeof(struct cell_buff));
+          bid = temp_buff.ind;
         }
         parts[k] = part;
         xparts[k] = xpart;
-        buff[k] = bid;
+        buff[k] = temp_buff;
       }
       bucket_count[bid]++;
     }
@@ -538,6 +546,14 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   if (count > 0 && gcount > 0) part_relink_gparts(parts, count, parts_offset);
 
 #ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the buffs are OK. */
+  for (int k = 1; k < count; k++) {
+    if (buff[k].ind < buff[k - 1].ind) error("Buff not sorted.");
+    if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
+        buff[k].x[2] != parts[k].x[2])
+      error("Inconsistent buff contents (k=%i).", k);
+  }
+
   /* Verify that _all_ the parts have been assigned to a cell. */
   for (int k = 1; k < 8; k++)
     if (&c->progeny[k - 1]->parts[c->progeny[k - 1]->count] !=
@@ -564,6 +580,31 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
         c->progeny[2]->parts[k].x[1] <= pivot[1] ||
         c->progeny[2]->parts[k].x[2] > pivot[2])
       error("Sorting failed (progeny=2).");
+  for (int k = 0; k < c->progeny[3]->count; k++)
+    if (c->progeny[3]->parts[k].x[0] > pivot[0] ||
+        c->progeny[3]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[3]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=3).");
+  for (int k = 0; k < c->progeny[4]->count; k++)
+    if (c->progeny[4]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[4]->parts[k].x[1] > pivot[1] ||
+        c->progeny[4]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=4).");
+  for (int k = 0; k < c->progeny[5]->count; k++)
+    if (c->progeny[5]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[5]->parts[k].x[1] > pivot[1] ||
+        c->progeny[5]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=5).");
+  for (int k = 0; k < c->progeny[6]->count; k++)
+    if (c->progeny[6]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[6]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[6]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=6).");
+  for (int k = 0; k < c->progeny[7]->count; k++)
+    if (c->progeny[7]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[7]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[7]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=7).");
 #endif
 
   /* Now do the same song and dance for the gparts. */
@@ -571,11 +612,10 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
 
   /* Fill the buffer with the indices. */
   for (int k = 0; k < gcount; k++) {
-    const int bid = (gparts[k].x[0] > pivot[0]) * 4 +
-                    (gparts[k].x[1] > pivot[1]) * 2 +
-                    (gparts[k].x[2] > pivot[2]);
+    const int bid = (gbuff[k].x[0] > pivot[0]) * 4 +
+                    (gbuff[k].x[1] > pivot[1]) * 2 + (gbuff[k].x[2] > pivot[2]);
     bucket_count[bid]++;
-    buff[k] = bid;
+    gbuff[k].ind = bid;
   }
 
   /* Set the buffer offsets. */
@@ -589,20 +629,22 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   for (int bucket = 0; bucket < 8; bucket++) {
     for (int k = bucket_offset[bucket] + bucket_count[bucket];
          k < bucket_offset[bucket + 1]; k++) {
-      int bid = buff[k];
+      int bid = gbuff[k].ind;
       if (bid != bucket) {
         struct gpart gpart = gparts[k];
+        struct cell_buff temp_buff = gbuff[k];
         while (bid != bucket) {
           int j = bucket_offset[bid] + bucket_count[bid]++;
-          while (buff[j] == bid) {
+          while (gbuff[j].ind == bid) {
             j++;
             bucket_count[bid]++;
           }
           memswap(&gparts[j], &gpart, sizeof(struct gpart));
-          memswap(&buff[j], &bid, sizeof(int));
+          memswap(&gbuff[j], &temp_buff, sizeof(struct cell_buff));
+          bid = temp_buff.ind;
         }
         gparts[k] = gpart;
-        buff[k] = bid;
+        gbuff[k] = temp_buff;
       }
       bucket_count[bid]++;
     }
diff --git a/src/cell.h b/src/cell.h
index 2cd13cf2ab6b934f6aab84bcbacf510270892866..984ef4bc20cf07285b136461815cd0851c12291a 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -52,6 +52,12 @@ struct scheduler;
 /* Global variables. */
 extern int cell_next_tag;
 
+/* Struct to temporarily buffer the particle locations and bin id. */
+struct cell_buff {
+  double x[3];
+  int ind;
+} SWIFT_STRUCT_ALIGN;
+
 /* Mini struct to link cells to tasks. Used as a linked list. */
 struct link {
 
@@ -272,7 +278,8 @@ struct cell {
   ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
 
 /* Function prototypes. */
-void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff);
+void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
+                struct cell_buff *gbuff);
 void cell_sanitize(struct cell *c);
 int cell_locktree(struct cell *c);
 void cell_unlocktree(struct cell *c);
diff --git a/src/debug.c b/src/debug.c
index 48572df7f046944613d2598b0d340e949ad3ab7e..21f539b62eef3b0b36f52520486f1e725abc0cda 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -227,21 +227,21 @@ int checkCellhdxmax(const struct cell *c, int *depth) {
   if (c->h_max != h_max) {
     message("%d Inconsistent h_max: cell %f != parts %f", *depth, c->h_max,
             h_max);
-    message("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
+    error("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
     result = 0;
   }
   if (c->dx_max != dx_max) {
     message("%d Inconsistent dx_max: %f != %f", *depth, c->dx_max, dx_max);
-    message("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
+    error("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
     result = 0;
   }
 
   /* Check rebuild criterion. */
-  if (h_max > c->dmin) {
+  /* if (h_max > c->dmin) {
     message("%d Inconsistent c->dmin: %f > %f", *depth, h_max, c->dmin);
-    message("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
+    error("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
     result = 0;
-  }
+  } */
 
   return result;
 }
diff --git a/src/engine.c b/src/engine.c
index d604042fffd2f9b8f5677cba72d61e9d75b6a311..37b6c45598a0f56ccf4ec10fe7717ea357e90cfd 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -318,6 +318,23 @@ void engine_redistribute(struct engine *e) {
                     MPI_COMM_WORLD) != MPI_SUCCESS)
     error("Failed to allreduce particle transfer counts.");
 
+  /* Report how many particles will be moved. */
+  if (e->verbose) {
+    if (e->nodeID == 0) {
+      size_t total = 0;
+      size_t unmoved = 0;
+      for (int p = 0, r = 0; p < nr_nodes; p++) {
+        for (int s = 0; s < nr_nodes; s++) {
+          total += counts[r];
+          if (p == s) unmoved += counts[r];
+          r++;
+        }
+      }
+      message("%ld of %ld (%.2f%%) of particles moved", total - unmoved, total,
+              100.0 * (double)(total - unmoved) / (double)total);
+    }
+  }
+
   /* Get all the g_counts from all the nodes. */
   if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
                     MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
diff --git a/src/hydro/Minimal/hydro.h b/src/hydro/Minimal/hydro.h
index 3b3454f1bb348b178ac57899da4f7611802a69cd..beb6f98b8c0d781aa709fb6ee3ca564a52704db2 100644
--- a/src/hydro/Minimal/hydro.h
+++ b/src/hydro/Minimal/hydro.h
@@ -66,7 +66,9 @@ __attribute__((always_inline)) INLINE static float hydro_get_internal_energy(
 __attribute__((always_inline)) INLINE static float hydro_get_pressure(
     const struct part *restrict p, float dt) {
 
-  return p->force.pressure;
+  const float u = p->u + p->u_dt * dt;
+
+  return gas_pressure_from_internal_energy(p->rho, u);
 }
 
 /**
diff --git a/src/memswap.h b/src/memswap.h
index 4643725535917952d12927d52187bc7306ced5ef..92c902eeb158978d4a606f5f2a9416d4113fae0b 100644
--- a/src/memswap.h
+++ b/src/memswap.h
@@ -32,24 +32,27 @@
 #include <altivec.h>
 #endif
 
-/* Macro for in-place swap of two values a and b of type t. */
-#define swap_loop(t, a, b, c)  \
-  while (c >= sizeof(t)) {     \
-    register t temp = *(t *)a; \
-    *(t *)a = *(t *)b;         \
-    *(t *)b = temp;            \
-    a += sizeof(t);            \
-    b += sizeof(t);            \
-    bytes -= sizeof(t);        \
+/* Macro for in-place swap of two values a and b of type t. a and b are
+   assumed to be of type char* so that the pointer arithmetic works. */
+#define swap_loop(type, a, b, count) \
+  while (count >= sizeof(type)) {    \
+    register type temp = *(type *)a; \
+    *(type *)a = *(type *)b;         \
+    *(type *)b = temp;               \
+    a += sizeof(type);               \
+    b += sizeof(type);               \
+    count -= sizeof(type);           \
   }
 
 /**
  * @brief Swap the contents of two elements in-place.
  *
- * Keep in mind that this function works best when the underlying data
+ * Keep in mind that this function only works when the underlying data
  * is aligned to the vector length, e.g. with the @c
  * __attribute__((aligned(32)))
- * syntax, and the code is compiled with @c -funroll-loops.
+ * syntax!
+ * Furthermore, register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
  *
  * @param void_a Pointer to the first element.
  * @param void_b Pointer to the second element.
@@ -76,4 +79,63 @@ __attribute__((always_inline)) inline void memswap(void *void_a, void *void_b,
   swap_loop(char, a, b, bytes);
 }
 
+/**
+ * @brief Swap the contents of two elements in-place.
+ *
+ * As opposed to #memswap, this function does not require the parameters
+ * to be aligned in any specific way.
+ * Furthermore, register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
+ *
+ * @param void_a Pointer to the first element.
+ * @param void_b Pointer to the second element.
+ * @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
+ */
+__attribute__((always_inline)) inline void memswap_unaligned(void *void_a,
+                                                             void *void_b,
+                                                             size_t bytes) {
+  char *a = (char *)void_a, *b = (char *)void_b;
+#ifdef __AVX512F__
+  while (bytes >= sizeof(__m512i)) {
+    register __m512i temp;
+    temp = _mm512_loadu_si512((__m512i *)a);
+    _mm512_storeu_si512((__m512i *)a, _mm512_loadu_si512((__m512i *)b));
+    _mm512_storeu_si512((__m512i *)b, temp);
+    a += sizeof(__m512i);
+    b += sizeof(__m512i);
+    bytes -= sizeof(__m512i);
+  }
+#endif
+#ifdef __AVX__
+  while (bytes >= sizeof(__m256i)) {
+    register __m256i temp;
+    temp = _mm256_loadu_si256((__m256i *)a);
+    _mm256_storeu_si256((__m256i *)a, _mm256_loadu_si256((__m256i *)b));
+    _mm256_storeu_si256((__m256i *)b, temp);
+    a += sizeof(__m256i);
+    b += sizeof(__m256i);
+    bytes -= sizeof(__m256i);
+  }
+#endif
+#ifdef __SSE2__
+  while (bytes >= sizeof(__m128i)) {
+    register __m128i temp;
+    temp = _mm_loadu_si128((__m128i *)a);
+    _mm_storeu_si128((__m128i *)a, _mm_loadu_si128((__m128i *)b));
+    _mm_storeu_si128((__m128i *)b, temp);
+    a += sizeof(__m128i);
+    b += sizeof(__m128i);
+    bytes -= sizeof(__m128i);
+  }
+#endif
+#ifdef __ALTIVEC__
+  // Power8 supports unaligned load/stores, but not sure what it will do here.
+  swap_loop(vector int, a, b, bytes);
+#endif
+  swap_loop(size_t, a, b, bytes);
+  swap_loop(int, a, b, bytes);
+  swap_loop(short, a, b, bytes);
+  swap_loop(char, a, b, bytes);
+}
+
 #endif /* SWIFT_MEMSWAP_H */
diff --git a/src/partition.c b/src/partition.c
index 89ba3f2835cb78e07ec2bc5cb04c3f8f71751563..85745d880e3ab0f6beaf918a5c226730b6b82a7c 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -278,6 +278,18 @@ static void split_metis(struct space *s, int nregions, int *celllist) {
 #endif
 
 #if defined(WITH_MPI) && defined(HAVE_METIS)
+
+/* qsort support. */
+struct indexval {
+  int index;
+  int count;
+};
+static int indexvalcmp(const void *p1, const void *p2) {
+  const struct indexval *iv1 = (const struct indexval *)p1;
+  const struct indexval *iv2 = (const struct indexval *)p2;
+  return iv2->count - iv1->count;
+}
+
 /**
  * @brief Partition the given space into a number of connected regions.
  *
@@ -382,14 +394,70 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew,
     if (regionid[k] < 0 || regionid[k] >= nregions)
       error("Got bad nodeID %" PRIDX " for cell %i.", regionid[k], k);
 
+  /* We want a solution in which the current regions of the space are
+   * preserved when possible, to avoid unneccesary particle movement.
+   * So create a 2d-array of cells counts that are common to all pairs
+   * of old and new ranks. Each element of the array has a cell count and
+   * an unique index so we can sort into decreasing counts. */
+  int indmax = nregions * nregions;
+  struct indexval *ivs = malloc(sizeof(struct indexval) * indmax);
+  bzero(ivs, sizeof(struct indexval) * indmax);
+  for (int k = 0; k < ncells; k++) {
+    int index = regionid[k] + nregions * s->cells_top[k].nodeID;
+    ivs[index].count++;
+    ivs[index].index = index;
+  }
+  qsort(ivs, indmax, sizeof(struct indexval), indexvalcmp);
+
+  /* Go through the ivs using the largest counts first, these are the
+   * regions with the most cells in common, old partition to new. */
+  int *oldmap = malloc(sizeof(int) * nregions);
+  int *newmap = malloc(sizeof(int) * nregions);
+  for (int k = 0; k < nregions; k++) {
+    oldmap[k] = -1;
+    newmap[k] = -1;
+  }
+  for (int k = 0; k < indmax; k++) {
+
+    /* Stop when all regions with common cells have been considered. */
+    if (ivs[k].count == 0) break;
+
+    /* Store old and new IDs, if not already used. */
+    int oldregion = ivs[k].index / nregions;
+    int newregion = ivs[k].index - oldregion * nregions;
+    if (newmap[newregion] == -1 && oldmap[oldregion] == -1) {
+      newmap[newregion] = oldregion;
+      oldmap[oldregion] = newregion;
+    }
+  }
+
+  /* Handle any regions that did not get selected by picking an unused rank
+   * from oldmap and assigning to newmap. */
+  int spare = 0;
+  for (int k = 0; k < nregions; k++) {
+    if (newmap[k] == -1) {
+      for (int j = spare; j < nregions; j++) {
+        if (oldmap[j] == -1) {
+          newmap[k] = j;
+          oldmap[j] = j;
+          spare = j;
+          break;
+        }
+      }
+    }
+  }
+
   /* Set the cell list to the region index. */
   for (int k = 0; k < ncells; k++) {
-    celllist[k] = regionid[k];
+    celllist[k] = newmap[regionid[k]];
   }
 
   /* Clean up. */
   if (weights_v != NULL) free(weights_v);
   if (weights_e != NULL) free(weights_e);
+  free(ivs);
+  free(oldmap);
+  free(newmap);
   free(xadj);
   free(adjncy);
   free(regionid);
diff --git a/src/proxy.c b/src/proxy.c
index efe3a3eec108d44d5b9bf8b4718dc025464f8762..e68fe9cb3ad23956d3d3c012573b0af6047c1904 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -65,8 +65,8 @@ void proxy_cells_exch1(struct proxy *p) {
 
   /* Allocate and fill the pcell buffer. */
   if (p->pcells_out != NULL) free(p->pcells_out);
-  if ((p->pcells_out = malloc(sizeof(struct pcell) * p->size_pcells_out)) ==
-      NULL)
+  if (posix_memalign((void **)&p->pcells_out, SWIFT_STRUCT_ALIGNMENT,
+                     sizeof(struct pcell) * p->size_pcells_out) != 0)
     error("Failed to allocate pcell_out buffer.");
   for (int ind = 0, k = 0; k < p->nr_cells_out; k++) {
     memcpy(&p->pcells_out[ind], p->cells_out[k]->pcell,
@@ -102,8 +102,8 @@ void proxy_cells_exch2(struct proxy *p) {
 
   /* Re-allocate the pcell_in buffer. */
   if (p->pcells_in != NULL) free(p->pcells_in);
-  if ((p->pcells_in = (struct pcell *)malloc(sizeof(struct pcell) *
-                                             p->size_pcells_in)) == NULL)
+  if (posix_memalign((void **)&p->pcells_in, SWIFT_STRUCT_ALIGNMENT,
+                     sizeof(struct pcell) * p->size_pcells_in) != 0)
     error("Failed to allocate pcell_in buffer.");
 
   /* Receive the particle buffers. */
diff --git a/src/queue.h b/src/queue.h
index c0a2fb1da6e6e3cbea813a0ef53841084ab0f933..951a3e5a056d7ad0c3935f98341a0d93c805e3ad 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -30,6 +30,7 @@
 #define queue_sizegrow 2
 #define queue_search_window 8
 #define queue_incoming_size 1024
+#define queue_struct_align 64
 
 /* Counters. */
 enum {
@@ -57,7 +58,7 @@ struct queue {
   int *tid_incoming;
   volatile unsigned int first_incoming, last_incoming, count_incoming;
 
-} __attribute__((aligned(64)));
+} __attribute__((aligned(queue_struct_align)));
 
 /* Function prototypes. */
 struct task *queue_gettask(struct queue *q, const struct task *prev,
diff --git a/src/scheduler.c b/src/scheduler.c
index 0d7c8c4754bac931c7886200176e3e9441c63c53..6dc7d67a1467333ea08ed2c72795c172d9a942e1 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -1408,8 +1408,8 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
   lock_init(&s->lock);
 
   /* Allocate the queues. */
-  if ((s->queues = (struct queue *)malloc(sizeof(struct queue) * nr_queues)) ==
-      NULL)
+  if (posix_memalign((void **)&s->queues, queue_struct_align,
+                     sizeof(struct queue) * nr_queues) != 0)
     error("Failed to allocate queues.");
 
   /* Initialize each queue. */
diff --git a/src/space.c b/src/space.c
index a9d50dc3e94b5b320817f8ada5f81410fefdb93e..84af59d30fdebe839212ea9119aec92d2384802f 100644
--- a/src/space.c
+++ b/src/space.c
@@ -174,18 +174,68 @@ int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
  *
  * @param s The #space.
  * @param c The #cell to recycle.
+ * @param rec_begin Pointer to the start of the list of cells to recycle.
+ * @param rec_end Pointer to the end of the list of cells to recycle.
  */
-void space_rebuild_recycle(struct space *s, struct cell *c) {
-
+void space_rebuild_recycle_rec(struct space *s, struct cell *c,
+                               struct cell **rec_begin, struct cell **rec_end) {
   if (c->split)
     for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL) {
-        space_rebuild_recycle(s, c->progeny[k]);
-        space_recycle(s, c->progeny[k]);
+        space_rebuild_recycle_rec(s, c->progeny[k], rec_begin, rec_end);
+        c->progeny[k]->next = *rec_begin;
+        *rec_begin = c->progeny[k];
+        if (*rec_end == NULL) *rec_end = *rec_begin;
         c->progeny[k] = NULL;
       }
 }
 
+void space_rebuild_recycle_mapper(void *map_data, int num_elements,
+                                  void *extra_data) {
+
+  struct space *s = (struct space *)extra_data;
+  struct cell *cells = (struct cell *)map_data;
+
+  for (int k = 0; k < num_elements; k++) {
+    struct cell *c = &cells[k];
+    struct cell *rec_begin = NULL, *rec_end = NULL;
+    space_rebuild_recycle_rec(s, c, &rec_begin, &rec_end);
+    if (rec_begin != NULL) space_recycle_list(s, rec_begin, rec_end);
+    c->sorts = NULL;
+    c->nr_tasks = 0;
+    c->density = NULL;
+    c->gradient = NULL;
+    c->force = NULL;
+    c->grav = NULL;
+    c->dx_max = 0.0f;
+    c->sorted = 0;
+    c->count = 0;
+    c->gcount = 0;
+    c->init = NULL;
+    c->extra_ghost = NULL;
+    c->ghost = NULL;
+    c->kick = NULL;
+    c->cooling = NULL;
+    c->sourceterms = NULL;
+    c->super = c;
+    if (c->sort != NULL) {
+      free(c->sort);
+      c->sort = NULL;
+    }
+#if WITH_MPI
+    c->recv_xv = NULL;
+    c->recv_rho = NULL;
+    c->recv_gradient = NULL;
+    c->recv_ti = NULL;
+
+    c->send_xv = NULL;
+    c->send_rho = NULL;
+    c->send_gradient = NULL;
+    c->send_ti = NULL;
+#endif
+  }
+}
+
 /**
  * @brief Re-build the top-level cell grid.
  *
@@ -298,10 +348,8 @@ void space_regrid(struct space *s, int verbose) {
 
     /* Free the old cells, if they were allocated. */
     if (s->cells_top != NULL) {
-      for (int k = 0; k < s->nr_cells; k++) {
-        space_rebuild_recycle(s, &s->cells_top[k]);
-        if (s->cells_top[k].sort != NULL) free(s->cells_top[k].sort);
-      }
+      threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper,
+                     s->cells_top, s->nr_cells, sizeof(struct cell), 100, s);
       free(s->cells_top);
       s->maxdepth = 0;
     }
@@ -389,42 +437,12 @@ void space_regrid(struct space *s, int verbose) {
     // message( "rebuilding upper-level cells took %.3f %s." ,
     // clocks_from_ticks(double)(getticks() - tic), clocks_getunit());
 
-  } /* re-build upper-level cells? */
-
+  }      /* re-build upper-level cells? */
   else { /* Otherwise, just clean up the cells. */
 
     /* Free the old cells, if they were allocated. */
-    for (int k = 0; k < s->nr_cells; k++) {
-      space_rebuild_recycle(s, &s->cells_top[k]);
-      s->cells_top[k].sorts = NULL;
-      s->cells_top[k].nr_tasks = 0;
-      s->cells_top[k].density = NULL;
-      s->cells_top[k].gradient = NULL;
-      s->cells_top[k].force = NULL;
-      s->cells_top[k].grav = NULL;
-      s->cells_top[k].dx_max = 0.0f;
-      s->cells_top[k].sorted = 0;
-      s->cells_top[k].count = 0;
-      s->cells_top[k].gcount = 0;
-      s->cells_top[k].init = NULL;
-      s->cells_top[k].extra_ghost = NULL;
-      s->cells_top[k].ghost = NULL;
-      s->cells_top[k].kick = NULL;
-      s->cells_top[k].cooling = NULL;
-      s->cells_top[k].sourceterms = NULL;
-      s->cells_top[k].super = &s->cells_top[k];
-#if WITH_MPI
-      s->cells_top[k].recv_xv = NULL;
-      s->cells_top[k].recv_rho = NULL;
-      s->cells_top[k].recv_gradient = NULL;
-      s->cells_top[k].recv_ti = NULL;
-
-      s->cells_top[k].send_xv = NULL;
-      s->cells_top[k].send_rho = NULL;
-      s->cells_top[k].send_gradient = NULL;
-      s->cells_top[k].send_ti = NULL;
-#endif
-    }
+    threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper,
+                   s->cells_top, s->nr_cells, sizeof(struct cell), 100, s);
     s->maxdepth = 0;
   }
 
@@ -1441,9 +1459,12 @@ void space_map_cells_pre(struct space *s, int full,
  * @param s The #space in which the cell lives.
  * @param c The #cell to split recursively.
  * @param buff A buffer for particle sorting, should be of size at least
- *        max(c->count, c->gount) or @c NULL.
+ *        c->count or @c NULL.
+ * @param gbuff A buffer for particle sorting, should be of size at least
+ *        c->gcount or @c NULL.
  */
-void space_split_recursive(struct space *s, struct cell *c, int *buff) {
+void space_split_recursive(struct space *s, struct cell *c,
+                           struct cell_buff *buff, struct cell_buff *gbuff) {
 
   const int count = c->count;
   const int gcount = c->gcount;
@@ -1458,10 +1479,29 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
   struct engine *e = s->e;
 
   /* If the buff is NULL, allocate it, and remember to free it. */
-  const int allocate_buffer = (buff == NULL);
-  if (allocate_buffer &&
-      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
-    error("Failed to allocate temporary indices.");
+  const int allocate_buffer = (buff == NULL && gbuff == NULL);
+  if (allocate_buffer) {
+    if (count > 0) {
+      if (posix_memalign((void *)&buff, SWIFT_STRUCT_ALIGNMENT,
+                         sizeof(struct cell_buff) * count) != 0)
+        error("Failed to allocate temporary indices.");
+      for (int k = 0; k < count; k++) {
+        buff[k].x[0] = parts[k].x[0];
+        buff[k].x[1] = parts[k].x[1];
+        buff[k].x[2] = parts[k].x[2];
+      }
+    }
+    if (gcount > 0) {
+      if (posix_memalign((void *)&gbuff, SWIFT_STRUCT_ALIGNMENT,
+                         sizeof(struct cell_buff) * gcount) != 0)
+        error("Failed to allocate temporary indices.");
+      for (int k = 0; k < gcount; k++) {
+        gbuff[k].x[0] = gparts[k].x[0];
+        gbuff[k].x[1] = gparts[k].x[1];
+        gbuff[k].x[2] = gparts[k].x[2];
+      }
+    }
+  }
 
   /* Check the depth. */
   while (depth > (maxdepth = s->maxdepth)) {
@@ -1507,15 +1547,18 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
     }
 
     /* Split the cell data. */
-    cell_split(c, c->parts - s->parts, buff);
+    cell_split(c, c->parts - s->parts, buff, gbuff);
 
     /* Remove any progeny with zero parts. */
+    struct cell_buff *progeny_buff = buff, *progeny_gbuff = gbuff;
     for (int k = 0; k < 8; k++)
       if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) {
         space_recycle(s, c->progeny[k]);
         c->progeny[k] = NULL;
       } else {
-        space_split_recursive(s, c->progeny[k], buff);
+        space_split_recursive(s, c->progeny[k], progeny_buff, progeny_gbuff);
+        progeny_buff += c->progeny[k]->count;
+        progeny_gbuff += c->progeny[k]->gcount;
         h_max = max(h_max, c->progeny[k]->h_max);
         ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
         ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
@@ -1574,7 +1617,10 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
     c->owner = 0; /* Ok, there is really nothing on this rank... */
 
   /* Clean up. */
-  if (allocate_buffer) free(buff);
+  if (allocate_buffer) {
+    if (buff != NULL) free(buff);
+    if (gbuff != NULL) free(gbuff);
+  }
 }
 
 /**
@@ -1593,7 +1639,7 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
 
   for (int ind = 0; ind < num_cells; ind++) {
     struct cell *c = &cells_top[ind];
-    space_split_recursive(s, c, NULL);
+    space_split_recursive(s, c, NULL, NULL);
   }
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -1601,30 +1647,28 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
   for (int ind = 0; ind < num_cells; ind++) {
     int depth = 0;
     if (!checkCellhdxmax(&cells_top[ind], &depth))
-      message("    at cell depth %d", depth);
+      error("    at cell depth %d", depth);
   }
 #endif
 }
 
 /**
- * @brief Return a used cell to the buffer od unused sub-cells.
+ * @brief Return a used cell to the buffer of unused sub-cells.
  *
  * @param s The #space.
  * @param c The #cell.
  */
 void space_recycle(struct space *s, struct cell *c) {
 
-  /* Lock the space. */
-  lock_lock(&s->lock);
-
   /* Clear the cell. */
-  if (lock_destroy(&c->lock) != 0) error("Failed to destroy spinlock.");
+  if (lock_destroy(&c->lock) != 0 || lock_destroy(&c->glock) != 0)
+    error("Failed to destroy spinlock.");
 
   /* Clear this cell's sort arrays. */
   if (c->sort != NULL) free(c->sort);
 
-  /* Clear the cell data. */
-  bzero(c, sizeof(struct cell));
+  /* Lock the space. */
+  lock_lock(&s->lock);
 
   /* Hook this cell into the buffer. */
   c->next = s->cells_sub;
@@ -1634,6 +1678,47 @@ void space_recycle(struct space *s, struct cell *c) {
   /* Unlock the space. */
   lock_unlock_blind(&s->lock);
 }
+
+/**
+ * @brief Return a list of used cells to the buffer of unused sub-cells.
+ *
+ * @param s The #space.
+ * @param list_begin Pointer to the first #cell in the linked list of
+ *        cells joined by their @c next pointers.
+ * @param list_end Pointer to the last #cell in the linked list of
+ *        cells joined by their @c next pointers. It is assumed that this
+ *        cell's @c next pointer is @c NULL.
+ */
+void space_recycle_list(struct space *s, struct cell *list_begin,
+                        struct cell *list_end) {
+
+  int count = 0;
+
+  /* Clean up the list of cells. */
+  for (struct cell *c = list_begin; c != NULL; c = c->next) {
+    /* Clear the cell. */
+    if (lock_destroy(&c->lock) != 0 || lock_destroy(&c->glock) != 0)
+      error("Failed to destroy spinlock.");
+
+    /* Clear this cell's sort arrays. */
+    if (c->sort != NULL) free(c->sort);
+
+    /* Count this cell. */
+    count += 1;
+  }
+
+  /* Lock the space. */
+  lock_lock(&s->lock);
+
+  /* Hook this cell into the buffer. */
+  list_end->next = s->cells_sub;
+  s->cells_sub = list_begin;
+  s->tot_cells -= count;
+
+  /* Unlock the space. */
+  lock_unlock_blind(&s->lock);
+}
+
 /**
  * @brief Get a new empty (sub-)#cell.
  *
@@ -1653,9 +1738,6 @@ struct cell *space_getcell(struct space *s) {
                        space_cellallocchunk * sizeof(struct cell)) != 0)
       error("Failed to allocate more cells.");
 
-    /* Zero everything for good measure */
-    bzero(s->cells_sub, space_cellallocchunk * sizeof(struct cell));
-
     /* Constructed a linked list */
     for (int k = 0; k < space_cellallocchunk - 1; k++)
       s->cells_sub[k].next = &s->cells_sub[k + 1];
diff --git a/src/space.h b/src/space.h
index 36699d1d5091cac4dbd98e3ceeae7547c1012aed..1103cc4867c0ff7ded700331a1a4ff540233e8a7 100644
--- a/src/space.h
+++ b/src/space.h
@@ -178,6 +178,8 @@ void space_gparts_sort_mapper(void *map_data, int num_elements,
                               void *extra_data);
 void space_rebuild(struct space *s, int verbose);
 void space_recycle(struct space *s, struct cell *c);
+void space_recycle_list(struct space *s, struct cell *list_begin,
+                        struct cell *list_end);
 void space_split(struct space *s, struct cell *cells, int nr_cells,
                  int verbose);
 void space_split_mapper(void *map_data, int num_elements, void *extra_data);
diff --git a/src/threadpool.c b/src/threadpool.c
index 6bc887d96cb72804f0fbc8e2801a6522bf27f947..c11fd8121bb02f36fce1796d79a7eb55a38102c4 100644
--- a/src/threadpool.c
+++ b/src/threadpool.c
@@ -91,6 +91,10 @@ void threadpool_init(struct threadpool *tp, int num_threads) {
   tp->num_threads = num_threads;
   tp->num_threads_waiting = 0;
 
+  /* If there is only a single thread, do nothing more as of here as
+     we will just do work in the (blocked) calling thread. */
+  if (num_threads == 1) return;
+
   /* Init the threadpool mutexes. */
   if (pthread_mutex_init(&tp->thread_mutex, NULL) != 0)
     error("Failed to initialize mutexex.");
@@ -144,6 +148,12 @@ void threadpool_map(struct threadpool *tp, threadpool_map_function map_function,
                     void *map_data, size_t N, int stride, int chunk,
                     void *extra_data) {
 
+  /* If we just have a single thread, call the map function directly. */
+  if (tp->num_threads == 1) {
+    map_function(map_data, N, extra_data);
+    return;
+  }
+
   /* Set the map data and signal the threads. */
   pthread_mutex_lock(&tp->thread_mutex);
   tp->map_data_stride = stride;
diff --git a/tests/testInteractions.c b/tests/testInteractions.c
index d14c840ec77819bbef5750b897c72139f4d7b2b4..4faebbbdb35c4bc36cca16cb207517f05629f8c1 100644
--- a/tests/testInteractions.c
+++ b/tests/testInteractions.c
@@ -150,6 +150,9 @@ void write_header(char *fileName) {
  *
  * @param parts Particle array to be interacted
  * @param count No. of particles to be interacted
+ * @param serial_inter_func Function pointer to serial interaction function
+ * @param vec_inter_func Function pointer to vectorised interaction function
+ * @param filePrefix Prefix of output files
  *
  */
 void test_interactions(struct part *parts, int count,