diff --git a/src/cell.c b/src/cell.c
index d17ba527a3d9f7960a7740e7f517d37ec015d5cc..2b3f5ab2e85552416d541222c94a9eeaed056b17 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -472,8 +472,11 @@ void cell_gunlocktree(struct cell *c) {
  *        space's parts array, i.e. c->parts - s->parts.
  * @param buff A buffer with at least max(c->count, c->gcount) entries,
  *        used for sorting indices.
+ * @param gbuff A buffer with at least c->gcount entries, used for sorting
+ *        the indices of the gparts.
  */
-void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
+void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
+                struct cell_buff *gbuff) {
 
   const int count = c->count, gcount = c->gcount;
   struct part *parts = c->parts;
@@ -485,18 +488,21 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0};
   int bucket_offset[9];
 
-  /* If the buff is NULL, allocate it, and remember to free it. */
-  const int allocate_buffer = (buff == NULL);
-  if (allocate_buffer &&
-      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
-    error("Failed to allocate temporary indices.");
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the buffs are OK. */
+  for (int k = 0; k < count; k++) {
+    if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
+        buff[k].x[2] != parts[k].x[2])
+      error("Inconsistent buff contents.");
+  }
+#endif /* SWIFT_DEBUG_CHECKS */
 
   /* Fill the buffer with the indices. */
   for (int k = 0; k < count; k++) {
-    const int bid = (parts[k].x[0] > pivot[0]) * 4 +
-                    (parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]);
+    const int bid = (buff[k].x[0] > pivot[0]) * 4 +
+                    (buff[k].x[1] > pivot[1]) * 2 + (buff[k].x[2] > pivot[2]);
     bucket_count[bid]++;
-    buff[k] = bid;
+    buff[k].ind = bid;
   }
 
   /* Set the buffer offsets. */
@@ -510,23 +516,25 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   for (int bucket = 0; bucket < 8; bucket++) {
     for (int k = bucket_offset[bucket] + bucket_count[bucket];
          k < bucket_offset[bucket + 1]; k++) {
-      int bid = buff[k];
+      int bid = buff[k].ind;
       if (bid != bucket) {
         struct part part = parts[k];
         struct xpart xpart = xparts[k];
+        struct cell_buff temp_buff = buff[k];
         while (bid != bucket) {
           int j = bucket_offset[bid] + bucket_count[bid]++;
-          while (buff[j] == bid) {
+          while (buff[j].ind == bid) {
             j++;
             bucket_count[bid]++;
           }
           memswap(&parts[j], &part, sizeof(struct part));
           memswap(&xparts[j], &xpart, sizeof(struct xpart));
-          memswap(&buff[j], &bid, sizeof(int));
+          memswap(&buff[j], &temp_buff, sizeof(struct cell_buff));
+          bid = temp_buff.ind;
         }
         parts[k] = part;
         xparts[k] = xpart;
-        buff[k] = bid;
+        buff[k] = temp_buff;
       }
       bucket_count[bid]++;
     }
@@ -543,6 +551,14 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   if (count > 0 && gcount > 0) part_relink_gparts(parts, count, parts_offset);
 
 #ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the buffs are OK. */
+  for (int k = 1; k < count; k++) {
+    if (buff[k].ind < buff[k - 1].ind) error("Buff not sorted.");
+    if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
+        buff[k].x[2] != parts[k].x[2])
+      error("Inconsistent buff contents (k=%i).", k);
+  }
+
   /* Verify that _all_ the parts have been assigned to a cell. */
   for (int k = 1; k < 8; k++)
     if (&c->progeny[k - 1]->parts[c->progeny[k - 1]->count] !=
@@ -569,6 +585,31 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
         c->progeny[2]->parts[k].x[1] <= pivot[1] ||
         c->progeny[2]->parts[k].x[2] > pivot[2])
       error("Sorting failed (progeny=2).");
+  for (int k = 0; k < c->progeny[3]->count; k++)
+    if (c->progeny[3]->parts[k].x[0] > pivot[0] ||
+        c->progeny[3]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[3]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=3).");
+  for (int k = 0; k < c->progeny[4]->count; k++)
+    if (c->progeny[4]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[4]->parts[k].x[1] > pivot[1] ||
+        c->progeny[4]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=4).");
+  for (int k = 0; k < c->progeny[5]->count; k++)
+    if (c->progeny[5]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[5]->parts[k].x[1] > pivot[1] ||
+        c->progeny[5]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=5).");
+  for (int k = 0; k < c->progeny[6]->count; k++)
+    if (c->progeny[6]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[6]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[6]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=6).");
+  for (int k = 0; k < c->progeny[7]->count; k++)
+    if (c->progeny[7]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[7]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[7]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=7).");
 #endif
 
   /* Now do the same song and dance for the gparts. */
@@ -576,11 +617,10 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
 
   /* Fill the buffer with the indices. */
   for (int k = 0; k < gcount; k++) {
-    const int bid = (gparts[k].x[0] > pivot[0]) * 4 +
-                    (gparts[k].x[1] > pivot[1]) * 2 +
-                    (gparts[k].x[2] > pivot[2]);
+    const int bid = (gbuff[k].x[0] > pivot[0]) * 4 +
+                    (gbuff[k].x[1] > pivot[1]) * 2 + (gbuff[k].x[2] > pivot[2]);
     bucket_count[bid]++;
-    buff[k] = bid;
+    gbuff[k].ind = bid;
   }
 
   /* Set the buffer offsets. */
@@ -594,20 +634,22 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   for (int bucket = 0; bucket < 8; bucket++) {
     for (int k = bucket_offset[bucket] + bucket_count[bucket];
          k < bucket_offset[bucket + 1]; k++) {
-      int bid = buff[k];
+      int bid = gbuff[k].ind;
       if (bid != bucket) {
         struct gpart gpart = gparts[k];
+        struct cell_buff temp_buff = gbuff[k];
         while (bid != bucket) {
           int j = bucket_offset[bid] + bucket_count[bid]++;
-          while (buff[j] == bid) {
+          while (gbuff[j].ind == bid) {
             j++;
             bucket_count[bid]++;
           }
           memswap(&gparts[j], &gpart, sizeof(struct gpart));
-          memswap(&buff[j], &bid, sizeof(int));
+          memswap(&gbuff[j], &temp_buff, sizeof(struct cell_buff));
+          bid = temp_buff.ind;
         }
         gparts[k] = gpart;
-        buff[k] = bid;
+        gbuff[k] = temp_buff;
       }
       bucket_count[bid]++;
     }
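
For reference, the in-place bucket sort that cell_split() performs above can be exercised on its own. The following sketch (not part of the patch) uses hypothetical, simplified types: buff_entry stands in for struct cell_buff, a plain id array stands in for the parts/xparts payload, and ordinary assignments replace memswap(). It classifies every entry into one of the eight octants around the pivot, turns the counts into offsets with a prefix sum, and applies the same cycle-following permutation so that the cached positions and the payload end up in bucket order.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for struct cell_buff. */
struct buff_entry {
  double x[3]; /* cached particle position */
  int ind;     /* octant index, 0..7 */
};

/* Split count entries in-place into the 8 octants around pivot, permuting
   the payload array id consistently. */
void octant_split(struct buff_entry *buff, int *id, int count,
                  const double pivot[3]) {
  int bucket_count[8] = {0}, bucket_offset[9];

  /* Pass 1: classify each entry using only the cached positions. */
  for (int k = 0; k < count; k++) {
    const int bid = (buff[k].x[0] > pivot[0]) * 4 +
                    (buff[k].x[1] > pivot[1]) * 2 + (buff[k].x[2] > pivot[2]);
    bucket_count[bid]++;
    buff[k].ind = bid;
  }

  /* Prefix sum: bucket_offset[b] is where bucket b starts. */
  bucket_offset[0] = 0;
  for (int b = 1; b <= 8; b++)
    bucket_offset[b] = bucket_offset[b - 1] + bucket_count[b - 1];
  memset(bucket_count, 0, sizeof(bucket_count));

  /* Pass 2: cycle-following in-place permutation. A misplaced entry is moved
     to the next free slot of its bucket; the entry found there continues the
     cycle until one that belongs in the current bucket closes it. */
  for (int bucket = 0; bucket < 8; bucket++) {
    for (int k = bucket_offset[bucket] + bucket_count[bucket];
         k < bucket_offset[bucket + 1]; k++) {
      int bid = buff[k].ind;
      if (bid != bucket) {
        struct buff_entry temp_buff = buff[k];
        int temp_id = id[k];
        while (bid != bucket) {
          int j = bucket_offset[bid] + bucket_count[bid]++;
          while (buff[j].ind == bid) { /* skip entries already in place */
            j++;
            bucket_count[bid]++;
          }
          struct buff_entry tb = buff[j]; buff[j] = temp_buff; temp_buff = tb;
          int ti = id[j]; id[j] = temp_id; temp_id = ti;
          bid = temp_buff.ind;
        }
        buff[k] = temp_buff;
        id[k] = temp_id;
      }
      bucket_count[bid]++;
    }
  }
}

int main(void) {
  const int count = 1000;
  const double pivot[3] = {0.5, 0.5, 0.5};
  struct buff_entry *buff = malloc(sizeof(*buff) * count);
  int *id = malloc(sizeof(*id) * count);
  for (int k = 0; k < count; k++) {
    for (int i = 0; i < 3; i++) buff[k].x[i] = rand() / (double)RAND_MAX;
    id[k] = k;
  }
  octant_split(buff, id, count, pivot);
  /* Verify: the octant indices must be non-decreasing after the split. */
  for (int k = 1; k < count; k++)
    if (buff[k].ind < buff[k - 1].ind) {
      printf("not sorted\n");
      return 1;
    }
  printf("sorted into octants\n");
  free(buff);
  free(id);
  return 0;
}

Because the positions are cached in the small buff entries (filled once at the top level, see the space.c hunk below), the classification pass never has to read the much larger particle structs, and the sorted buffer can be handed straight down to the recursive calls.
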
diff --git a/src/cell.h b/src/cell.h
index c01594d4885067e63cab363718cc1559c3ff1034..d99019f58ea20ad83c835178ba71569192be85a1 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -52,6 +52,12 @@ struct scheduler;
 /* Global variables. */
 extern int cell_next_tag;
 
+/* Struct to temporarily buffer the particle locations and bin id. */
+struct cell_buff {
+  double x[3];
+  int ind;
+} SWIFT_STRUCT_ALIGN;
+
 /* Mini struct to link cells to tasks. Used as a linked list. */
 struct link {
 
@@ -281,7 +287,8 @@ struct cell {
   ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
 
 /* Function prototypes. */
-void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff);
+void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
+                struct cell_buff *gbuff);
 void cell_sanitize(struct cell *c);
 int cell_locktree(struct cell *c);
 void cell_unlocktree(struct cell *c);
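
The SWIFT_STRUCT_ALIGN on the new struct matters because cell_split() swaps cell_buff entries with memswap(), whose vector path issues aligned loads. The small standalone check below is not part of the patch; it assumes, as an illustration, that SWIFT_STRUCT_ALIGN expands to __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT))) with SWIFT_STRUCT_ALIGNMENT equal to 32, and uses the hypothetical name cell_buff_sketch. Under that assumption the attribute pads the 28-byte payload up to 32 bytes, so every element of a posix_memalign'd array is itself aligned.

#define _POSIX_C_SOURCE 200112L
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ALIGNMENT 32 /* assumed value of SWIFT_STRUCT_ALIGNMENT */

/* Mirror of struct cell_buff with the assumed expansion of SWIFT_STRUCT_ALIGN. */
struct cell_buff_sketch {
  double x[3]; /* 24 bytes */
  int ind;     /* 4 bytes, padded to 32 by the alignment attribute */
} __attribute__((aligned(ALIGNMENT)));

int main(void) {
  printf("sizeof(struct cell_buff_sketch) = %zu\n",
         sizeof(struct cell_buff_sketch)); /* expected: 32 */

  struct cell_buff_sketch *buff;
  if (posix_memalign((void **)&buff, ALIGNMENT,
                     sizeof(struct cell_buff_sketch) * 8) != 0)
    return 1;

  /* Every element must sit on a 32-byte boundary for memswap()'s aligned
     vector loads to be safe. */
  for (int k = 0; k < 8; k++)
    if ((uintptr_t)&buff[k] % ALIGNMENT != 0) {
      printf("element %d is not aligned\n", k);
      return 1;
    }
  printf("all elements aligned to %d bytes\n", ALIGNMENT);
  free(buff);
  return 0;
}
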
diff --git a/src/memswap.h b/src/memswap.h
index 4643725535917952d12927d52187bc7306ced5ef..92c902eeb158978d4a606f5f2a9416d4113fae0b 100644
--- a/src/memswap.h
+++ b/src/memswap.h
@@ -32,24 +32,27 @@
 #include <altivec.h>
 #endif
 
-/* Macro for in-place swap of two values a and b of type t. */
-#define swap_loop(t, a, b, c)  \
-  while (c >= sizeof(t)) {     \
-    register t temp = *(t *)a; \
-    *(t *)a = *(t *)b;         \
-    *(t *)b = temp;            \
-    a += sizeof(t);            \
-    b += sizeof(t);            \
-    bytes -= sizeof(t);        \
+/* Macro swapping the contents of buffers a and b in chunks of the given type.
+   a and b are assumed to be of type char* so the pointer arithmetic works. */
+#define swap_loop(type, a, b, count) \
+  while (count >= sizeof(type)) {    \
+    register type temp = *(type *)a; \
+    *(type *)a = *(type *)b;         \
+    *(type *)b = temp;               \
+    a += sizeof(type);               \
+    b += sizeof(type);               \
+    count -= sizeof(type);           \
   }
 
 /**
  * @brief Swap the contents of two elements in-place.
  *
- * Keep in mind that this function works best when the underlying data
+ * Keep in mind that this function only works when the underlying data
  * is aligned to the vector length, e.g. with the @c
  * __attribute__((aligned(32)))
- * syntax, and the code is compiled with @c -funroll-loops.
+ * syntax!
+ * Furthermore, register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
  *
  * @param void_a Pointer to the first element.
  * @param void_b Pointer to the second element.
@@ -76,4 +79,63 @@ __attribute__((always_inline)) inline void memswap(void *void_a, void *void_b,
   swap_loop(char, a, b, bytes);
 }
 
+/**
+ * @brief Swap the contents of two elements in-place.
+ *
+ * As opposed to #memswap, this function does not require the data pointed to
+ * by the parameters to be aligned in any specific way.
+ * Furthermore, register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
+ *
+ * @param void_a Pointer to the first element.
+ * @param void_b Pointer to the second element.
+ * @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
+ */
+__attribute__((always_inline)) inline void memswap_unaligned(void *void_a,
+                                                             void *void_b,
+                                                             size_t bytes) {
+  char *a = (char *)void_a, *b = (char *)void_b;
+#ifdef __AVX512F__
+  while (bytes >= sizeof(__m512i)) {
+    register __m512i temp;
+    temp = _mm512_loadu_si512((__m512i *)a);
+    _mm512_storeu_si512((__m512i *)a, _mm512_loadu_si512((__m512i *)b));
+    _mm512_storeu_si512((__m512i *)b, temp);
+    a += sizeof(__m512i);
+    b += sizeof(__m512i);
+    bytes -= sizeof(__m512i);
+  }
+#endif
+#ifdef __AVX__
+  while (bytes >= sizeof(__m256i)) {
+    register __m256i temp;
+    temp = _mm256_loadu_si256((__m256i *)a);
+    _mm256_storeu_si256((__m256i *)a, _mm256_loadu_si256((__m256i *)b));
+    _mm256_storeu_si256((__m256i *)b, temp);
+    a += sizeof(__m256i);
+    b += sizeof(__m256i);
+    bytes -= sizeof(__m256i);
+  }
+#endif
+#ifdef __SSE2__
+  while (bytes >= sizeof(__m128i)) {
+    register __m128i temp;
+    temp = _mm_loadu_si128((__m128i *)a);
+    _mm_storeu_si128((__m128i *)a, _mm_loadu_si128((__m128i *)b));
+    _mm_storeu_si128((__m128i *)b, temp);
+    a += sizeof(__m128i);
+    b += sizeof(__m128i);
+    bytes -= sizeof(__m128i);
+  }
+#endif
+#ifdef __ALTIVEC__
+  /* Power8 supports unaligned loads/stores; unclear how this loop behaves. */
+  swap_loop(vector int, a, b, bytes);
+#endif
+  swap_loop(size_t, a, b, bytes);
+  swap_loop(int, a, b, bytes);
+  swap_loop(short, a, b, bytes);
+  swap_loop(char, a, b, bytes);
+}
+
 #endif /* SWIFT_MEMSWAP_H */
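
A usage sketch for memswap_unaligned() follows (not part of the patch; struct record is a hypothetical payload, and the example assumes memswap.h can be included on its own, which may additionally require the project's config.h). Two records are copied to deliberately odd offsets inside a byte buffer, exactly the situation the aligned vector path of memswap() cannot handle, whereas the unaligned variant goes through the *_loadu_* / *_storeu_* intrinsics.

#include <stdio.h>
#include <string.h>

#include "memswap.h"

/* A 32-byte payload, large enough to exercise the 16/32-byte vector paths. */
struct record {
  double x[3];
  long long id;
};

int main(void) {
  struct record a = {{1., 2., 3.}, 42}, b = {{4., 5., 6.}, 7};

  /* Copy the records to deliberately misaligned addresses. */
  char storage[2 * sizeof(struct record) + 2];
  char *pa = storage + 1;
  char *pb = storage + 1 + sizeof(struct record) + 1;
  memcpy(pa, &a, sizeof(a));
  memcpy(pb, &b, sizeof(b));

  /* Swap the two (unaligned) records in-place. */
  memswap_unaligned(pa, pb, sizeof(struct record));

  memcpy(&a, pa, sizeof(a));
  memcpy(&b, pb, sizeof(b));
  printf("a.id = %lld, b.id = %lld\n", a.id, b.id); /* prints 7 and 42 */
  return 0;
}
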
diff --git a/src/space.c b/src/space.c
index 08dd41cdea5bf598c2b17d47f35e6fc0221c2a06..24c2c413198d32e0942a0d26e5a3efbd5571c68e 100644
--- a/src/space.c
+++ b/src/space.c
@@ -1470,9 +1470,12 @@ void space_map_cells_pre(struct space *s, int full,
  * @param s The #space in which the cell lives.
  * @param c The #cell to split recursively.
  * @param buff A buffer for particle sorting, should be of size at least
- *        max(c->count, c->gount) or @c NULL.
+ *        c->count or @c NULL.
+ * @param gbuff A buffer for gpart sorting, should be of size at least
+ *        c->gcount or @c NULL.
  */
-void space_split_recursive(struct space *s, struct cell *c, int *buff) {
+void space_split_recursive(struct space *s, struct cell *c,
+                           struct cell_buff *buff, struct cell_buff *gbuff) {
 
   const int count = c->count;
   const int gcount = c->gcount;
@@ -1486,10 +1489,29 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
   struct engine *e = s->e;
 
   /* If the buff is NULL, allocate it, and remember to free it. */
-  const int allocate_buffer = (buff == NULL);
-  if (allocate_buffer &&
-      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
-    error("Failed to allocate temporary indices.");
+  const int allocate_buffer = (buff == NULL && gbuff == NULL);
+  if (allocate_buffer) {
+    if (count > 0) {
+      if (posix_memalign((void *)&buff, SWIFT_STRUCT_ALIGNMENT,
+                         sizeof(struct cell_buff) * count) != 0)
+        error("Failed to allocate temporary indices.");
+      for (int k = 0; k < count; k++) {
+        buff[k].x[0] = parts[k].x[0];
+        buff[k].x[1] = parts[k].x[1];
+        buff[k].x[2] = parts[k].x[2];
+      }
+    }
+    if (gcount > 0) {
+      if (posix_memalign((void *)&gbuff, SWIFT_STRUCT_ALIGNMENT,
+                         sizeof(struct cell_buff) * gcount) != 0)
+        error("Failed to allocate temporary indices.");
+      for (int k = 0; k < gcount; k++) {
+        gbuff[k].x[0] = gparts[k].x[0];
+        gbuff[k].x[1] = gparts[k].x[1];
+        gbuff[k].x[2] = gparts[k].x[2];
+      }
+    }
+  }
 
   /* Check the depth. */
   while (depth > (maxdepth = s->maxdepth)) {
@@ -1535,15 +1557,18 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
     }
 
     /* Split the cell data. */
-    cell_split(c, c->parts - s->parts, buff);
+    cell_split(c, c->parts - s->parts, buff, gbuff);
 
     /* Remove any progeny with zero parts. */
+    struct cell_buff *progeny_buff = buff, *progeny_gbuff = gbuff;
     for (int k = 0; k < 8; k++)
       if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) {
         space_recycle(s, c->progeny[k]);
         c->progeny[k] = NULL;
       } else {
-        space_split_recursive(s, c->progeny[k], buff);
+        space_split_recursive(s, c->progeny[k], progeny_buff, progeny_gbuff);
+        progeny_buff += c->progeny[k]->count;
+        progeny_gbuff += c->progeny[k]->gcount;
         h_max = max(h_max, c->progeny[k]->h_max);
         ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
         ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
@@ -1604,7 +1629,10 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
     c->owner = 0; /* Ok, there is really nothing on this rank... */
 
   /* Clean up. */
-  if (allocate_buffer) free(buff);
+  if (allocate_buffer) {
+    if (buff != NULL) free(buff);
+    if (gbuff != NULL) free(gbuff);
+  }
 }
 
 /**
@@ -1623,7 +1651,7 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
 
   for (int ind = 0; ind < num_cells; ind++) {
     struct cell *c = &cells_top[ind];
-    space_split_recursive(s, c, NULL);
+    space_split_recursive(s, c, NULL, NULL);
   }
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -1631,7 +1659,7 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
   for (int ind = 0; ind < num_cells; ind++) {
     int depth = 0;
     if (!checkCellhdxmax(&cells_top[ind], &depth))
-      message("    at cell depth %d", depth);
+      error("    at cell depth %d", depth);
   }
 #endif
 }
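
Finally, the heart of the space.c change: the position buffers are allocated and filled only once, at the top-level cell, and the recursion hands each progeny the slice of the parent's buffer that cell_split() has just permuted into place. Below is a minimal one-dimensional sketch of that slicing pattern (not part of the patch; toy types, a two-way split instead of eight octants, and no separate gpart buffer).

#include <stdio.h>
#include <stdlib.h>

/* Toy one-dimensional stand-in for struct cell_buff. */
struct toy_buff {
  double x;
};

/* Partition buff[0..count) about pivot, returning the size of the "low"
   half; a stand-in for the octant split done by cell_split(). */
static int toy_split(struct toy_buff *buff, int count, double pivot) {
  int lo = 0;
  for (int k = 0; k < count; k++)
    if (buff[k].x <= pivot) {
      struct toy_buff tmp = buff[k];
      buff[k] = buff[lo];
      buff[lo++] = tmp;
    }
  return lo;
}

/* Recursive splitting: the progeny reuse slices of the parent's buffer
   instead of allocating (and re-filling) their own, just as
   space_split_recursive() now advances progeny_buff / progeny_gbuff. */
static void toy_split_recursive(struct toy_buff *buff, int count, double lo,
                                double hi, int depth) {
  if (count < 2 || depth > 4) return;
  const double pivot = 0.5 * (lo + hi);
  const int n_lo = toy_split(buff, count, pivot);
  toy_split_recursive(buff, n_lo, lo, pivot, depth + 1);
  toy_split_recursive(buff + n_lo, count - n_lo, pivot, hi, depth + 1);
}

int main(void) {
  const int count = 16;
  /* Allocate and fill the buffer once, at the top level. */
  struct toy_buff *buff = malloc(sizeof(*buff) * count);
  for (int k = 0; k < count; k++) buff[k].x = rand() / (double)RAND_MAX;

  toy_split_recursive(buff, count, 0.0, 1.0, 0);

  for (int k = 0; k < count; k++) printf("%.3f\n", buff[k].x);
  free(buff); /* freed only at the top level, as in space_split_recursive() */
  return 0;
}
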