diff --git a/src/cell.c b/src/cell.c
index 1de475b56fe958de00485de546f2d7c4f185afe4..b881ea3a78e6fd64e4222324ee35891a220e9b33 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -471,8 +471,11 @@ void cell_gunlocktree(struct cell *c) {
  *        space's parts array, i.e. c->parts - s->parts.
- * @param buff A buffer with at least max(c->count, c->gcount) entries,
- *        used for sorting indices.
+ * @param buff A buffer with at least c->count entries, used for sorting
+ *        indices.
+ * @param gbuff A buffer with at least c->gcount entries, used for sorting
+ *        indices for the gparts.
  */
-void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
+void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
+                struct cell_buff *gbuff) {
 
   const int count = c->count, gcount = c->gcount;
   struct part *parts = c->parts;
@@ -484,18 +487,21 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0};
   int bucket_offset[9];
 
-  /* If the buff is NULL, allocate it, and remember to free it. */
-  const int allocate_buffer = (buff == NULL);
-  if (allocate_buffer &&
-      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
-    error("Failed to allocate temporary indices.");
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the buffs are OK. */
+  for (int k = 0; k < count; k++) {
+    if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
+        buff[k].x[2] != parts[k].x[2])
+      error("Inconsistent buff contents.");
+  }
+#endif /* SWIFT_DEBUG_CHECKS */
 
   /* Fill the buffer with the indices. */
   for (int k = 0; k < count; k++) {
-    const int bid = (parts[k].x[0] > pivot[0]) * 4 +
-                    (parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]);
+    const int bid = (buff[k].x[0] > pivot[0]) * 4 +
+                    (buff[k].x[1] > pivot[1]) * 2 + (buff[k].x[2] > pivot[2]);
     bucket_count[bid]++;
-    buff[k] = bid;
+    buff[k].ind = bid;
   }
 
   /* Set the buffer offsets. */
@@ -509,23 +515,26 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   for (int bucket = 0; bucket < 8; bucket++) {
     for (int k = bucket_offset[bucket] + bucket_count[bucket];
          k < bucket_offset[bucket + 1]; k++) {
-      int bid = buff[k];
+      int bid = buff[k].ind;
       if (bid != bucket) {
         struct part part = parts[k];
         struct xpart xpart = xparts[k];
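+        /* Keep a copy of the buffer entry; it follows the particle below. */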
+        struct cell_buff temp_buff = buff[k];
         while (bid != bucket) {
           int j = bucket_offset[bid] + bucket_count[bid]++;
-          while (buff[j] == bid) {
+          while (buff[j].ind == bid) {
             j++;
             bucket_count[bid]++;
           }
           memswap(&parts[j], &part, sizeof(struct part));
           memswap(&xparts[j], &xpart, sizeof(struct xpart));
-          memswap(&buff[j], &bid, sizeof(int));
+          memswap(&buff[j], &temp_buff, sizeof(struct cell_buff));
+          bid = temp_buff.ind;
         }
         parts[k] = part;
         xparts[k] = xpart;
-        buff[k] = bid;
+        buff[k] = temp_buff;
       }
       bucket_count[bid]++;
     }
@@ -542,6 +550,14 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   if (count > 0 && gcount > 0) part_relink_gparts(parts, count, parts_offset);
 
 #ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the buffs are OK. */
+  for (int k = 1; k < count; k++) {
+    if (buff[k].ind < buff[k - 1].ind) error("Buff not sorted.");
+    if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
+        buff[k].x[2] != parts[k].x[2])
+      error("Inconsistent buff contents (k=%i).", k);
+  }
+
   /* Verify that _all_ the parts have been assigned to a cell. */
   for (int k = 1; k < 8; k++)
     if (&c->progeny[k - 1]->parts[c->progeny[k - 1]->count] !=
@@ -568,6 +584,31 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
         c->progeny[2]->parts[k].x[1] <= pivot[1] ||
         c->progeny[2]->parts[k].x[2] > pivot[2])
       error("Sorting failed (progeny=2).");
+  for (int k = 0; k < c->progeny[3]->count; k++)
+    if (c->progeny[3]->parts[k].x[0] > pivot[0] ||
+        c->progeny[3]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[3]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=3).");
+  for (int k = 0; k < c->progeny[4]->count; k++)
+    if (c->progeny[4]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[4]->parts[k].x[1] > pivot[1] ||
+        c->progeny[4]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=4).");
+  for (int k = 0; k < c->progeny[5]->count; k++)
+    if (c->progeny[5]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[5]->parts[k].x[1] > pivot[1] ||
+        c->progeny[5]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=5).");
+  for (int k = 0; k < c->progeny[6]->count; k++)
+    if (c->progeny[6]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[6]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[6]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=6).");
+  for (int k = 0; k < c->progeny[7]->count; k++)
+    if (c->progeny[7]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[7]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[7]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=7).");
 #endif
 
   /* Now do the same song and dance for the gparts. */
@@ -575,11 +616,10 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
 
   /* Fill the buffer with the indices. */
   for (int k = 0; k < gcount; k++) {
-    const int bid = (gparts[k].x[0] > pivot[0]) * 4 +
-                    (gparts[k].x[1] > pivot[1]) * 2 +
-                    (gparts[k].x[2] > pivot[2]);
+    const int bid = (gbuff[k].x[0] > pivot[0]) * 4 +
+                    (gbuff[k].x[1] > pivot[1]) * 2 + (gbuff[k].x[2] > pivot[2]);
     bucket_count[bid]++;
-    buff[k] = bid;
+    gbuff[k].ind = bid;
   }
 
   /* Set the buffer offsets. */
@@ -593,20 +633,23 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   for (int bucket = 0; bucket < 8; bucket++) {
     for (int k = bucket_offset[bucket] + bucket_count[bucket];
          k < bucket_offset[bucket + 1]; k++) {
-      int bid = buff[k];
+      int bid = gbuff[k].ind;
       if (bid != bucket) {
         struct gpart gpart = gparts[k];
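+        /* Keep a copy of the buffer entry; it follows the gpart below. */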
+        struct cell_buff temp_buff = gbuff[k];
         while (bid != bucket) {
           int j = bucket_offset[bid] + bucket_count[bid]++;
-          while (buff[j] == bid) {
+          while (gbuff[j].ind == bid) {
             j++;
             bucket_count[bid]++;
           }
           memswap(&gparts[j], &gpart, sizeof(struct gpart));
-          memswap(&buff[j], &bid, sizeof(int));
+          memswap(&gbuff[j], &temp_buff, sizeof(struct cell_buff));
+          bid = temp_buff.ind;
         }
         gparts[k] = gpart;
-        buff[k] = bid;
+        gbuff[k] = temp_buff;
       }
       bucket_count[bid]++;
     }
diff --git a/src/cell.h b/src/cell.h
index 599daff1066348148a35a779bb6011ab07b37ed8..1978e80c481ceed76a33840d94153061d8244ca3 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -53,6 +53,12 @@ struct scheduler;
 /* Global variables. */
 extern int cell_next_tag;
 
+/* Struct to temporarily buffer the particle locations and bin id. */
+struct cell_buff {
+  double x[3];
+  int ind;
+} SWIFT_STRUCT_ALIGN;
+
 /* Mini struct to link cells to tasks. Used as a linked list. */
 struct link {
 
@@ -282,7 +288,8 @@ struct cell {
   ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
 
 /* Function prototypes. */
-void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff);
+void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
+                struct cell_buff *gbuff);
 void cell_sanitize(struct cell *c);
 int cell_locktree(struct cell *c);
 void cell_unlocktree(struct cell *c);
diff --git a/src/engine.c b/src/engine.c
index 4a417ee716ee39ef76bc11ecf0128d93e44f6064..8fb7c2d2702b2a62a1bbc0f8b309e0465ca624f2 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -734,9 +734,11 @@ void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj,
       /* The send_xv task should unlock the super-cell's ghost task. */
       scheduler_addunlock(s, t_xv, ci->super->ghost);
 
-      scheduler_addunlock(s, ci->super->drift, t_xv);
 #endif
 
+      /* Drift before you send */
+      scheduler_addunlock(s, ci->super->drift, t_xv);
+
       /* The super-cell's timestep task should unlock the send_ti task. */
       scheduler_addunlock(s, ci->super->timestep, t_ti);
     }
diff --git a/src/memswap.h b/src/memswap.h
index 4643725535917952d12927d52187bc7306ced5ef..92c902eeb158978d4a606f5f2a9416d4113fae0b 100644
--- a/src/memswap.h
+++ b/src/memswap.h
@@ -32,24 +32,27 @@
 #include <altivec.h>
 #endif
 
-/* Macro for in-place swap of two values a and b of type t. */
-#define swap_loop(t, a, b, c)  \
-  while (c >= sizeof(t)) {     \
-    register t temp = *(t *)a; \
-    *(t *)a = *(t *)b;         \
-    *(t *)b = temp;            \
-    a += sizeof(t);            \
-    b += sizeof(t);            \
-    bytes -= sizeof(t);        \
+/* Macro for an in-place swap of two values a and b of the given type. a and
+   b are assumed to be of type char* so that the pointer arithmetic works. */
+#define swap_loop(type, a, b, count) \
+  while (count >= sizeof(type)) {    \
+    register type temp = *(type *)a; \
+    *(type *)a = *(type *)b;         \
+    *(type *)b = temp;               \
+    a += sizeof(type);               \
+    b += sizeof(type);               \
+    count -= sizeof(type);           \
   }
 
 /**
  * @brief Swap the contents of two elements in-place.
  *
- * Keep in mind that this function works best when the underlying data
+ * Keep in mind that this function only works when the underlying data
  * is aligned to the vector length, e.g. with the @c
  * __attribute__((aligned(32)))
- * syntax, and the code is compiled with @c -funroll-loops.
+ * syntax!
+ * Furthermore, register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
  *
  * @param void_a Pointer to the first element.
  * @param void_b Pointer to the second element.
@@ -76,4 +79,67 @@ __attribute__((always_inline)) inline void memswap(void *void_a, void *void_b,
   swap_loop(char, a, b, bytes);
 }
 
+/**
+ * @brief Swap the contents of two elements in-place.
+ *
+ * As opposed to #memswap, this function does not require the parameters
+ * to be aligned in any specific way.
+ * Furthermore, register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
+ *
+ * @param void_a Pointer to the first element.
+ * @param void_b Pointer to the second element.
+ * @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
+ */
+__attribute__((always_inline)) inline void memswap_unaligned(void *void_a,
+                                                             void *void_b,
+                                                             size_t bytes) {
+  char *a = (char *)void_a, *b = (char *)void_b;
+#ifdef __AVX512F__
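+  /* Swap 64-byte chunks using unaligned AVX-512 loads and stores. */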
+  while (bytes >= sizeof(__m512i)) {
+    register __m512i temp;
+    temp = _mm512_loadu_si512((__m512i *)a);
+    _mm512_storeu_si512((__m512i *)a, _mm512_loadu_si512((__m512i *)b));
+    _mm512_storeu_si512((__m512i *)b, temp);
+    a += sizeof(__m512i);
+    b += sizeof(__m512i);
+    bytes -= sizeof(__m512i);
+  }
+#endif
+#ifdef __AVX__
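+  /* Swap 32-byte chunks using unaligned AVX loads and stores. */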
+  while (bytes >= sizeof(__m256i)) {
+    register __m256i temp;
+    temp = _mm256_loadu_si256((__m256i *)a);
+    _mm256_storeu_si256((__m256i *)a, _mm256_loadu_si256((__m256i *)b));
+    _mm256_storeu_si256((__m256i *)b, temp);
+    a += sizeof(__m256i);
+    b += sizeof(__m256i);
+    bytes -= sizeof(__m256i);
+  }
+#endif
+#ifdef __SSE2__
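+  /* Swap 16-byte chunks using unaligned SSE2 loads and stores. */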
+  while (bytes >= sizeof(__m128i)) {
+    register __m128i temp;
+    temp = _mm_loadu_si128((__m128i *)a);
+    _mm_storeu_si128((__m128i *)a, _mm_loadu_si128((__m128i *)b));
+    _mm_storeu_si128((__m128i *)b, temp);
+    a += sizeof(__m128i);
+    b += sizeof(__m128i);
+    bytes -= sizeof(__m128i);
+  }
+#endif
+#ifdef __ALTIVEC__
+  // Power8 supports unaligned loads/stores, but what it does here is unclear.
+  swap_loop(vector int, a, b, bytes);
+#endif
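+  /* Mop up any remaining bytes with progressively narrower types. */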
+  swap_loop(size_t, a, b, bytes);
+  swap_loop(int, a, b, bytes);
+  swap_loop(short, a, b, bytes);
+  swap_loop(char, a, b, bytes);
+}
+
 #endif /* SWIFT_MEMSWAP_H */
diff --git a/src/space.c b/src/space.c
index 22674e3cbef820c69d2bdc09a7e73bd04d249083..c6e01f767b57e41be19f3dd2b8a4c747d608b3f5 100644
--- a/src/space.c
+++ b/src/space.c
@@ -1470,9 +1470,12 @@ void space_map_cells_pre(struct space *s, int full,
  * @param s The #space in which the cell lives.
  * @param c The #cell to split recursively.
  * @param buff A buffer for particle sorting, should be of size at least
- *        max(c->count, c->gount) or @c NULL.
+ *        c->count or @c NULL.
+ * @param gbuff A buffer for particle sorting, should be of size at least
+ *        c->gcount or @c NULL.
  */
-void space_split_recursive(struct space *s, struct cell *c, int *buff) {
+void space_split_recursive(struct space *s, struct cell *c,
+                           struct cell_buff *buff, struct cell_buff *gbuff) {
 
   const int count = c->count;
   const int gcount = c->gcount;
@@ -1486,10 +1489,31 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
   struct engine *e = s->e;
 
-  /* If the buff is NULL, allocate it, and remember to free it. */
+  /* If the buffers are NULL, allocate them, and remember to free them. */
-  const int allocate_buffer = (buff == NULL);
-  if (allocate_buffer &&
-      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
-    error("Failed to allocate temporary indices.");
+  const int allocate_buffer = (buff == NULL && gbuff == NULL);
+  if (allocate_buffer) {
+    if (count > 0) {
+      if (posix_memalign((void *)&buff, SWIFT_STRUCT_ALIGNMENT,
+                         sizeof(struct cell_buff) * count) != 0)
+        error("Failed to allocate temporary indices.");
+      for (int k = 0; k < count; k++) {
+        buff[k].x[0] = parts[k].x[0];
+        buff[k].x[1] = parts[k].x[1];
+        buff[k].x[2] = parts[k].x[2];
+      }
+    }
+    if (gcount > 0) {
+      if (posix_memalign((void *)&gbuff, SWIFT_STRUCT_ALIGNMENT,
+                         sizeof(struct cell_buff) * gcount) != 0)
+        error("Failed to allocate temporary indices.");
+      for (int k = 0; k < gcount; k++) {
+        gbuff[k].x[0] = gparts[k].x[0];
+        gbuff[k].x[1] = gparts[k].x[1];
+        gbuff[k].x[2] = gparts[k].x[2];
+      }
+    }
+  }
 
   /* Check the depth. */
   while (depth > (maxdepth = s->maxdepth)) {
@@ -1535,15 +1557,19 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
     }
 
     /* Split the cell data. */
-    cell_split(c, c->parts - s->parts, buff);
+    cell_split(c, c->parts - s->parts, buff, gbuff);
 
     /* Remove any progeny with zero parts. */
+    struct cell_buff *progeny_buff = buff, *progeny_gbuff = gbuff;
     for (int k = 0; k < 8; k++)
       if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) {
         space_recycle(s, c->progeny[k]);
         c->progeny[k] = NULL;
       } else {
-        space_split_recursive(s, c->progeny[k], buff);
+        space_split_recursive(s, c->progeny[k], progeny_buff, progeny_gbuff);
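+        /* Advance the buffer windows past this progeny's particles. */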
+        progeny_buff += c->progeny[k]->count;
+        progeny_gbuff += c->progeny[k]->gcount;
         h_max = max(h_max, c->progeny[k]->h_max);
         ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
         ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
@@ -1604,7 +1629,10 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
     c->owner = 0; /* Ok, there is really nothing on this rank... */
 
   /* Clean up. */
-  if (allocate_buffer) free(buff);
+  if (allocate_buffer) {
+    if (buff != NULL) free(buff);
+    if (gbuff != NULL) free(gbuff);
+  }
 }
 
 /**
@@ -1623,7 +1651,7 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
 
   for (int ind = 0; ind < num_cells; ind++) {
     struct cell *c = &cells_top[ind];
-    space_split_recursive(s, c, NULL);
+    space_split_recursive(s, c, NULL, NULL);
   }
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -1631,7 +1659,7 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
   for (int ind = 0; ind < num_cells; ind++) {
     int depth = 0;
     if (!checkCellhdxmax(&cells_top[ind], &depth))
-      message("    at cell depth %d", depth);
+      error("    at cell depth %d", depth);
   }
 #endif
 }
diff --git a/tests/tolerance_27_normal.dat b/tests/tolerance_27_normal.dat
index e2053261ed6675e4f0ce92866db896819eaff0c2..dd93f7191d6c0ad38e3de43de3abebf544d555aa 100644
--- a/tests/tolerance_27_normal.dat
+++ b/tests/tolerance_27_normal.dat
@@ -1,3 +1,3 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      2e-5	    2e-4       2e-3		 2e-6	     2e-6	   2e-6		 2e-6
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      6e-5	    1e-4       1e-4		 2e-5	     2e-5	   2e-5	 	 2e-5
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   2e-6	      4e-5	    2e-4       2e-3		 7e-6	     5e-6	   5e-6		 5e-6
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1.2e-4	    1e-4       1e-4		 2e-4	     1e-4	   1e-4	 	 1e-4
diff --git a/tests/tolerance_27_perturbed.dat b/tests/tolerance_27_perturbed.dat
index 226a3d1423685656b37fa8acfe1741c859099352..478bfd94cbbbbabea791eaab9ca2966b4c52b921 100644
--- a/tests/tolerance_27_perturbed.dat
+++ b/tests/tolerance_27_perturbed.dat
@@ -1,3 +1,3 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1.2e-6     1e-4	    4e-5       2e-3		 3.1e-6	     2e-6	   2e-6		 2e-6
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      3e-3	    1e-5       1e-4		 2e-5	     2e-3	   2e-3	 	 2e-3
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1.2e-6     1e-4	    4e-5       2e-3		 3.1e-6	     3e-6	   3e-6		 3e-6
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      5e-3	    1e-5       1e-4		 2e-5	     2e-3	   2e-3	 	 2e-3