diff --git a/src/cell.c b/src/cell.c
index b566584807f6928d906207eb1c4f78181aa423ea..0102caf4a511df0293c32511d56ee3000ed736aa 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -908,24 +908,29 @@ int cell_is_drift_needed(struct cell *c, const struct engine *e) {
  */
 int cell_unskip_tasks(struct cell *c, struct scheduler *s) {
 
-  /* Reset the sort flags if the particles have moved too much. */
-  if (c->dx_max_sort > space_maxreldx * c->h_max) c->sorted = 0;
-
   /* Un-skip the density tasks involved with this cell. */
   for (struct link *l = c->density; l != NULL; l = l->next) {
     struct task *t = l->t;
-    const struct cell *ci = t->ci;
-    const struct cell *cj = t->cj;
+    struct cell *ci = t->ci;
+    struct cell *cj = t->cj;
     scheduler_activate(s, t);
 
     /* Set the correct sorting flags */
     if (t->type == task_type_pair) {
+      if (1 || ci->dx_max_sort > space_maxreldx * ci->dmin) ci->sorted = 0;
+      if (1 || cj->dx_max_sort > space_maxreldx * cj->dmin) cj->sorted = 0;
       if (!(ci->sorted & (1 << t->flags))) {
-        atomic_or(&ci->sorts->flags, (1 << t->flags));
+#ifdef SWIFT_DEBUG_CHECKS
+        if (!(ci->sorts->flags & (1 << t->flags)))
+          error("bad flags in sort task.");
+#endif
         scheduler_activate(s, ci->sorts);
       }
       if (!(cj->sorted & (1 << t->flags))) {
-        atomic_or(&cj->sorts->flags, (1 << t->flags));
+#ifdef SWIFT_DEBUG_CHECKS
+        if (!(cj->sorts->flags & (1 << t->flags)))
+          error("bad flags in sort task.");
+#endif
         scheduler_activate(s, cj->sorts);
       }
     }
diff --git a/src/engine.c b/src/engine.c
index dba17a4d6aea49a6cd66c0f0f91cde506c4db417..0306c8a7838a04388a9f985b6953733184434944 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -162,6 +162,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
 
       scheduler_addunlock(s, c->kick1, c->drift);
       scheduler_addunlock(s, c->drift, c->init);
+      scheduler_addunlock(s, c->drift, c->sorts);
 
       /* Generate the ghost task. */
       if (is_hydro)
diff --git a/src/runner.c b/src/runner.c
index 167ef3969e531c80075537b89bf69998214c6beb..f286e6a3ccdf457cc2bab7c9fa4175a6e5857ba3 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -337,13 +337,13 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
   struct xpart *xparts = c->xparts;
   struct entry *sort;
   const int count = c->count;
-  int off[8], inds[8], temp_i, missing;
+  int off[8], inds[8], temp_i;
   float buff[8];
 
   TIMER_TIC
 
   /* Clean-up the flags, i.e. filter out what's already been sorted. */
-  flags &= ~c->sorted;
+  // flags &= ~c->sorted;
   if (flags == 0) return;
 
   /* start by allocating the entry arrays. */
@@ -361,9 +361,8 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
 
     /* Fill in the gaps within the progeny. */
     for (int k = 0; k < 8; k++) {
-      if (c->progeny[k] == NULL) continue;
-      missing = flags & ~c->progeny[k]->sorted;
-      if (missing) runner_do_sort(r, c->progeny[k], missing, 0);
+      if (c->progeny[k] != NULL) 
+        runner_do_sort(r, c->progeny[k], flags, 0);
     }
 
     /* Loop over the 13 different sort arrays. */
@@ -458,6 +457,9 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) {
         c->sorted |= (1 << j);
       }
   }
+  
+  /* Finally, clear the dx_max_sort field of this cell. */
+  c->dx_max_sort = 0.f;
 
 #ifdef SWIFT_DEBUG_CHECKS
   /* Verify the sorting. */
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 8ab099f268c85d05db70b2af11086aac02bdaf7a..c56e509342238bc7ef7e34cb28c7e950d385aad6 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -38,8 +38,11 @@
 #define _DOPAIR_SUBSET_NAIVE(f) PASTE(runner_dopair_subset_naive, f)
 #define DOPAIR_SUBSET_NAIVE _DOPAIR_SUBSET_NAIVE(FUNCTION)
 
-#define _DOPAIR_NAIVE(f) PASTE(runner_dopair_naive, f)
-#define DOPAIR_NAIVE _DOPAIR_NAIVE(FUNCTION)
+#define _DOPAIR1_NAIVE(f) PASTE(runner_dopair1_naive, f)
+#define DOPAIR1_NAIVE _DOPAIR1_NAIVE(FUNCTION)
+
+#define _DOPAIR2_NAIVE(f) PASTE(runner_dopair2_naive, f)
+#define DOPAIR2_NAIVE _DOPAIR2_NAIVE(FUNCTION)
 
 #define _DOSELF_NAIVE(f) PASTE(runner_doself_naive, f)
 #define DOSELF_NAIVE _DOSELF_NAIVE(FUNCTION)
@@ -99,18 +102,152 @@
 #define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION)
 
 /**
- * @brief Compute the interactions between a cell pair.
+ * @brief Compute the interactions between a cell pair (non-symmetric).
  *
  * @param r The #runner.
  * @param ci The first #cell.
  * @param cj The second #cell.
  */
-void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
+void DOPAIR1_NAIVE(struct runner *r, struct cell *restrict ci,
                   struct cell *restrict cj) {
 
   const struct engine *e = r->e;
 
-  error("Don't use in actual runs ! Slow code !");
+#ifndef SWIFT_DEBUG_CHECKS
+  // error("Don't use in actual runs ! Slow code !");
+#endif
+
+#ifdef WITH_VECTORIZATION
+  int icount = 0;
+  float r2q[VEC_SIZE] __attribute__((aligned(16)));
+  float hiq[VEC_SIZE] __attribute__((aligned(16)));
+  float hjq[VEC_SIZE] __attribute__((aligned(16)));
+  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+#endif
+  TIMER_TIC;
+
+  /* Anything to do here? */
+  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
+
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+
+  /* Get the relative distance between the pairs, wrapping. */
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
+    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
+      shift[k] = e->s->dim[k];
+    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
+      shift[k] = -e->s->dim[k];
+  }
+
+  /* Loop over the parts in ci. */
+  for (int pid = 0; pid < count_i; pid++) {
+
+    /* Get a hold of the ith part in ci. */
+    struct part *restrict pi = &parts_i[pid];
+    const float hi = pi->h;
+
+    double pix[3];
+    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
+    const float hig2 = hi * hi * kernel_gamma2;
+
+    /* Loop over the parts in cj. */
+    for (int pjd = 0; pjd < count_j; pjd++) {
+
+      /* Get a pointer to the jth particle. */
+      struct part *restrict pj = &parts_j[pjd];
+
+      /* Compute the pairwise distance. */
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
+        dx[k] = pix[k] - pj->x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      /* Hit or miss? */
+      if (r2 < hig2) {
+
+#ifndef WITH_VECTORIZATION
+
+        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = dx[0];
+        dxq[3 * icount + 1] = dx[1];
+        dxq[3 * icount + 2] = dx[2];
+        hiq[icount] = hi;
+        hjq[icount] = pj->h;
+        piq[icount] = pi;
+        pjq[icount] = pj;
+        icount += 1;
+
+        /* Flush? */
+        if (icount == VEC_SIZE) {
+          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
+
+#endif
+      }
+      if (r2 < pj->h * pj->h * kernel_gamma2) {
+
+#ifndef WITH_VECTORIZATION
+        
+        for (int k = 0; k < 3; k++) dx[k] = -dx[k];
+        IACT_NONSYM(r2, dx, pj->h, hi, pj, pi);
+
+#else
+
+        /* Add this interaction to the queue. */
+        r2q[icount] = r2;
+        dxq[3 * icount + 0] = -dx[0];
+        dxq[3 * icount + 1] = -dx[1];
+        dxq[3 * icount + 2] = -dx[2];
+        hiq[icount] = pj->h;
+        hjq[icount] = hi;
+        piq[icount] = pj;
+        pjq[icount] = pi;
+        icount += 1;
+
+        /* Flush? */
+        if (icount == VEC_SIZE) {
+          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
+          icount = 0;
+        }
+
+#endif
+      }
+
+    } /* loop over the parts in cj. */
+
+  } /* loop over the parts in ci. */
+
+#ifdef WITH_VECTORIZATION
+  /* Pick up any leftovers. */
+  if (icount > 0)
+    for (int k = 0; k < icount; k++)
+      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+#endif
+
+  TIMER_TOC(TIMER_DOPAIR);
+}
+
+void DOPAIR2_NAIVE(struct runner *r, struct cell *restrict ci,
+                  struct cell *restrict cj) {
+
+  const struct engine *e = r->e;
+
+#ifndef SWIFT_DEBUG_CHECKS
+  // error("Don't use in actual runs ! Slow code !");
+#endif
 
 #ifdef WITH_VECTORIZATION
   int icount = 0;
@@ -211,7 +348,9 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {
 
   const struct engine *e = r->e;
 
-  error("Don't use in actual runs ! Slow code !");
+#ifndef SWIFT_DEBUG_CHECKS
+  // error("Don't use in actual runs ! Slow code !");
+#endif
 
 #ifdef WITH_VECTORIZATION
   int icount = 0;
diff --git a/src/scheduler.h b/src/scheduler.h
index f2225f5f5b8d0a5db54eb8506e02d78b14f4bb88..224ed4fd494bf83587b29ee9d402229b379bbee9 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -42,7 +42,7 @@
 /* Some constants. */
 #define scheduler_maxwait 3
 #define scheduler_init_nr_unlocks 10000
-#define scheduler_dosub 1
+#define scheduler_dosub 0
 #define scheduler_maxsteal 10
 #define scheduler_maxtries 2
 #define scheduler_doforcesplit            \
diff --git a/src/space.h b/src/space.h
index e17b827e6f800a387d78a772d21c44093d0af1c9..67ff51cad34b0a9d1f4fd5492ab0e9d80ef064f2 100644
--- a/src/space.h
+++ b/src/space.h
@@ -44,7 +44,7 @@
 #define space_maxcount_default 10000
 #define space_max_top_level_cells_default 12
 #define space_stretch 1.10f
-#define space_maxreldx 0.25f
+#define space_maxreldx 0.1f
 
 /* Maximum allowed depth of cell splits. */
 #define space_cell_maxdepth 52