diff --git a/examples/EAGLE_100/eagle_100.yml b/examples/EAGLE_100/eagle_100.yml
index 71cb0e96307cd6ce737986e3800824fd80313d48..651c906fb33634009f30f818e7a45d806c87fe46 100644
--- a/examples/EAGLE_100/eagle_100.yml
+++ b/examples/EAGLE_100/eagle_100.yml
@@ -40,4 +40,4 @@ SPH:
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  ./EAGLE_ICs_100.hdf5     # The file to read
-
+  cleanup_h:  1
diff --git a/examples/check_ngbs.py b/examples/check_ngbs.py
index fca975e76457445480eb7b19e60e8f9174dee941..a4a07ce7bd6ffb817e8106b74d9895a0edbceca7 100644
--- a/examples/check_ngbs.py
+++ b/examples/check_ngbs.py
@@ -15,8 +15,13 @@ error = False
 inputFile1 = ""
 inputFile2 = ""
 
+# Compare the values of two floats
+def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
+    return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
+
 # Check list of density neighbours and check that they are correct.
-def check_density_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, num_invalid, acc):
+def check_density_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos,
+        h_naive, h_sort, num_invalid, acc):
 
     for k in range(0,num_invalid):
 
@@ -48,12 +53,17 @@ def check_density_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, nu
             pi_pos = pos[np.where(pids == pid)]
             pj_pos = pos[np.where(pids == pjd)]
             
-            hi = h[np.where(pids == pid)]
+            hi = h_naive[np.where(pids == pid)]
             
             dx = pi_pos[0][0] - pj_pos[0][0]
             dy = pi_pos[0][1] - pj_pos[0][1]
             dz = pi_pos[0][2] - pj_pos[0][2]
-            
+           
+            # Correct for BCs
+            dx = nearest(dx)
+            dy = nearest(dy)
+            dz = nearest(dz)
+
             r2 = dx*dx + dy*dy + dz*dz
             
             hig2 = hi*hi*kernel_gamma2
@@ -65,12 +75,21 @@ def check_density_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, nu
             if diff < acc * hig2:
                 print "Missing interaction due to precision issue will be ignored."
             else:
-                return True
+                hi_2 = h_sort[np.where(pids == pid)]
+
+                # If a neigbour is missing and the particle has the same h throw
+                # an error.
+                if(isclose(hi,hi_2)):
+                    print "Missing interaction found but particle has the same smoothing length (hi_1: %e, hi_2: %e)."%(hi, hi_2)
+                    return True
+                else:
+                    print "Missing interaction due to different smoothing lengths will be ignored (hi_1: %e, hi_2: %e)."%(hi, hi_2)
 
     return False
 
 # Check list of force neighbours and check that they are correct.
-def check_force_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, num_invalid, acc):
+def check_force_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos,
+        h_naive, h_sort, num_invalid, acc):
 
     error_val = False
 
@@ -90,13 +109,18 @@ def check_force_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, num_
             pi_pos = pos[np.where(pids == pid)]
             pj_pos = pos[np.where(pids == pjd)]
             
-            hi = h[np.where(pids == pid)]
-            hj = h[np.where(pids == pjd)]
+            hi = h_naive[np.where(pids == pid)]
+            hj = h_naive[np.where(pids == pjd)]
             
             dx = pi_pos[0][0] - pj_pos[0][0]
             dy = pi_pos[0][1] - pj_pos[0][1]
             dz = pi_pos[0][2] - pj_pos[0][2]
-            
+ 
+            # Correct for BCs
+            dx = nearest(dx)
+            dy = nearest(dy)
+            dz = nearest(dz)
+           
             r2 = dx*dx + dy*dy + dz*dz
             
             hig2 = hi*hi*kernel_gamma2
@@ -106,14 +130,26 @@ def check_force_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, num_
             
             print "Particle {} is missing {}, hig2: {}, hjg2: {}, r2: {}, |r2 - max(hig2,hjg2)|: {}".format(pid,pjd,hig2, hjg2, r2, diff)
 
-            
             if diff < acc * max(hig2,hjg2):
                 print "Missing interaction due to precision issue will be ignored."
             else:
-                error_val = True
+                hi_2 = h_sort[np.where(pids == pid)]
+                if(isclose(hi,hi_2)):
+                    print "Missing interaction due to the same smoothing lengths will not be ignored (hi_1: %e, hi_2: %e)."%(hi, hi_2)
+                    error_val = True
+                else:
+                    print "Missing interaction due to different smoothing lengths will be ignored (hi_1: %e, hi_2: %e)."%(hi, hi_2)
 
     return error_val
 
+def nearest(dx):
+    if(dx > 0.5 * box_size):
+        return dx - box_size
+    elif(dx < -0.5 * box_size):
+        return dx + box_size
+    else: 
+        return dx
+
 # Parse command line arguments
 if len(sys.argv) < 3:
     print "Error: pass input files as arguments"
@@ -132,6 +168,8 @@ else:
 file_naive = h.File(inputFile1, "r")
 file_sort = h.File(inputFile2, "r")
 
+box_size = file_naive["/Header"].attrs["BoxSize"][0]
+
 # Read input file fields
 ids_naive = file_naive["/PartType0/ParticleIDs"][:]
 ids_sort = file_sort["/PartType0/ParticleIDs"][:]
@@ -140,7 +178,7 @@ h_naive = file_naive["/PartType0/SmoothingLength"][:]
 h_sort = file_sort["/PartType0/SmoothingLength"][:]
 
 pos_naive = file_naive["/PartType0/Coordinates"][:,:]
-pos_sort = file_sort["/PartType0/Coordinates"][:,:]
+#pos_sort = file_sort["/PartType0/Coordinates"][:,:]
 
 num_density_naive = file_naive["/PartType0/Num_ngb_density"][:]
 num_density_sort = file_sort["/PartType0/Num_ngb_density"][:]
@@ -154,6 +192,7 @@ neighbour_ids_density_sort = file_sort["/PartType0/Ids_ngb_density"][:]
 neighbour_ids_force_naive = file_naive["/PartType0/Ids_ngb_force"][:]
 neighbour_ids_force_sort = file_sort["/PartType0/Ids_ngb_force"][:]
 
+
 #wcount_naive = file_naive["/PartType0/Wcount"][:]
 #wcount_sort = file_sort["/PartType0/Wcount"][:]
 #
@@ -194,7 +233,7 @@ neighbour_ids_force_sort = neighbour_ids_force_sort[index_sort]
 h_naive = h_naive[index_naive]
 h_sort = h_sort[index_sort]
 pos_naive = pos_naive[index_naive]
-pos_sort = pos_sort[index_sort]
+#pos_sort = pos_sort[index_sort]
 
 neighbour_length_naive = len(neighbour_ids_density_naive[0])
 neighbour_length_sort = len(neighbour_ids_density_sort[0])
@@ -234,7 +273,7 @@ print ids_naive[mask_density]
 
 # Check density neighbour lists
 error += check_density_neighbours(ids_naive, neighbour_ids_density_naive,
-        neighbour_ids_density_sort, mask_density, pos_naive, h_naive,
+        neighbour_ids_density_sort, mask_density, pos_naive, h_naive, h_sort,
         num_invalid_density, 2e-6)
 
 print "Num of density interactions", inputFile1
@@ -249,7 +288,7 @@ print ids_naive[mask_force]
 
 # Check force neighbour lists
 error += check_force_neighbours(ids_naive, neighbour_ids_force_naive,
-        neighbour_ids_force_sort, mask_force, pos_naive, h_naive,
+        neighbour_ids_force_sort, mask_force, pos_naive, h_naive, h_sort,
         num_invalid_force, 2e-6)
 
 print "Num of force interactions", inputFile1
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 8975981fd0b3a4a1dab8e2daf278cdd40e2ddbf0..efaa5bf2d172f49954a4e8e118b8b8d555a6b380 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -8,14 +8,16 @@ InternalUnitSystem:
 
 # Parameters for the task scheduling
 Scheduler:
-  nr_queues:              0         # (Optional) The number of task queues to use. Use 0  to let the system decide.
-  cell_max_size:          8000000   # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
-  cell_sub_size_pair:     256000000 # (Optional) Maximal number of interactions per sub-pair task  (this is the default value).
-  cell_sub_size_self:     32000     # (Optional) Maximal number of interactions per sub-self task  (this is the default value).
-  cell_split_size:        400       # (Optional) Maximal number of particles per cell (this is the default value).
-  max_top_level_cells:    12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
-  tasks_per_cell:         0         # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...).
-  mpi_message_limit:      4096      # (Optional) Maximum MPI task message size to send non-buffered, KB.
+  nr_queues:                 0         # (Optional) The number of task queues to use. Use 0  to let the system decide.
+  cell_max_size:             8000000   # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
+  cell_sub_size_pair_hydro:  256000000 # (Optional) Maximal number of interactions per sub-pair hydro task  (this is the default value).
+  cell_sub_size_self_hydro:  32000     # (Optional) Maximal number of interactions per sub-self hydro task  (this is the default value).
+  cell_sub_size_pair_grav:   256000000 # (Optional) Maximal number of interactions per sub-pair gravity task  (this is the default value).
+  cell_sub_size_self_grav:   32000     # (Optional) Maximal number of interactions per sub-self gravity task  (this is the default value).
+  cell_split_size:           400       # (Optional) Maximal number of particles per cell (this is the default value).
+  max_top_level_cells:       12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
+  tasks_per_cell:            0         # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...).
+  mpi_message_limit:         4096      # (Optional) Maximum MPI task message size to send non-buffered, KB.
 
 # Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.)
 TimeIntegration:
diff --git a/src/cache.h b/src/cache.h
index c939da28589c1421e0e4241dca124a8320d5c87b..c41e11c34246ef0de93bb1ae7500277aab555b9e 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -216,6 +216,139 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
 #endif
 }
 
+/**
+ * @brief Populate cache by only reading particles that are within range of
+ * each other within the adjoining cell. Also read the particles into the cache
+ * in sorted order.
+ *
+ * @param ci The i #cell.
+ * @param ci_cache The #cache for cell ci.
+ * @param sort_i The array of sorted particle indices for cell ci.
+ * @param first_pi The first particle in cell ci that is in range.
+ * @param last_pi The last particle in cell ci that is in range.
+ * @param last_pi The last particle in cell ci that is in range.
+ * @param loc The cell location to remove from the particle positions.
+ * @param flipped Flag to check whether the cells have been flipped or not.
+ */
+__attribute__((always_inline)) INLINE void cache_read_particles_subset(
+    const struct cell *restrict const ci, struct cache *restrict const ci_cache,
+    const struct entry *restrict sort_i, int *first_pi, int *last_pi,
+    const double *loc, const int flipped) {
+
+#if defined(GADGET2_SPH)
+
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
+
+  const struct part *restrict parts = ci->parts;
+
+  /* The cell is on the right so read the particles
+   * into the cache from the start of the cell. */
+  if (!flipped) {
+    const int rem = (*last_pi + 1) % VEC_SIZE;
+    if (rem != 0) {
+      const int pad = VEC_SIZE - rem;
+
+      /* Increase last_pi if there are particles in the cell left to read. */
+      if (*last_pi + pad < ci->count) *last_pi += pad;
+    }
+
+    /* Shift the particles positions to a local frame so single precision can be
+     * used instead of double precision. */
+    for (int i = 0; i < *last_pi; i++) {
+      const int idx = sort_i[i].i;
+      x[i] = (float)(parts[idx].x[0] - loc[0]);
+      y[i] = (float)(parts[idx].x[1] - loc[1]);
+      z[i] = (float)(parts[idx].x[2] - loc[2]);
+      h[i] = parts[idx].h;
+      m[i] = parts[idx].mass;
+      vx[i] = parts[idx].v[0];
+      vy[i] = parts[idx].v[1];
+      vz[i] = parts[idx].v[2];
+    }
+
+    /* Pad cache with fake particles that exist outside the cell so will not
+     * interact. We use values of the same magnitude (but negative!) as the real
+     * particles to avoid overflow problems. */
+    const double max_dx = ci->dx_max_part;
+    const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
+                                 -(2. * ci->width[1] + max_dx),
+                                 -(2. * ci->width[2] + max_dx)};
+    const float h_padded = ci->parts[0].h;
+
+    for (int i = *last_pi; i < *last_pi + VEC_SIZE; i++) {
+      x[i] = pos_padded[0];
+      y[i] = pos_padded[1];
+      z[i] = pos_padded[2];
+      h[i] = h_padded;
+
+      m[i] = 1.f;
+      vx[i] = 1.f;
+      vy[i] = 1.f;
+      vz[i] = 1.f;
+    }
+  }
+  /* The cell is on the left so read the particles
+   * into the cache from the end of the cell. */
+  else {
+    const int rem = (ci->count - *first_pi) % VEC_SIZE;
+    if (rem != 0) {
+      const int pad = VEC_SIZE - rem;
+
+      /* Decrease first_pi if there are particles in the cell left to read. */
+      if (*first_pi - pad >= 0) *first_pi -= pad;
+    }
+
+    const int ci_cache_count = ci->count - *first_pi;
+
+    /* Shift the particles positions to a local frame so single precision can be
+     * used instead of double precision. */
+    for (int i = 0; i < ci_cache_count; i++) {
+      const int idx = sort_i[i + *first_pi].i;
+      x[i] = (float)(parts[idx].x[0] - loc[0]);
+      y[i] = (float)(parts[idx].x[1] - loc[1]);
+      z[i] = (float)(parts[idx].x[2] - loc[2]);
+      h[i] = parts[idx].h;
+      m[i] = parts[idx].mass;
+      vx[i] = parts[idx].v[0];
+      vy[i] = parts[idx].v[1];
+      vz[i] = parts[idx].v[2];
+    }
+
+    /* Pad cache with fake particles that exist outside the cell so will not
+     * interact. We use values of the same magnitude (but negative!) as the real
+     * particles to avoid overflow problems. */
+    const double max_dx = ci->dx_max_part;
+    const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
+                                 -(2. * ci->width[1] + max_dx),
+                                 -(2. * ci->width[2] + max_dx)};
+    const float h_padded = ci->parts[0].h;
+
+    for (int i = ci->count - *first_pi; i < ci->count - *first_pi + VEC_SIZE;
+         i++) {
+      x[i] = pos_padded[0];
+      y[i] = pos_padded[1];
+      z[i] = pos_padded[2];
+      h[i] = h_padded;
+
+      m[i] = 1.f;
+      vx[i] = 1.f;
+      vy[i] = 1.f;
+      vz[i] = 1.f;
+    }
+  }
+
+#endif
+}
+
 /**
  * @brief Populate cache for force interactions by reading in the particles in
  * unsorted order.
@@ -287,7 +420,6 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
  * @param shift The amount to shift the particle positions to account for BCs
  * @param first_pi The first particle in cell ci that is in range.
  * @param last_pj The last particle in cell cj that is in range.
- * interaction.
  */
 __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
     const struct cell *restrict const ci, const struct cell *restrict const cj,
@@ -505,7 +637,6 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
  * @param shift The amount to shift the particle positions to account for BCs
  * @param first_pi The first particle in cell ci that is in range.
  * @param last_pj The last particle in cell cj that is in range.
- * interaction.
  */
 __attribute__((always_inline)) INLINE void
 cache_read_two_partial_cells_sorted_force(
diff --git a/src/cell.c b/src/cell.c
index 488637599f3019cd82128f63e58c303e49d9723e..a36b8fd2fd28157fd1b35df572c3c831f78d26a9 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -803,8 +803,8 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset,
 
   /* Fill the buffer with the indices. */
   for (int k = 0; k < count; k++) {
-    const int bid = (buff[k].x[0] > pivot[0]) * 4 +
-                    (buff[k].x[1] > pivot[1]) * 2 + (buff[k].x[2] > pivot[2]);
+    const int bid = (buff[k].x[0] >= pivot[0]) * 4 +
+                    (buff[k].x[1] >= pivot[1]) * 2 + (buff[k].x[2] >= pivot[2]);
     bucket_count[bid]++;
     buff[k].ind = bid;
   }
@@ -876,44 +876,44 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset,
 
   /* Verify a few sub-cells. */
   for (int k = 0; k < c->progeny[0]->count; k++)
-    if (c->progeny[0]->parts[k].x[0] > pivot[0] ||
-        c->progeny[0]->parts[k].x[1] > pivot[1] ||
-        c->progeny[0]->parts[k].x[2] > pivot[2])
+    if (c->progeny[0]->parts[k].x[0] >= pivot[0] ||
+        c->progeny[0]->parts[k].x[1] >= pivot[1] ||
+        c->progeny[0]->parts[k].x[2] >= pivot[2])
       error("Sorting failed (progeny=0).");
   for (int k = 0; k < c->progeny[1]->count; k++)
-    if (c->progeny[1]->parts[k].x[0] > pivot[0] ||
-        c->progeny[1]->parts[k].x[1] > pivot[1] ||
-        c->progeny[1]->parts[k].x[2] <= pivot[2])
+    if (c->progeny[1]->parts[k].x[0] >= pivot[0] ||
+        c->progeny[1]->parts[k].x[1] >= pivot[1] ||
+        c->progeny[1]->parts[k].x[2] < pivot[2])
       error("Sorting failed (progeny=1).");
   for (int k = 0; k < c->progeny[2]->count; k++)
-    if (c->progeny[2]->parts[k].x[0] > pivot[0] ||
-        c->progeny[2]->parts[k].x[1] <= pivot[1] ||
-        c->progeny[2]->parts[k].x[2] > pivot[2])
+    if (c->progeny[2]->parts[k].x[0] >= pivot[0] ||
+        c->progeny[2]->parts[k].x[1] < pivot[1] ||
+        c->progeny[2]->parts[k].x[2] >= pivot[2])
       error("Sorting failed (progeny=2).");
   for (int k = 0; k < c->progeny[3]->count; k++)
-    if (c->progeny[3]->parts[k].x[0] > pivot[0] ||
-        c->progeny[3]->parts[k].x[1] <= pivot[1] ||
-        c->progeny[3]->parts[k].x[2] <= pivot[2])
+    if (c->progeny[3]->parts[k].x[0] >= pivot[0] ||
+        c->progeny[3]->parts[k].x[1] < pivot[1] ||
+        c->progeny[3]->parts[k].x[2] < pivot[2])
       error("Sorting failed (progeny=3).");
   for (int k = 0; k < c->progeny[4]->count; k++)
-    if (c->progeny[4]->parts[k].x[0] <= pivot[0] ||
-        c->progeny[4]->parts[k].x[1] > pivot[1] ||
-        c->progeny[4]->parts[k].x[2] > pivot[2])
+    if (c->progeny[4]->parts[k].x[0] < pivot[0] ||
+        c->progeny[4]->parts[k].x[1] >= pivot[1] ||
+        c->progeny[4]->parts[k].x[2] >= pivot[2])
       error("Sorting failed (progeny=4).");
   for (int k = 0; k < c->progeny[5]->count; k++)
-    if (c->progeny[5]->parts[k].x[0] <= pivot[0] ||
-        c->progeny[5]->parts[k].x[1] > pivot[1] ||
-        c->progeny[5]->parts[k].x[2] <= pivot[2])
+    if (c->progeny[5]->parts[k].x[0] < pivot[0] ||
+        c->progeny[5]->parts[k].x[1] >= pivot[1] ||
+        c->progeny[5]->parts[k].x[2] < pivot[2])
       error("Sorting failed (progeny=5).");
   for (int k = 0; k < c->progeny[6]->count; k++)
-    if (c->progeny[6]->parts[k].x[0] <= pivot[0] ||
-        c->progeny[6]->parts[k].x[1] <= pivot[1] ||
-        c->progeny[6]->parts[k].x[2] > pivot[2])
+    if (c->progeny[6]->parts[k].x[0] < pivot[0] ||
+        c->progeny[6]->parts[k].x[1] < pivot[1] ||
+        c->progeny[6]->parts[k].x[2] >= pivot[2])
       error("Sorting failed (progeny=6).");
   for (int k = 0; k < c->progeny[7]->count; k++)
-    if (c->progeny[7]->parts[k].x[0] <= pivot[0] ||
-        c->progeny[7]->parts[k].x[1] <= pivot[1] ||
-        c->progeny[7]->parts[k].x[2] <= pivot[2])
+    if (c->progeny[7]->parts[k].x[0] < pivot[0] ||
+        c->progeny[7]->parts[k].x[1] < pivot[1] ||
+        c->progeny[7]->parts[k].x[2] < pivot[2])
       error("Sorting failed (progeny=7).");
 #endif
 
diff --git a/src/cooling/grackle/cooling.h b/src/cooling/grackle/cooling.h
index 8e7a7dc86ff6a8f7be2193b43dea10ba933e3765..247df339c68eff6c15c5fcfe07d77afbea0722dd 100644
--- a/src/cooling/grackle/cooling.h
+++ b/src/cooling/grackle/cooling.h
@@ -26,8 +26,8 @@
 
 /* Some standard headers. */
 #include <float.h>
-#include <math.h>
 #include <grackle.h>
+#include <math.h>
 
 /* Local includes. */
 #include "error.h"
@@ -78,37 +78,34 @@ __attribute__((always_inline)) INLINE static float cooling_get_radiated_energy(
   return xp->cooling_data.radiated_energy;
 }
 
-
 /**
  * @brief Prints the properties of the cooling model to stdout.
  *
  * @param cooling The properties of the cooling function.
  */
-__attribute__((always_inline))INLINE static void cooling_print_backend(
+__attribute__((always_inline)) INLINE static void cooling_print_backend(
     const struct cooling_function_data* cooling) {
 
   message("Cooling function is 'Grackle'.");
   message("Using Grackle           = %i", cooling->chemistry.use_grackle);
-  message("Chemical network        = %i", cooling->chemistry.primordial_chemistry);
-  message("Radiative cooling       = %i", cooling->chemistry.with_radiative_cooling);
+  message("Chemical network        = %i",
+          cooling->chemistry.primordial_chemistry);
+  message("Radiative cooling       = %i",
+          cooling->chemistry.with_radiative_cooling);
   message("Metal cooling           = %i", cooling->chemistry.metal_cooling);
-  
-  message("CloudyTable             = %s",
-          cooling->cloudy_table);
+
+  message("CloudyTable             = %s", cooling->cloudy_table);
   message("UVbackground            = %d", cooling->uv_background);
   message("Redshift                = %g", cooling->redshift);
-  message("Density Self Shielding  = %g",
-          cooling->density_self_shielding);
+  message("Density Self Shielding  = %g", cooling->density_self_shielding);
   message("Units:");
   message("\tComoving     = %i", cooling->units.comoving_coordinates);
   message("\tLength       = %g", cooling->units.length_units);
   message("\tDensity      = %g", cooling->units.density_units);
   message("\tTime         = %g", cooling->units.time_units);
   message("\tScale Factor = %g", cooling->units.a_units);
-
 }
 
-
 /**
  * @brief Compute the cooling rate
  *
@@ -126,8 +123,7 @@ __attribute__((always_inline)) INLINE static double cooling_rate(
     const struct cooling_function_data* restrict cooling,
     struct part* restrict p, float dt) {
 
-  if (cooling->chemistry.primordial_chemistry > 1)
-    error("Not implemented");
+  if (cooling->chemistry.primordial_chemistry > 1) error("Not implemented");
 
   /* set current time */
   code_units units = cooling->units;
@@ -144,7 +140,7 @@ __attribute__((always_inline)) INLINE static double cooling_rate(
   int grid_dimension[GRACKLE_RANK] = {GRACKLE_NPART, 1, 1};
   int grid_start[GRACKLE_RANK] = {0, 0, 0};
   int grid_end[GRACKLE_RANK] = {GRACKLE_NPART - 1, 0, 0};
-  
+
   data.grid_rank = GRACKLE_RANK;
   data.grid_dimension = grid_dimension;
   data.grid_start = grid_start;
@@ -171,7 +167,7 @@ __attribute__((always_inline)) INLINE static double cooling_rate(
   /* gr_float HeII_density = 0.; */
   /* gr_float HeIII_density = 0.; */
   /* gr_float e_density = 0.; */
-  
+
   /* data.HI_density = &HI_density; */
   /* data.HII_density = &HII_density; */
   /* data.HeI_density = &HeI_density; */
@@ -183,16 +179,16 @@ __attribute__((always_inline)) INLINE static double cooling_rate(
   /* gr_float HM_density = 0.; */
   /* gr_float H2I_density = 0.; */
   /* gr_float H2II_density = 0.; */
-  
+
   /* data.HM_density = &HM_density; */
   /* data.H2I_density = &H2I_density; */
   /* data.H2II_density = &H2II_density; */
-  
+
   /* /\* primordial chemistry >= 3 *\/ */
   /* gr_float DI_density = 0.; */
   /* gr_float DII_density = 0.; */
   /* gr_float HDI_density = 0.; */
-  
+
   /* data.DI_density = &DI_density; */
   /* data.DII_density = &DII_density; */
   /* data.HDI_density = &HDI_density; */
@@ -206,7 +202,7 @@ __attribute__((always_inline)) INLINE static double cooling_rate(
   /* gr_float volumetric_heating_rate = 0.; */
 
   /* data.volumetric_heating_rate = &volumetric_heating_rate; */
-  
+
   /* /\* specific heating rate *\/ */
   /* gr_float specific_heating_rate = 0.; */
 
@@ -216,7 +212,7 @@ __attribute__((always_inline)) INLINE static double cooling_rate(
   if (solve_chemistry(&units, &data, dt) == 0) {
     error("Error in solve_chemistry.");
   }
-  
+
   return (energy - energy_before) / dt;
 }
 
@@ -244,7 +240,7 @@ __attribute__((always_inline)) INLINE static void cooling_cool_part(
   const float du_dt = cooling_rate(phys_const, us, cooling, p, dt);
 
   /* record energy lost */
-  xp->cooling_data.radiated_energy += - du_dt * dt * hydro_get_mass(p);
+  xp->cooling_data.radiated_energy += -du_dt * dt * hydro_get_mass(p);
 
   /* Update the internal energy */
   hydro_set_internal_energy_dt(p, hydro_du_dt + du_dt);
@@ -276,23 +272,23 @@ __attribute__((always_inline)) INLINE static float cooling_timestep(
  * @param phys_const The physical constants in internal units.
  * @param cooling The cooling properties to initialize
  */
-__attribute__((always_inline))INLINE static void cooling_init_backend(
+__attribute__((always_inline)) INLINE static void cooling_init_backend(
     const struct swift_params* parameter_file, const struct unit_system* us,
     const struct phys_const* phys_const,
     struct cooling_function_data* cooling) {
 
-    /* read parameters */
+  /* read parameters */
   parser_get_param_string(parameter_file, "GrackleCooling:GrackleCloudyTable",
                           cooling->cloudy_table);
   cooling->uv_background =
-    parser_get_param_int(parameter_file, "GrackleCooling:UVbackground");
+      parser_get_param_int(parameter_file, "GrackleCooling:UVbackground");
 
   cooling->redshift =
-    parser_get_param_double(parameter_file, "GrackleCooling:GrackleRedshift");
+      parser_get_param_double(parameter_file, "GrackleCooling:GrackleRedshift");
 
   cooling->density_self_shielding = parser_get_param_double(
       parameter_file, "GrackleCooling:GrackleHSShieldingDensityThreshold");
-  
+
 #ifdef SWIFT_DEBUG_CHECKS
   /* enable verbose for grackle */
   grackle_verbose = 1;
@@ -310,14 +306,16 @@ __attribute__((always_inline))INLINE static void cooling_init_backend(
   cooling->units.comoving_coordinates = 0;
 
   /* then units */
-  cooling->units.density_units = us->UnitMass_in_cgs / pow(us->UnitLength_in_cgs, 3);
+  cooling->units.density_units =
+      us->UnitMass_in_cgs / pow(us->UnitLength_in_cgs, 3);
   cooling->units.length_units = us->UnitLength_in_cgs;
   cooling->units.time_units = us->UnitTime_in_cgs;
-  cooling->units.velocity_units =
-    cooling->units.a_units * cooling->units.length_units / cooling->units.time_units;
+  cooling->units.velocity_units = cooling->units.a_units *
+                                  cooling->units.length_units /
+                                  cooling->units.time_units;
+
+  chemistry_data* chemistry = &cooling->chemistry;
 
-  chemistry_data *chemistry = &cooling->chemistry;
-  
   /* Create a chemistry object for parameters and rate data. */
   if (set_default_chemistry_parameters(chemistry) == 0) {
     error("Error in set_default_chemistry_parameters.");
@@ -341,7 +339,6 @@ __attribute__((always_inline))INLINE static void cooling_init_backend(
     error("Error in initialize_chemistry_data.");
   }
 
-
 #ifdef SWIFT_DEBUG_CHECKS
   if (GRACKLE_NPART != 1)
     error("Grackle with multiple particles not implemented");
@@ -356,11 +353,9 @@ __attribute__((always_inline))INLINE static void cooling_init_backend(
   cooling_print_backend(cooling);
   message("Density Self Shielding = %g atom/cm3", threshold);
 
-
   message("");
   message("***************************************");
 #endif
-
 }
 
 #endif /* SWIFT_COOLING_GRACKLE_H */
diff --git a/src/debug.c b/src/debug.c
index f63ed557d908607e6cd53edf8467675a938245d7..77d791a31787c7997a9a56e3204dac1160178976 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -224,8 +224,9 @@ int checkCellhdxmax(const struct cell *c, int *depth) {
     struct part *const p = &parts[k];
     struct xpart *const xp = &xparts[k];
 
-    if (p->x[0] < loc_min[0] || p->x[0] > loc_max[0] || p->x[1] < loc_min[1] ||
-        p->x[1] > loc_max[1] || p->x[2] < loc_min[2] || p->x[2] > loc_max[2]) {
+    if (p->x[0] < loc_min[0] || p->x[0] >= loc_max[0] || p->x[1] < loc_min[1] ||
+        p->x[1] >= loc_max[1] || p->x[2] < loc_min[2] ||
+        p->x[2] >= loc_max[2]) {
 
       message(
           "Inconsistent part position p->x=[%e %e %e], c->loc=[%e %e %e] "
diff --git a/src/engine.c b/src/engine.c
index 70b3a1fb9b1208dd2dbd8c4657aca9eb1932271c..cef4e3e6f3aa00c40ba92e3980b9fc472b6898bc 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -2744,7 +2744,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
       /* Now, build all the dependencies for the hydro */
       engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->ci,
                                            with_cooling);
-      scheduler_addunlock(sched, t3, t->ci->super->kick2);
+      scheduler_addunlock(sched, t3, t->ci->super->end_force);
 #else
 
       /* Start by constructing the task for the second hydro loop */
@@ -2791,14 +2791,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
       if (t->ci->nodeID == nodeID) {
         engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->ci,
                                              with_cooling);
-        scheduler_addunlock(sched, t3, t->ci->super->kick2);
+        scheduler_addunlock(sched, t3, t->ci->super->end_force);
       }
       if (t->cj->nodeID == nodeID) {
         if (t->ci->super_hydro != t->cj->super_hydro)
           engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->cj,
                                                with_cooling);
         if (t->ci->super != t->cj->super)
-          scheduler_addunlock(sched, t3, t->cj->super->kick2);
+          scheduler_addunlock(sched, t3, t->cj->super->end_force);
       }
 
 #else
@@ -2856,7 +2856,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
       if (t->ci->nodeID == nodeID) {
         engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->ci,
                                              with_cooling);
-        scheduler_addunlock(sched, t3, t->ci->super->kick2);
+        scheduler_addunlock(sched, t3, t->ci->super->end_force);
       }
 
 #else
@@ -2913,14 +2913,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements,
       if (t->ci->nodeID == nodeID) {
         engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->ci,
                                              with_cooling);
-        scheduler_addunlock(sched, t3, t->ci->super->kick2);
+        scheduler_addunlock(sched, t3, t->ci->super->end_force);
       }
       if (t->cj->nodeID == nodeID) {
         if (t->ci->super_hydro != t->cj->super_hydro)
           engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->cj,
                                                with_cooling);
         if (t->ci->super != t->cj->super)
-          scheduler_addunlock(sched, t3, t->cj->super->kick2);
+          scheduler_addunlock(sched, t3, t->cj->super->end_force);
       }
 
 #else
@@ -4251,15 +4251,17 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
     /* Sorting should put the same positions next to each other... */
     int failed = 0;
     double *prev_x = s->parts[0].x;
+    long long *prev_id = &s->parts[0].id;
     for (size_t k = 1; k < s->nr_parts; k++) {
       if (prev_x[0] == s->parts[k].x[0] && prev_x[1] == s->parts[k].x[1] &&
           prev_x[2] == s->parts[k].x[2]) {
         if (e->verbose)
-          message("Two particles occupy location: %f %f %f", prev_x[0],
-                  prev_x[1], prev_x[2]);
+          message("Two particles occupy location: %f %f %f id=%lld id=%lld",
+                  prev_x[0], prev_x[1], prev_x[2], *prev_id, s->parts[k].id);
         failed++;
       }
       prev_x = s->parts[k].x;
+      prev_id = &s->parts[k].id;
     }
     if (failed > 0)
       error(
diff --git a/src/partition.c b/src/partition.c
index 297614e4b930851916e7fce7de6c9ca8faac2655..297ac71ac75ea4a5c5446033418b1bb4352b116b 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -580,7 +580,7 @@ static void repart_edge_metis(int partweights, int bothweights, int timebins,
     if (t->type == task_type_drift_part || t->type == task_type_drift_gpart ||
         t->type == task_type_ghost || t->type == task_type_extra_ghost ||
         t->type == task_type_kick1 || t->type == task_type_kick2 ||
-	t->type == task_type_end_force || t->type == task_type_cooling ||
+        t->type == task_type_end_force || t->type == task_type_cooling ||
         t->type == task_type_timestep || t->type == task_type_init_grav ||
         t->type == task_type_grav_down ||
         t->type == task_type_grav_long_range) {
diff --git a/src/periodic.h b/src/periodic.h
index 5874b8742e89c5c93727111adb5b289cff4cb6a6..7df36ac4c3927363305a7ef9fe3b369f58e7dd85 100644
--- a/src/periodic.h
+++ b/src/periodic.h
@@ -31,12 +31,12 @@
  * Only wraps once. If x > 2b, the returned value will be larger than b.
  * Similarly for x < -b.
  */
-#define box_wrap(x, a, b)                               \
-  ({                                                    \
-    const __typeof__(x) _x = (x);                       \
-    const __typeof__(a) _a = (a);                       \
-    const __typeof__(b) _b = (b);                       \
-    _x < _a ? (_x + _b) : ((_x > _b) ? (_x - _b) : _x); \
+#define box_wrap(x, a, b)                                \
+  ({                                                     \
+    const __typeof__(x) _x = (x);                        \
+    const __typeof__(a) _a = (a);                        \
+    const __typeof__(b) _b = (b);                        \
+    _x < _a ? (_x + _b) : ((_x >= _b) ? (_x - _b) : _x); \
   })
 
 /**
diff --git a/src/runner.c b/src/runner.c
index 289dd95dd027ebbf6d82483a954011de05c61a9f..592ccdb5b411dbfa0ea60dd3928a53a4a79c5cb5 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -774,23 +774,18 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) {
 
             /* Self-interaction? */
             if (l->t->type == task_type_self)
-#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
-              runner_doself_subset_density_vec(r, finger, parts, pid, count);
-#else
-              runner_doself_subset_density(r, finger, parts, pid, count);
-#endif
+              runner_doself_subset_branch_density(r, finger, parts, pid, count);
 
             /* Otherwise, pair interaction? */
             else if (l->t->type == task_type_pair) {
 
               /* Left or right? */
               if (l->t->ci == finger)
-                runner_dopair_subset_density(r, finger, parts, pid, count,
-                                             l->t->cj);
+                runner_dopair_subset_branch_density(r, finger, parts, pid,
+                                                    count, l->t->cj);
               else
-                runner_dopair_subset_density(r, finger, parts, pid, count,
-                                             l->t->ci);
-
+                runner_dopair_subset_branch_density(r, finger, parts, pid,
+                                                    count, l->t->ci);
             }
 
             /* Otherwise, sub-self interaction? */
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 09b6a83ec29dd4f57b8a850ee77a2e5692e1c4ec..dda34a6dc3eb7cb3ee77789cfb4a8cd51db37266 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -41,6 +41,9 @@
 #define _DOPAIR_SUBSET(f) PASTE(runner_dopair_subset, f)
 #define DOPAIR_SUBSET _DOPAIR_SUBSET(FUNCTION)
 
+#define _DOPAIR_SUBSET_BRANCH(f) PASTE(runner_dopair_subset_branch, f)
+#define DOPAIR_SUBSET_BRANCH _DOPAIR_SUBSET_BRANCH(FUNCTION)
+
 #define _DOPAIR_SUBSET_NOSORT(f) PASTE(runner_dopair_subset_nosort, f)
 #define DOPAIR_SUBSET_NOSORT _DOPAIR_SUBSET_NOSORT(FUNCTION)
 
@@ -74,6 +77,9 @@
 #define _DOSELF_SUBSET(f) PASTE(runner_doself_subset, f)
 #define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION)
 
+#define _DOSELF_SUBSET_BRANCH(f) PASTE(runner_doself_subset_branch, f)
+#define DOSELF_SUBSET_BRANCH _DOSELF_SUBSET_BRANCH(FUNCTION)
+
 #define _DOSUB_SELF1(f) PASTE(runner_dosub_self1, f)
 #define DOSUB_SELF1 _DOSUB_SELF1(FUNCTION)
 
@@ -467,27 +473,18 @@ void DOSELF2_NAIVE(struct runner *r, struct cell *restrict c) {
  * @param ind The list of indices of particles in @c ci to interact with.
  * @param count The number of particles in @c ind.
  * @param cj The second #cell.
+ * @param shift The shift vector to apply to the particles in ci.
  */
 void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
                          struct part *restrict parts_i, int *restrict ind,
-                         int count, struct cell *restrict cj) {
-
-  const struct engine *e = r->e;
+                         int count, struct cell *restrict cj,
+                         const double *shift) {
 
   TIMER_TIC;
 
   const int count_j = cj->count;
   struct part *restrict parts_j = cj->parts;
 
-  /* Get the relative distance between the pairs, wrapping. */
-  double shift[3] = {0.0, 0.0, 0.0};
-  for (int k = 0; k < 3; k++) {
-    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
-      shift[k] = e->s->dim[k];
-    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
-      shift[k] = -e->s->dim[k];
-  }
-
   /* Loop over the parts_i. */
   for (int pid = 0; pid < count; pid++) {
 
@@ -499,9 +496,9 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
     const float hig2 = hi * hi * kernel_gamma2;
 
 #ifdef SWIFT_DEBUG_CHECKS
+    const struct engine *e = r->e;
     if (!part_is_active(pi, e))
       error("Trying to correct smoothing length of inactive particle !");
-
 #endif
 
     /* Loop over the parts in cj. */
@@ -547,48 +544,20 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
  * @param ind The list of indices of particles in @c ci to interact with.
  * @param count The number of particles in @c ind.
  * @param cj The second #cell.
+ * @param sid The direction of the pair.
+ * @param flipped Flag to check whether the cells have been flipped or not.
+ * @param shift The shift vector to apply to the particles in ci.
  */
 void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
                    struct part *restrict parts_i, int *restrict ind, int count,
-                   struct cell *restrict cj) {
-
-  const struct engine *e = r->e;
-
-#ifdef SWIFT_USE_NAIVE_INTERACTIONS
-  DOPAIR_SUBSET_NAIVE(r, ci, parts_i, ind, count, cj);
-  return;
-#endif
+                   struct cell *restrict cj, const int sid, const int flipped,
+                   const double *shift) {
 
   TIMER_TIC;
 
   const int count_j = cj->count;
   struct part *restrict parts_j = cj->parts;
 
-  /* Get the relative distance between the pairs, wrapping. */
-  double shift[3] = {0.0, 0.0, 0.0};
-  for (int k = 0; k < 3; k++) {
-    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
-      shift[k] = e->s->dim[k];
-    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
-      shift[k] = -e->s->dim[k];
-  }
-
-  /* Get the sorting index. */
-  int sid = 0;
-  for (int k = 0; k < 3; k++)
-    sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
-                         ? 0
-                         : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);
-
-  /* Switch the cells around? */
-  const int flipped = runner_flip[sid];
-  sid = sortlistID[sid];
-
-  /* Has the cell cj been sorted? */
-  if (!(cj->sorted & (1 << sid)) ||
-      cj->dx_max_sort_old > space_maxreldx * cj->dmin)
-    error("Interacting unsorted cells.");
-
   /* Pick-out the sorted lists. */
   const struct entry *restrict sort_j = cj->sort[sid];
   const float dxj = cj->dx_max_sort;
@@ -624,6 +593,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
         const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
 
 #ifdef SWIFT_DEBUG_CHECKS
+        const struct engine *e = r->e;
         /* Check that particles have been drifted to the current time */
         if (pi->ti_drift != e->ti_current)
           error("Particle pi not drifted to current time");
@@ -671,6 +641,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
         const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
 
 #ifdef SWIFT_DEBUG_CHECKS
+        const struct engine *e = r->e;
         /* Check that particles have been drifted to the current time */
         if (pi->ti_drift != e->ti_current)
           error("Particle pi not drifted to current time");
@@ -690,6 +661,64 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
   TIMER_TOC(timer_dopair_subset);
 }
 
+/**
+ * @brief Determine which version of DOPAIR_SUBSET needs to be called depending
+ * on the
+ * orientation of the cells or whether DOPAIR_SUBSET needs to be called at all.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param parts_i The #part to interact with @c cj.
+ * @param ind The list of indices of particles in @c ci to interact with.
+ * @param count The number of particles in @c ind.
+ * @param cj The second #cell.
+ */
+void DOPAIR_SUBSET_BRANCH(struct runner *r, struct cell *restrict ci,
+                          struct part *restrict parts_i, int *restrict ind,
+                          int count, struct cell *restrict cj) {
+
+  const struct engine *e = r->e;
+
+  /* Get the relative distance between the pairs, wrapping. */
+  double shift[3] = {0.0, 0.0, 0.0};
+  for (int k = 0; k < 3; k++) {
+    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
+      shift[k] = e->s->dim[k];
+    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
+      shift[k] = -e->s->dim[k];
+  }
+
+#if !defined(SWIFT_USE_NAIVE_INTERACTIONS)
+  /* Get the sorting index. */
+  int sid = 0;
+  for (int k = 0; k < 3; k++)
+    sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
+                         ? 0
+                         : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);
+
+  /* Switch the cells around? */
+  const int flipped = runner_flip[sid];
+  sid = sortlistID[sid];
+
+  /* Has the cell cj been sorted? */
+  if (!(cj->sorted & (1 << sid)) ||
+      cj->dx_max_sort_old > space_maxreldx * cj->dmin)
+    error("Interacting unsorted cells.");
+#endif
+
+#if defined(SWIFT_USE_NAIVE_INTERACTIONS)
+  DOPAIR_SUBSET_NAIVE(r, ci, parts_i, ind, count, cj, shift);
+#elif defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
+  if (sort_is_face(sid))
+    runner_dopair_subset_density_vec(r, ci, parts_i, ind, count, cj, sid,
+                                     flipped, shift);
+  else
+    DOPAIR_SUBSET(r, ci, parts_i, ind, count, cj, sid, flipped, shift);
+#else
+  DOPAIR_SUBSET(r, ci, parts_i, ind, count, cj, sid, flipped, shift);
+#endif
+}
+
 /**
  * @brief Compute the interactions between a cell pair, but only for the
  *      given indices in ci.
@@ -757,13 +786,34 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
   TIMER_TOC(timer_doself_subset);
 }
 
+/**
+ * @brief Determine which version of DOSELF_SUBSET needs to be called depending
+ * on the optimisation level.
+
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param parts The #part to interact.
+ * @param ind The list of indices of particles in @c ci to interact with.
+ * @param count The number of particles in @c ind.
+ */
+void DOSELF_SUBSET_BRANCH(struct runner *r, struct cell *restrict ci,
+                          struct part *restrict parts, int *restrict ind,
+                          int count) {
+
+#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
+  runner_doself_subset_density_vec(r, ci, parts, ind, count);
+#else
+  DOSELF_SUBSET(r, ci, parts, ind, count);
+#endif
+}
+
 /**
  * @brief Compute the interactions between a cell pair (non-symmetric).
  *
  * @param r The #runner.
  * @param ci The first #cell.
  * @param cj The second #cell.
- * @param sid The direction of the pair
+ * @param sid The direction of the pair.
  * @param shift The shift vector to apply to the particles in ci.
  */
 void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
@@ -2461,11 +2511,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
 
     /* Otherwise, compute self-interaction. */
     else
-#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
-      runner_doself_subset_density_vec(r, ci, parts, ind, count);
-#else
-      DOSELF_SUBSET(r, ci, parts, ind, count);
-#endif
+      DOSELF_SUBSET_BRANCH(r, ci, parts, ind, count);
   } /* self-interaction. */
 
   /* Otherwise, it's a pair interaction. */
@@ -2987,7 +3033,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
       /* Do any of the cells need to be drifted first? */
       if (!cell_are_part_drifted(cj, e)) error("Cell should be drifted!");
 
-      DOPAIR_SUBSET(r, ci, parts, ind, count, cj);
+      DOPAIR_SUBSET_BRANCH(r, ci, parts, ind, count, cj);
     }
 
   } /* otherwise, pair interaction. */
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index a31b39fc5c443132b7487ccd7d57dc47fd540132..aa6e6dc3c15f1846462b900076cf131e726cd883 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -262,7 +262,7 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
  * @param active_ci Is any particle in cell ci active?
  * @param active_cj Is any particle in cell cj active?
  */
-__attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
+__attribute__((always_inline)) INLINE static void populate_max_index_density(
     const struct cell *ci, const struct cell *cj,
     const struct entry *restrict sort_i, const struct entry *restrict sort_j,
     const float dx_max, const float rshift, const double hi_max,
@@ -421,8 +421,7 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
  * @param active_ci Is any particle in cell ci active?
  * @param active_cj Is any particle in cell cj active?
  */
-__attribute__((always_inline)) INLINE static void
-populate_max_index_no_cache_force(
+__attribute__((always_inline)) INLINE static void populate_max_index_force(
     const struct cell *ci, const struct cell *cj,
     const struct entry *restrict sort_i, const struct entry *restrict sort_j,
     const float dx_max, const float rshift, const double hi_max_raw,
@@ -558,6 +557,88 @@ populate_max_index_no_cache_force(
   *init_pj = last_pj;
 }
 
+/**
+ * @brief Populates the array max_index_i with the maximum
+ * index of
+ * particles into the neighbouring cell. Also finds the first/last pj that
+ * interacts with any particle in ci.
+ *
+ * @param count_i The number of particles in ci.
+ * @param count_j The number of particles in cj.
+ * @param parts_i The #part to interact with @c cj.
+ * @param ind The list of indices of particles in @c ci to interact with.
+ * @param total_ci_shift The shift vector to apply to the particles in ci.
+ * @param dxj Maximum particle movement allowed in cell cj.
+ * @param di_shift_correction The correction to di after the particles have been
+ * shifted to the frame of cell ci.
+ * @param runner_shift_x The runner_shift in the x direction.
+ * @param runner_shift_y The runner_shift in the y direction.
+ * @param runner_shift_z The runner_shift in the z direction.
+ * @param sort_j #entry array for particle distance in cj
+ * @param max_index_i array to hold the maximum distances of pi particles into
+ * #cell cj
+ * @param flipped Flag to check whether the cells have been flipped or not.
+ * @return first_pj/last_pj first or last pj to interact with any particle in ci
+ * depending whether the cells have been flipped or not.
+ */
+__attribute__((always_inline)) INLINE static int populate_max_index_subset(
+    const int count_i, const int count_j, struct part *restrict parts_i,
+    int *restrict ind, const double *total_ci_shift, const float dxj,
+    const double di_shift_correction, const double runner_shift_x,
+    const double runner_shift_y, const double runner_shift_z,
+    const struct entry *restrict sort_j, int *max_index_i, const int flipped) {
+
+  /* The cell is on the right so read the particles
+   * into the cache from the start of the cell. */
+  if (!flipped) {
+
+    /* Find the rightmost particle in cell j that interacts with any
+     * particle in cell i. */
+    int last_pj = 0;
+
+    for (int pid = 0; pid < count_i; pid++) {
+      struct part *restrict pi = &parts_i[ind[pid]];
+      const float pix = pi->x[0] - total_ci_shift[0];
+      const float piy = pi->x[1] - total_ci_shift[1];
+      const float piz = pi->x[2] - total_ci_shift[2];
+      const float hi = pi->h;
+
+      const double di = hi * kernel_gamma + dxj + pix * runner_shift_x +
+                        piy * runner_shift_y + piz * runner_shift_z +
+                        di_shift_correction;
+
+      for (int pjd = last_pj; pjd < count_j && sort_j[pjd].d < di; pjd++)
+        last_pj++;
+
+      max_index_i[pid] = last_pj;
+    }
+    return last_pj;
+  }
+  /* The cell is on the left so read the particles
+   * into the cache from the end of the cell. */
+  else {
+
+    int first_pj = count_j - 1;
+
+    for (int pid = 0; pid < count_i; pid++) {
+      struct part *restrict pi = &parts_i[ind[pid]];
+      const float pix = pi->x[0] - total_ci_shift[0];
+      const float piy = pi->x[1] - total_ci_shift[1];
+      const float piz = pi->x[2] - total_ci_shift[2];
+      const float hi = pi->h;
+
+      const double di = -hi * kernel_gamma - dxj + pix * runner_shift_x +
+                        piy * runner_shift_y + piz * runner_shift_z +
+                        di_shift_correction;
+
+      for (int pjd = first_pj; pjd > 0 && di < sort_j[pjd].d; pjd--) first_pj--;
+
+      max_index_i[pid] = first_pj;
+    }
+    return first_pj;
+  }
+}
+
 #endif /* WITH_VECTORIZATION && GADGET2_SPH */
 
 /**
@@ -567,8 +648,7 @@ populate_max_index_no_cache_force(
  * @param r The #runner.
  * @param c The #cell.
  */
-__attribute__((always_inline)) INLINE void runner_doself1_density_vec(
-    struct runner *r, struct cell *restrict c) {
+void runner_doself1_density_vec(struct runner *r, struct cell *restrict c) {
 
 #if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
 
@@ -802,9 +882,9 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
  * @param ind The list of indices of particles in @c c to interact with.
  * @param pi_count The number of particles in @c ind.
  */
-__attribute__((always_inline)) INLINE void runner_doself_subset_density_vec(
-    struct runner *r, struct cell *restrict c, struct part *restrict parts,
-    int *restrict ind, int pi_count) {
+void runner_doself_subset_density_vec(struct runner *r, struct cell *restrict c,
+                                      struct part *restrict parts,
+                                      int *restrict ind, int pi_count) {
 
 #if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
 
@@ -1026,8 +1106,7 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec(
  * @param r The #runner.
  * @param c The #cell.
  */
-__attribute__((always_inline)) INLINE void runner_doself2_force_vec(
-    struct runner *r, struct cell *restrict c) {
+void runner_doself2_force_vec(struct runner *r, struct cell *restrict c) {
 
 #if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
 
@@ -1309,10 +1388,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[].
    * Also find the first pi that interacts with any particle in cj and the last
    * pj that interacts with any particle in ci. */
-  populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max,
-                              hj_max, di_max, dj_min, max_index_i, max_index_j,
-                              &first_pi, &last_pj, max_active_bin, active_ci,
-                              active_cj);
+  populate_max_index_density(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max,
+                             hj_max, di_max, dj_min, max_index_i, max_index_j,
+                             &first_pi, &last_pj, max_active_bin, active_ci,
+                             active_cj);
 
   /* Limits of the outer loops. */
   const int first_pi_loop = first_pi;
@@ -1364,7 +1443,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       const vector v_hig2 = vector_set1(hig2);
 
       /* Get the inverse of hi. */
-      vector v_hi_inv = vec_reciprocal(v_hi);
+      const vector v_hi_inv = vec_reciprocal(v_hi);
 
       /* Reset cumulative sums of update vectors. */
       vector v_rhoSum = vector_setzero();
@@ -1394,7 +1473,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
         }
 #endif
 
-        /* Load 2 sets of vectors from the particle cache. */
+        /* Load 1 set of vectors from the particle cache. */
         const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]);
         const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]);
         const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]);
@@ -1584,6 +1663,314 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 #endif /* WITH_VECTORIZATION */
 }
 
+/**
+ * @brief Compute the interactions between a cell pair, but only for the
+ *      given indices in ci. (Vectorised)
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param parts_i The #part to interact with @c cj.
+ * @param ind The list of indices of particles in @c ci to interact with.
+ * @param count The number of particles in @c ind.
+ * @param cj The second #cell.
+ * @param sid The direction of the pair.
+ * @param flipped Flag to check whether the cells have been flipped or not.
+ * @param shift The shift vector to apply to the particles in ci.
+ */
+void runner_dopair_subset_density_vec(struct runner *r,
+                                      struct cell *restrict ci,
+                                      struct part *restrict parts_i,
+                                      int *restrict ind, int count,
+                                      struct cell *restrict cj, const int sid,
+                                      const int flipped, const double *shift) {
+
+#ifdef WITH_VECTORIZATION
+
+  TIMER_TIC;
+
+  const int count_j = cj->count;
+
+  /* Pick-out the sorted lists. */
+  const struct entry *restrict sort_j = cj->sort[sid];
+  const float dxj = cj->dx_max_sort;
+
+  /* Get both particle caches from the runner and re-allocate
+   * them if they are not big enough for the cells. */
+  struct cache *restrict cj_cache = &r->cj_cache;
+
+  if (cj_cache->count < count_j) cache_init(cj_cache, count_j);
+
+  /* Pull each runner_shift from memory. */
+  const double runner_shift_x = runner_shift[sid][0];
+  const double runner_shift_y = runner_shift[sid][1];
+  const double runner_shift_z = runner_shift[sid][2];
+
+  const double total_ci_shift[3] = {
+      ci->loc[0] + shift[0], ci->loc[1] + shift[1], ci->loc[2] + shift[2]};
+
+  /* Calculate the correction to di after the particles have been shifted to the
+   * frame of cell ci. */
+  const double di_shift_correction = ci->loc[0] * runner_shift_x +
+                                     ci->loc[1] * runner_shift_y +
+                                     ci->loc[2] * runner_shift_z;
+
+  double rshift = 0.0;
+  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
+
+  int *restrict max_index_i SWIFT_CACHE_ALIGN;
+  max_index_i = r->ci_cache.max_index;
+
+  /* Parts are on the left? */
+  if (!flipped) {
+
+    int last_pj = populate_max_index_subset(
+        count, count_j, parts_i, ind, total_ci_shift, dxj, di_shift_correction,
+        runner_shift_x, runner_shift_y, runner_shift_z, sort_j, max_index_i, 0);
+
+    /* Read the particles from the cell and store them locally in the cache. */
+    cache_read_particles_subset(cj, cj_cache, sort_j, 0, &last_pj, ci->loc, 0);
+
+    const double dj_min = sort_j[0].d;
+
+    /* Loop over the parts_i. */
+    for (int pid = 0; pid < count; pid++) {
+
+      /* Get a hold of the ith part in ci. */
+      struct part *restrict pi = &parts_i[ind[pid]];
+      const float pix = pi->x[0] - total_ci_shift[0];
+      const float piy = pi->x[1] - total_ci_shift[1];
+      const float piz = pi->x[2] - total_ci_shift[2];
+      const float hi = pi->h;
+      const float hig2 = hi * hi * kernel_gamma2;
+
+      /* Skip this particle if no particle in cj is within range of it. */
+      const double di = hi * kernel_gamma + dxj + pix * runner_shift_x +
+                        piy * runner_shift_y + piz * runner_shift_z +
+                        di_shift_correction;
+      if (di < dj_min) continue;
+
+      /* Fill particle pi vectors. */
+      const vector v_pix = vector_set1(pix);
+      const vector v_piy = vector_set1(piy);
+      const vector v_piz = vector_set1(piz);
+      const vector v_hi = vector_set1(hi);
+      const vector v_vix = vector_set1(pi->v[0]);
+      const vector v_viy = vector_set1(pi->v[1]);
+      const vector v_viz = vector_set1(pi->v[2]);
+      const vector v_hig2 = vector_set1(hig2);
+
+      /* Get the inverse of hi. */
+      vector v_hi_inv = vec_reciprocal(v_hi);
+
+      /* Reset cumulative sums of update vectors. */
+      vector v_rhoSum = vector_setzero();
+      vector v_rho_dhSum = vector_setzero();
+      vector v_wcountSum = vector_setzero();
+      vector v_wcount_dhSum = vector_setzero();
+      vector v_div_vSum = vector_setzero();
+      vector v_curlvxSum = vector_setzero();
+      vector v_curlvySum = vector_setzero();
+      vector v_curlvzSum = vector_setzero();
+
+      int exit_iteration_end = max_index_i[pid] + 1;
+
+      /* Loop over the parts in cj. */
+      for (int pjd = 0; pjd < exit_iteration_end; pjd += VEC_SIZE) {
+
+        /* Get the cache index to the jth particle. */
+        const int cj_cache_idx = pjd;
+
+        vector v_dx, v_dy, v_dz, v_r2;
+
+        /* Load 1 set of vectors from the particle cache. */
+        const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]);
+        const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]);
+        const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]);
+
+        /* Compute the pairwise distance. */
+        v_dx.v = vec_sub(v_pix.v, v_pjx.v);
+        v_dy.v = vec_sub(v_piy.v, v_pjy.v);
+        v_dz.v = vec_sub(v_piz.v, v_pjz.v);
+
+        v_r2.v = vec_mul(v_dx.v, v_dx.v);
+        v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
+        v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
+
+        /* Form r2 < hig2 mask. */
+        mask_t v_doi_mask;
+        vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
+
+#ifdef DEBUG_INTERACTIONS_SPH
+        struct part *restrict parts_j = cj->parts;
+        for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
+          if (vec_is_mask_true(v_doi_mask) & (1 << bit_index)) {
+            if (pi->num_ngb_density < MAX_NUM_OF_NEIGHBOURS) {
+              pi->ids_ngbs_density[pi->num_ngb_density] =
+                  parts_j[sort_j[pjd + bit_index].i].id;
+            }
+            ++pi->num_ngb_density;
+          }
+        }
+#endif
+
+        /* If there are any interactions perform them. */
+        if (vec_is_mask_true(v_doi_mask))
+          runner_iact_nonsym_1_vec_density(
+              &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz,
+              &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
+              &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx],
+              &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum,
+              &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum,
+              v_doi_mask);
+
+      } /* loop over the parts in cj. */
+
+      /* Perform horizontal adds on vector sums and store result in pi. */
+      VEC_HADD(v_rhoSum, pi->rho);
+      VEC_HADD(v_rho_dhSum, pi->density.rho_dh);
+      VEC_HADD(v_wcountSum, pi->density.wcount);
+      VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh);
+      VEC_HADD(v_div_vSum, pi->density.div_v);
+      VEC_HADD(v_curlvxSum, pi->density.rot_v[0]);
+      VEC_HADD(v_curlvySum, pi->density.rot_v[1]);
+      VEC_HADD(v_curlvzSum, pi->density.rot_v[2]);
+
+    } /* loop over the parts in ci. */
+  }
+
+  /* Parts are on the right. */
+  else {
+
+    int first_pj = populate_max_index_subset(
+        count, count_j, parts_i, ind, total_ci_shift, dxj, di_shift_correction,
+        runner_shift_x, runner_shift_y, runner_shift_z, sort_j, max_index_i, 1);
+
+    /* Read the particles from the cell and store them locally in the cache. */
+    cache_read_particles_subset(cj, cj_cache, sort_j, &first_pj, 0, ci->loc, 1);
+
+    /* Get the number of particles read into the ci cache. */
+    const int cj_cache_count = count_j - first_pj;
+
+    const double dj_max = sort_j[count_j - 1].d;
+
+    /* Loop over the parts_i. */
+    for (int pid = 0; pid < count; pid++) {
+
+      /* Get a hold of the ith part in ci. */
+      struct part *restrict pi = &parts_i[ind[pid]];
+      const float pix = pi->x[0] - total_ci_shift[0];
+      const float piy = pi->x[1] - total_ci_shift[1];
+      const float piz = pi->x[2] - total_ci_shift[2];
+      const float hi = pi->h;
+      const float hig2 = hi * hi * kernel_gamma2;
+
+      /* Skip this particle if no particle in cj is within range of it. */
+      const double di = -hi * kernel_gamma - dxj + pix * runner_shift_x +
+                        piy * runner_shift_y + piz * runner_shift_z +
+                        di_shift_correction;
+      if (di > dj_max) continue;
+
+      /* Fill particle pi vectors. */
+      const vector v_pix = vector_set1(pix);
+      const vector v_piy = vector_set1(piy);
+      const vector v_piz = vector_set1(piz);
+      const vector v_hi = vector_set1(hi);
+      const vector v_vix = vector_set1(pi->v[0]);
+      const vector v_viy = vector_set1(pi->v[1]);
+      const vector v_viz = vector_set1(pi->v[2]);
+      const vector v_hig2 = vector_set1(hig2);
+
+      /* Get the inverse of hi. */
+      vector v_hi_inv = vec_reciprocal(v_hi);
+
+      /* Reset cumulative sums of update vectors. */
+      vector v_rhoSum = vector_setzero();
+      vector v_rho_dhSum = vector_setzero();
+      vector v_wcountSum = vector_setzero();
+      vector v_wcount_dhSum = vector_setzero();
+      vector v_div_vSum = vector_setzero();
+      vector v_curlvxSum = vector_setzero();
+      vector v_curlvySum = vector_setzero();
+      vector v_curlvzSum = vector_setzero();
+
+      int exit_iteration = max_index_i[pid];
+
+      /* Convert exit iteration to cache indices. */
+      int exit_iteration_align = exit_iteration - first_pj;
+
+      /* Pad the exit iteration align so cache reads are aligned. */
+      const int rem = exit_iteration_align % VEC_SIZE;
+      if (exit_iteration_align < VEC_SIZE) {
+        exit_iteration_align = 0;
+      } else
+        exit_iteration_align -= rem;
+
+      /* Loop over the parts in cj. */
+      for (int cj_cache_idx = exit_iteration_align;
+           cj_cache_idx < cj_cache_count; cj_cache_idx += VEC_SIZE) {
+
+        vector v_dx, v_dy, v_dz, v_r2;
+
+        /* Load 1 set of vectors from the particle cache. */
+        const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]);
+        const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]);
+        const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]);
+
+        /* Compute the pairwise distance. */
+        v_dx.v = vec_sub(v_pix.v, v_pjx.v);
+        v_dy.v = vec_sub(v_piy.v, v_pjy.v);
+        v_dz.v = vec_sub(v_piz.v, v_pjz.v);
+
+        v_r2.v = vec_mul(v_dx.v, v_dx.v);
+        v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
+        v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
+
+        /* Form r2 < hig2 mask. */
+        mask_t v_doi_mask;
+        vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
+
+#ifdef DEBUG_INTERACTIONS_SPH
+        struct part *restrict parts_j = cj->parts;
+        for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
+          if (vec_is_mask_true(v_doi_mask) & (1 << bit_index)) {
+            if (pi->num_ngb_density < MAX_NUM_OF_NEIGHBOURS) {
+              pi->ids_ngbs_density[pi->num_ngb_density] =
+                  parts_j[sort_j[cj_cache_idx + first_pj + bit_index].i].id;
+            }
+            ++pi->num_ngb_density;
+          }
+        }
+#endif
+
+        /* If there are any interactions perform them. */
+        if (vec_is_mask_true(v_doi_mask))
+          runner_iact_nonsym_1_vec_density(
+              &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz,
+              &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
+              &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx],
+              &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum,
+              &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum,
+              v_doi_mask);
+
+      } /* loop over the parts in cj. */
+
+      /* Perform horizontal adds on vector sums and store result in pi. */
+      VEC_HADD(v_rhoSum, pi->rho);
+      VEC_HADD(v_rho_dhSum, pi->density.rho_dh);
+      VEC_HADD(v_wcountSum, pi->density.wcount);
+      VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh);
+      VEC_HADD(v_div_vSum, pi->density.div_v);
+      VEC_HADD(v_curlvxSum, pi->density.rot_v[0]);
+      VEC_HADD(v_curlvySum, pi->density.rot_v[1]);
+      VEC_HADD(v_curlvzSum, pi->density.rot_v[2]);
+
+    } /* loop over the parts in ci. */
+  }
+
+  TIMER_TOC(timer_dopair_subset);
+#endif /* WITH_VECTORIZATION */
+}
+
 /**
  * @brief Compute the force interactions between a cell pair (non-symmetric)
  * using vector intrinsics.
@@ -1691,10 +2078,10 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
   /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
   /* Also find the first pi that interacts with any particle in cj and the last
    * pj that interacts with any particle in ci. */
-  populate_max_index_no_cache_force(
-      ci, cj, sort_i, sort_j, dx_max, rshift, hi_max_raw, hj_max_raw, h_max,
-      di_max, dj_min, max_index_i, max_index_j, &first_pi, &last_pj,
-      max_active_bin, active_ci, active_cj);
+  populate_max_index_force(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max_raw,
+                           hj_max_raw, h_max, di_max, dj_min, max_index_i,
+                           max_index_j, &first_pi, &last_pj, max_active_bin,
+                           active_ci, active_cj);
 
   /* Limits of the outer loops. */
   const int first_pi_loop = first_pi;
diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h
index 66240b1be542ba0c3d76c35e9d6e96a6849b02dc..6b01987c15513343d7a8784322803d7f834dea2c 100644
--- a/src/runner_doiact_vec.h
+++ b/src/runner_doiact_vec.h
@@ -40,6 +40,12 @@ void runner_doself_subset_density_vec(struct runner *r,
                                       int *restrict ind, int count);
 void runner_doself1_density_vec(struct runner *r, struct cell *restrict c);
 void runner_doself2_force_vec(struct runner *r, struct cell *restrict c);
+void runner_dopair_subset_density_vec(struct runner *r,
+                                      struct cell *restrict ci,
+                                      struct part *restrict parts_i,
+                                      int *restrict ind, int count,
+                                      struct cell *restrict cj, const int sid,
+                                      const int flipped, const double *shift);
 void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci,
                                 struct cell *restrict cj, const int sid,
                                 const double *shift);
diff --git a/src/scheduler.c b/src/scheduler.c
index 6ef2a47d61e51a2ac6d2f103666a6cc9abd71a0d..0d96368e5711935813322107df4286176210552a 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -359,7 +359,7 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
       if (cell_can_split_self_task(ci)) {
 
         /* Make a sub? */
-        if (scheduler_dosub && ci->count < space_subsize_self) {
+        if (scheduler_dosub && ci->count < space_subsize_self_hydro) {
 
           /* convert to a self-subtask. */
           t->type = task_type_sub_self;
@@ -419,7 +419,7 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) {
 
         /* Replace by a single sub-task? */
         if (scheduler_dosub && /* Use division to avoid integer overflow. */
-            ci->count * sid_scale[sid] < space_subsize_pair / cj->count &&
+            ci->count * sid_scale[sid] < space_subsize_pair_hydro / cj->count &&
             !sort_is_corner(sid)) {
 
           /* Make this task a sub task. */
@@ -1179,7 +1179,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
   int *tid = s->tasks_ind;
   struct task *tasks = s->tasks;
   const int nodeID = s->nodeID;
-  const float wscale = 0.001;
+  const float wscale = 0.001f;
   const ticks tic = getticks();
 
   /* Run through the tasks backwards and set their weights. */
@@ -1198,47 +1198,47 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
 
       case task_type_self:
         if (t->subtype == task_subtype_grav)
-          cost = 1 * wscale * t->ci->gcount * t->ci->gcount;
+          cost = 1.f * (wscale * t->ci->gcount) * t->ci->gcount;
         else if (t->subtype == task_subtype_external_grav)
-          cost = 1 * wscale * t->ci->gcount;
+          cost = 1.f * wscale * t->ci->gcount;
         else
-          cost = 1 * wscale * t->ci->count * t->ci->count;
+          cost = 1.f * (wscale * t->ci->count) * t->ci->count;
         break;
 
       case task_type_pair:
         if (t->subtype == task_subtype_grav) {
           if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID)
-            cost = 3 * wscale * t->ci->gcount * t->cj->gcount;
+            cost = 3.f * (wscale * t->ci->gcount) * t->cj->gcount;
           else
-            cost = 2 * wscale * t->ci->gcount * t->cj->gcount;
+            cost = 2.f * (wscale * t->ci->gcount) * t->cj->gcount;
         } else {
           if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID)
-            cost =
-                3 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags];
+            cost = 3.f * (wscale * t->ci->count) * t->cj->count *
+                   sid_scale[t->flags];
           else
-            cost =
-                2 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags];
+            cost = 2.f * (wscale * t->ci->count) * t->cj->count *
+                   sid_scale[t->flags];
         }
         break;
 
       case task_type_sub_pair:
         if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) {
           if (t->flags < 0)
-            cost = 3 * wscale * t->ci->count * t->cj->count;
+            cost = 3.f * (wscale * t->ci->count) * t->cj->count;
           else
-            cost =
-                3 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags];
+            cost = 3.f * (wscale * t->ci->count) * t->cj->count *
+                   sid_scale[t->flags];
         } else {
           if (t->flags < 0)
-            cost = 2 * wscale * t->ci->count * t->cj->count;
+            cost = 2.f * (wscale * t->ci->count) * t->cj->count;
           else
-            cost =
-                2 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags];
+            cost = 2.f * (wscale * t->ci->count) * t->cj->count *
+                   sid_scale[t->flags];
         }
         break;
 
       case task_type_sub_self:
-        cost = 1 * wscale * t->ci->count * t->ci->count;
+        cost = 1.f * (wscale * t->ci->count) * t->ci->count;
         break;
       case task_type_ghost:
         if (t->ci == t->ci->super_hydro) cost = wscale * t->ci->count;
@@ -1274,10 +1274,16 @@ void scheduler_reweight(struct scheduler *s, int verbose) {
         cost = wscale * t->ci->count + wscale * t->ci->gcount;
         break;
       case task_type_send:
-        cost = 10 * wscale * t->ci->count * t->ci->count;
+        if (t->ci->count < 1e5)
+          cost = 10.f * (wscale * t->ci->count) * t->ci->count;
+        else
+          cost = 2e9;
         break;
       case task_type_recv:
-        cost = 5 * wscale * t->ci->count * t->ci->count;
+        if (t->ci->count < 1e5)
+          cost = 5.f * (wscale * t->ci->count) * t->ci->count;
+        else
+          cost = 1e9;
         break;
       default:
         cost = 0;
diff --git a/src/space.c b/src/space.c
index 02f285edb7b3069022b5e1e869e975abd3e896af..68d24ae689d1b0c36541b6ec96545276b2191363 100644
--- a/src/space.c
+++ b/src/space.c
@@ -60,8 +60,9 @@
 
 /* Split size. */
 int space_splitsize = space_splitsize_default;
-int space_subsize_pair = space_subsize_pair_default;
-int space_subsize_self = space_subsize_self_default;
+int space_subsize_pair_hydro = space_subsize_pair_hydro_default;
+int space_subsize_self_hydro = space_subsize_self_hydro_default;
+int space_subsize_pair_grav = space_subsize_pair_grav_default;
 int space_subsize_self_grav = space_subsize_self_grav_default;
 int space_maxsize = space_maxsize_default;
 #ifdef SWIFT_DEBUG_CHECKS
@@ -1081,7 +1082,11 @@ void space_parts_get_cell_index_mapper(void *map_data, int nr_parts,
     ind[k] = index;
 
 #ifdef SWIFT_DEBUG_CHECKS
-    if (pos_x > dim_x || pos_y > dim_y || pos_z > pos_z || pos_x < 0. ||
+    if (index < 0 || index >= cdim[0] * cdim[1] * cdim[2])
+      error("Invalid index=%d cdim=[%d %d %d] p->x=[%e %e %e]", index, cdim[0],
+            cdim[1], cdim[2], pos_x, pos_y, pos_z);
+
+    if (pos_x >= dim_x || pos_y >= dim_y || pos_z >= dim_z || pos_x < 0. ||
         pos_y < 0. || pos_z < 0.)
       error("Particle outside of simulation box. p->x=[%e %e %e]", pos_x, pos_y,
             pos_z);
@@ -1138,6 +1143,17 @@ void space_gparts_get_cell_index_mapper(void *map_data, int nr_gparts,
         cell_getid(cdim, pos_x * ih_x, pos_y * ih_y, pos_z * ih_z);
     ind[k] = index;
 
+#ifdef SWIFT_DEBUG_CHECKS
+    if (index < 0 || index >= cdim[0] * cdim[1] * cdim[2])
+      error("Invalid index=%d cdim=[%d %d %d] p->x=[%e %e %e]", index, cdim[0],
+            cdim[1], cdim[2], pos_x, pos_y, pos_z);
+
+    if (pos_x >= dim_x || pos_y >= dim_y || pos_z >= dim_z || pos_x < 0. ||
+        pos_y < 0. || pos_z < 0.)
+      error("Particle outside of simulation box. p->x=[%e %e %e]", pos_x, pos_y,
+            pos_z);
+#endif
+
     /* Update the position */
     gp->x[0] = pos_x;
     gp->x[1] = pos_y;
@@ -1189,6 +1205,17 @@ void space_sparts_get_cell_index_mapper(void *map_data, int nr_sparts,
         cell_getid(cdim, pos_x * ih_x, pos_y * ih_y, pos_z * ih_z);
     ind[k] = index;
 
+#ifdef SWIFT_DEBUG_CHECKS
+    if (index < 0 || index >= cdim[0] * cdim[1] * cdim[2])
+      error("Invalid index=%d cdim=[%d %d %d] p->x=[%e %e %e]", index, cdim[0],
+            cdim[1], cdim[2], pos_x, pos_y, pos_z);
+
+    if (pos_x >= dim_x || pos_y >= dim_y || pos_z >= dim_z || pos_x < 0. ||
+        pos_y < 0. || pos_z < 0.)
+      error("Particle outside of simulation box. p->x=[%e %e %e]", pos_x, pos_y,
+            pos_z);
+#endif
+
     /* Update the position */
     sp->x[0] = pos_x;
     sp->x[1] = pos_y;
@@ -2612,8 +2639,8 @@ void space_first_init_parts(struct space *s) {
     hydro_first_init_part(&p[i], &xp[i]);
 
 #ifdef SWIFT_DEBUG_CHECKS
-    p->ti_drift = 0;
-    p->ti_kick = 0;
+    p[i].ti_drift = 0;
+    p[i].ti_kick = 0;
 #endif
   }
 }
@@ -2861,21 +2888,29 @@ void space_init(struct space *s, const struct swift_params *params,
   /* Get the constants for the scheduler */
   space_maxsize = parser_get_opt_param_int(params, "Scheduler:cell_max_size",
                                            space_maxsize_default);
-  space_subsize_pair = parser_get_opt_param_int(
-      params, "Scheduler:cell_sub_size_pair", space_subsize_pair_default);
-  space_subsize_self = parser_get_opt_param_int(
-      params, "Scheduler:cell_sub_size_self", space_subsize_self_default);
+  space_subsize_pair_hydro =
+      parser_get_opt_param_int(params, "Scheduler:cell_sub_size_pair_hydro",
+                               space_subsize_pair_hydro_default);
+  space_subsize_self_hydro =
+      parser_get_opt_param_int(params, "Scheduler:cell_sub_size_self_hydro",
+                               space_subsize_self_hydro_default);
+  space_subsize_pair_grav =
+      parser_get_opt_param_int(params, "Scheduler:cell_sub_size_pair_grav",
+                               space_subsize_pair_grav_default);
   space_subsize_self_grav =
       parser_get_opt_param_int(params, "Scheduler:cell_sub_size_self_grav",
                                space_subsize_self_grav_default);
   space_splitsize = parser_get_opt_param_int(
       params, "Scheduler:cell_split_size", space_splitsize_default);
 
-  if (verbose)
-    message(
-        "max_size set to %d, sub_size_pair set to %d, sub_size_self set to %d, "
-        "split_size set to %d",
-        space_maxsize, space_subsize_pair, space_subsize_self, space_splitsize);
+  if (verbose) {
+    message("max_size set to %d split_size set to %d", space_maxsize,
+            space_splitsize);
+    message("sub_size_pair_hydro set to %d, sub_size_self_hydro set to %d",
+            space_subsize_pair_hydro, space_subsize_self_hydro);
+    message("sub_size_pair_grav set to %d, sub_size_self_grav set to %d",
+            space_subsize_pair_grav, space_subsize_self_grav);
+  }
 
   /* Apply h scaling */
   const double scaling =
diff --git a/src/space.h b/src/space.h
index c49dc37dc0df27cd3647044f219e36c299dcd73b..2d2a8e8ff43b5bfb586beada0a40d0af4d4e15fb 100644
--- a/src/space.h
+++ b/src/space.h
@@ -43,8 +43,9 @@ struct cell;
 #define space_cellallocchunk 1000
 #define space_splitsize_default 400
 #define space_maxsize_default 8000000
-#define space_subsize_pair_default 256000000
-#define space_subsize_self_default 32000
+#define space_subsize_pair_hydro_default 256000000
+#define space_subsize_self_hydro_default 32000
+#define space_subsize_pair_grav_default 256000000
 #define space_subsize_self_grav_default 32000
 #define space_max_top_level_cells_default 12
 #define space_stretch 1.10f
@@ -56,8 +57,9 @@ struct cell;
 /* Split size. */
 extern int space_splitsize;
 extern int space_maxsize;
-extern int space_subsize_pair;
-extern int space_subsize_self;
+extern int space_subsize_pair_hydro;
+extern int space_subsize_self_hydro;
+extern int space_subsize_pair_grav;
 extern int space_subsize_self_grav;
 
 /**
diff --git a/src/timeline.h b/src/timeline.h
index 352056cf340e7bbbce336e03e67a060f172155b1..0f38ff3e9421c122b61caf74290c170b62b2dd92 100644
--- a/src/timeline.h
+++ b/src/timeline.h
@@ -32,7 +32,7 @@ typedef long long integertime_t;
 typedef char timebin_t;
 
 /*! The number of time bins */
-#define num_time_bins 26
+#define num_time_bins 56
 
 /*! The maximal number of timesteps in a simulation */
 #define max_nr_timesteps (1LL << (num_time_bins + 1))
diff --git a/tests/test27cells.c b/tests/test27cells.c
index c66a612d7325522213075d2d5ecd7e07ead6a17b..2d9cce7028026514e78da0ae09984ab39d7dce38 100644
--- a/tests/test27cells.c
+++ b/tests/test27cells.c
@@ -33,19 +33,24 @@
 #if defined(WITH_VECTORIZATION)
 #define DOSELF1 runner_doself1_density_vec
 #define DOSELF1_SUBSET runner_doself_subset_density_vec
+#define DOPAIR1_SUBSET runner_dopair_subset_branch_density
 #define DOPAIR1 runner_dopair1_branch_density
-#ifdef DOSELF_SUBSET
+#ifdef TEST_DOSELF_SUBSET
 #define DOSELF1_NAME "runner_doself_subset_density_vec"
 #else
 #define DOSELF1_NAME "runner_doself_density_vec"
 #endif
-#define DOPAIR1_NAME "runner_dopair1_density_vec"
+#ifdef TEST_DOPAIR_SUBSET
+#define DOPAIR1_NAME "runner_dopair_subset_branch_density"
+#else
+#define DOPAIR1_NAME "runner_dopair_density_vec"
+#endif
 #endif
 
 #ifndef DOSELF1
 #define DOSELF1 runner_doself1_density
 #define DOSELF1_SUBSET runner_doself_subset_density
-#ifdef DOSELF_SUBSET
+#ifdef TEST_DOSELF_SUBSET
 #define DOSELF1_NAME "runner_doself1_subset_density"
 #else
 #define DOSELF1_NAME "runner_doself1_density"
@@ -54,8 +59,13 @@
 
 #ifndef DOPAIR1
 #define DOPAIR1 runner_dopair1_branch_density
+#define DOPAIR1_SUBSET runner_dopair_subset_branch_density
+#ifdef TEST_DOPAIR_SUBSET
+#define DOPAIR1_NAME "runner_dopair1_subset_branch_density"
+#else
 #define DOPAIR1_NAME "runner_dopair1_density"
 #endif
+#endif
 
 #define NODE_ID 1
 
@@ -316,10 +326,19 @@ void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
 void runner_doself_subset_density(struct runner *r, struct cell *restrict ci,
                                   struct part *restrict parts,
                                   int *restrict ind, int count);
+void runner_dopair_subset_density(struct runner *r, struct cell *restrict ci,
+                                  struct part *restrict parts_i,
+                                  int *restrict ind, int count,
+                                  struct cell *restrict cj);
 void runner_doself_subset_density_vec(struct runner *r,
                                       struct cell *restrict ci,
                                       struct part *restrict parts,
                                       int *restrict ind, int count);
+void runner_dopair_subset_branch_density(struct runner *r,
+                                         struct cell *restrict ci,
+                                         struct part *restrict parts_i,
+                                         int *restrict ind, int count,
+                                         struct cell *restrict cj);
 
 /* And go... */
 int main(int argc, char *argv[]) {
@@ -480,18 +499,7 @@ int main(int argc, char *argv[]) {
     cache_init(&runner.cj_cache, 512);
 #endif
 
-    /* Run all the pairs */
-    for (int j = 0; j < 27; ++j) {
-      if (cells[j] != main_cell) {
-        const ticks sub_tic = getticks();
-
-        DOPAIR1(&runner, main_cell, cells[j]);
-
-        timings[j] += getticks() - sub_tic;
-      }
-    }
-
-#ifdef DOSELF_SUBSET
+#if defined(TEST_DOSELF_SUBSET) || defined(TEST_DOPAIR_SUBSET)
     int *pid = NULL;
     int count = 0;
     if ((pid = malloc(sizeof(int) * main_cell->count)) == NULL)
@@ -503,10 +511,26 @@ int main(int argc, char *argv[]) {
       }
 #endif
 
+    /* Run all the pairs */
+    for (int j = 0; j < 27; ++j) {
+      if (cells[j] != main_cell) {
+        const ticks sub_tic = getticks();
+
+#ifdef TEST_DOPAIR_SUBSET
+        DOPAIR1_SUBSET(&runner, main_cell, main_cell->parts, pid, count,
+                       cells[j]);
+#else
+        DOPAIR1(&runner, main_cell, cells[j]);
+#endif
+
+        timings[j] += getticks() - sub_tic;
+      }
+    }
+
     /* And now the self-interaction */
     const ticks self_tic = getticks();
 
-#ifdef DOSELF_SUBSET
+#ifdef TEST_DOSELF_SUBSET
     DOSELF1_SUBSET(&runner, main_cell, main_cell->parts, pid, count);
 #else
     DOSELF1(&runner, main_cell);