diff --git a/.gitignore b/.gitignore
index c75a8f8fa2c16b8c092ab1a361b003d1e3384032..812aa06dd54ee8bb81e87796deb8ee05160fec72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,7 +55,9 @@ tests/brute_force_125_standard.dat
 tests/swift_dopair_125_standard.dat
 tests/brute_force_125_perturbed.dat
 tests/swift_dopair_125_perturbed.dat
-tests/brute_force_active.dat
+tests/brute_force_pair_active.dat
+tests/brute_force_dopair2_active.dat 
+tests/swift_dopair2_force_active.dat
 tests/brute_force_periodic_BC_perturbed.dat
 tests/swift_dopair_active.dat
 tests/test_nonsym_density_serial.dat
diff --git a/src/cache.h b/src/cache.h
index 22b79b1e49230c434a852da16512493e242ba3f9..3eb1e194dd4232319ac1d4a4323ca8099f044063 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -198,10 +198,7 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
   swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
 
   const struct part *restrict parts = ci->parts;
-  double loc[3];
-  loc[0] = ci->loc[0];
-  loc[1] = ci->loc[1];
-  loc[2] = ci->loc[2];
+  const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
 
   /* Shift the particles positions to a local frame so single precision can be
    * used instead of double precision. */
@@ -210,7 +207,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
     y[i] = (float)(parts[i].x[1] - loc[1]);
     z[i] = (float)(parts[i].x[2] - loc[2]);
     h[i] = parts[i].h;
-
     m[i] = parts[i].mass;
     vx[i] = parts[i].v[0];
     vy[i] = parts[i].v[1];
@@ -254,10 +250,7 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
                             SWIFT_CACHE_ALIGNMENT);
 
   const struct part *restrict parts = ci->parts;
-  double loc[3];
-  loc[0] = ci->loc[0];
-  loc[1] = ci->loc[1];
-  loc[2] = ci->loc[2];
+  const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
 
   /* Shift the particles positions to a local frame so single precision can be
    * used instead of double precision. */
@@ -266,12 +259,10 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
     y[i] = (float)(parts[i].x[1] - loc[1]);
     z[i] = (float)(parts[i].x[2] - loc[2]);
     h[i] = parts[i].h;
-
     m[i] = parts[i].mass;
     vx[i] = parts[i].v[0];
     vy[i] = parts[i].v[1];
     vz[i] = parts[i].v[2];
-
     rho[i] = parts[i].rho;
     grad_h[i] = parts[i].force.f;
     pOrho2[i] = parts[i].force.P_over_rho2;
@@ -296,7 +287,6 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
  * @param shift The amount to shift the particle positions to account for BCs
  * @param first_pi The first particle in cell ci that is in range.
  * @param last_pj The last particle in cell cj that is in range.
- * @param num_vec_proc Number of vectors that will be used to process the
  * interaction.
  */
 __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
@@ -304,38 +294,39 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
     struct cache *restrict const ci_cache,
     struct cache *restrict const cj_cache, const struct entry *restrict sort_i,
     const struct entry *restrict sort_j, const double *restrict const shift,
-    int *first_pi, int *last_pj, const int num_vec_proc) {
+    int *first_pi, int *last_pj) {
+
+  /* Make the number of particles to be read a multiple of the vector size.
+   * This eliminates serial remainder loops where possible when populating the
+   * cache. */
 
-  int idx;
-  /* Pad number of particles read to the vector size. */
-  int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
+  /* Is the number of particles to read a multiple of the vector size? */
+  int rem = (ci->count - *first_pi) % VEC_SIZE;
   if (rem != 0) {
-    int pad = (num_vec_proc * VEC_SIZE) - rem;
+    int pad = VEC_SIZE - rem;
 
+    /* Decrease first_pi if there are particles in the cell left to read. */
     if (*first_pi - pad >= 0) *first_pi -= pad;
   }
 
-  rem = *last_pj % (num_vec_proc * VEC_SIZE);
+  rem = (*last_pj + 1) % VEC_SIZE;
   if (rem != 0) {
-    int pad = (num_vec_proc * VEC_SIZE) - rem;
+    int pad = VEC_SIZE - rem;
 
+    /* Increase last_pj if there are particles in the cell left to read. */
     if (*last_pj + pad < cj->count) *last_pj += pad;
   }
 
-  int first_pi_align = *first_pi;
-  int last_pj_align = *last_pj;
+  /* Get some local pointers */
+  const int first_pi_align = *first_pi;
+  const int last_pj_align = *last_pj;
   const struct part *restrict parts_i = ci->parts;
   const struct part *restrict parts_j = cj->parts;
-  double loc[3];
-  loc[0] = cj->loc[0];
-  loc[1] = cj->loc[1];
-  loc[2] = cj->loc[2];
 
-  /* Shift ci particles for boundary conditions and location of cell.*/
-  double total_ci_shift[3];
-  total_ci_shift[0] = loc[0] + shift[0];
-  total_ci_shift[1] = loc[1] + shift[1];
-  total_ci_shift[2] = loc[2] + shift[2];
+  /* Shift particles to the local frame and account for boundary conditions.*/
+  const double total_ci_shift[3] = {
+      cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
+  const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
 
   /* Let the compiler know that the data is aligned and create pointers to the
    * arrays inside the cache. */
@@ -349,19 +340,15 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
   swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
 
   int ci_cache_count = ci->count - first_pi_align;
+
   /* Shift the particles positions to a local frame (ci frame) so single
-   * precision
-   * can be
-   * used instead of double precision. Also shift the cell ci, particles
-   * positions
-   * due to BCs but leave cell cj. */
+   * precision can be used instead of double precision.  */
   for (int i = 0; i < ci_cache_count; i++) {
-    idx = sort_i[i + first_pi_align].i;
+    const int idx = sort_i[i + first_pi_align].i;
     x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
     y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
     z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
     h[i] = parts_i[idx].h;
-
     m[i] = parts_i[idx].mass;
     vx[i] = parts_i[idx].v[0];
     vy[i] = parts_i[idx].v[1];
@@ -384,36 +371,42 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
           "is not within "
           "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
           "2*space_maxreldx)]. x=%f, ci->width[0]=%f",
-          ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, x[i],
-          ci->width[0]);
+          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
+          cj->loc[2], i, x[i], ci->width[0]);
     if (y[i] > shift_threshold_y || y[i] < -shift_threshold_y)
       error(
           "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d y pos "
           "is not within "
           "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
           "2*space_maxreldx)]. y=%f, ci->width[1]=%f",
-          ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, y[i],
-          ci->width[1]);
+          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
+          cj->loc[2], i, y[i], ci->width[1]);
     if (z[i] > shift_threshold_z || z[i] < -shift_threshold_z)
       error(
           "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d z pos "
           "is not within "
           "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
           "2*space_maxreldx)]. z=%f, ci->width[2]=%f",
-          ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, z[i],
-          ci->width[2]);
+          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
+          cj->loc[2], i, z[i], ci->width[2]);
   }
 #endif
 
   /* Pad cache with fake particles that exist outside the cell so will not
-   * interact.*/
-  float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0];
+   * interact. We use values of the same magnitude (but negative!) as the real
+   * particles to avoid overflow problems. */
+  const double max_dx = max(ci->dx_max_part, cj->dx_max_part);
+  const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
+                               -(2. * ci->width[1] + max_dx),
+                               -(2. * ci->width[2] + max_dx)};
+  const float h_padded = ci->parts[0].h;
+
   for (int i = ci->count - first_pi_align;
        i < ci->count - first_pi_align + VEC_SIZE; i++) {
-    x[i] = fake_pix;
-    y[i] = 1.f;
-    z[i] = 1.f;
-    h[i] = 1.f;
+    x[i] = pos_padded[0];
+    y[i] = pos_padded[1];
+    z[i] = pos_padded[2];
+    h[i] = h_padded;
 
     m[i] = 1.f;
     vx[i] = 1.f;
@@ -433,12 +426,11 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
   swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
 
   for (int i = 0; i <= last_pj_align; i++) {
-    idx = sort_j[i].i;
-    xj[i] = (float)(parts_j[idx].x[0] - loc[0]);
-    yj[i] = (float)(parts_j[idx].x[1] - loc[1]);
-    zj[i] = (float)(parts_j[idx].x[2] - loc[2]);
+    const int idx = sort_j[i].i;
+    xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
+    yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
+    zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
     hj[i] = parts_j[idx].h;
-
     mj[i] = parts_j[idx].mass;
     vxj[i] = parts_j[idx].v[0];
     vyj[i] = parts_j[idx].v[1];
@@ -454,40 +446,228 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
           "pos is not within "
           "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
           "2*space_maxreldx)]. xj=%f, ci->width[0]=%f",
-          ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, xj[i],
-          ci->width[0]);
+          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
+          cj->loc[2], i, xj[i], ci->width[0]);
     if (yj[i] > shift_threshold_y || yj[i] < -shift_threshold_y)
       error(
           "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d yj "
           "pos is not within "
           "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
           "2*space_maxreldx)]. yj=%f, ci->width[1]=%f",
-          ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, yj[i],
-          ci->width[1]);
+          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
+          cj->loc[2], i, yj[i], ci->width[1]);
     if (zj[i] > shift_threshold_z || zj[i] < -shift_threshold_z)
       error(
           "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d zj "
           "pos is not within "
           "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
           "2*space_maxreldx)]. zj=%f, ci->width[2]=%f",
-          ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, zj[i],
-          ci->width[2]);
+          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
+          cj->loc[2], i, zj[i], ci->width[2]);
   }
 #endif
 
   /* Pad cache with fake particles that exist outside the cell so will not
-   * interact.*/
-  float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
+   * interact. We use values of the same magnitude (but negative!) as the real
+   * particles to avoid overflow problems. */
+  const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
+                                 -(2. * cj->width[1] + max_dx),
+                                 -(2. * cj->width[2] + max_dx)};
+  const float h_padded_j = cj->parts[0].h;
+
   for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
-    xj[i] = fake_pjx;
-    yj[i] = 1.f;
-    zj[i] = 1.f;
-    hj[i] = 1.f;
+    xj[i] = pos_padded_j[0];
+    yj[i] = pos_padded_j[1];
+    zj[i] = pos_padded_j[2];
+    hj[i] = h_padded_j;
+    mj[i] = 1.f;
+    vxj[i] = 1.f;
+    vyj[i] = 1.f;
+    vzj[i] = 1.f;
+  }
+}
+
+/**
+ * @brief Populate caches by only reading particles that are within range of
+ * each other within the adjoining cell.Also read the particles into the cache
+ * in sorted order.
+ *
+ * @param ci The i #cell.
+ * @param cj The j #cell.
+ * @param ci_cache The #cache for cell ci.
+ * @param cj_cache The #cache for cell cj.
+ * @param sort_i The array of sorted particle indices for cell ci.
+ * @param sort_j The array of sorted particle indices for cell ci.
+ * @param shift The amount to shift the particle positions to account for BCs
+ * @param first_pi The first particle in cell ci that is in range.
+ * @param last_pj The last particle in cell cj that is in range.
+ * interaction.
+ */
+__attribute__((always_inline)) INLINE void
+cache_read_two_partial_cells_sorted_force(
+    const struct cell *const ci, const struct cell *const cj,
+    struct cache *const ci_cache, struct cache *const cj_cache,
+    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
+    const double *const shift, int *first_pi, int *last_pj) {
+
+  /* Make the number of particles to be read a multiple of the vector size.
+   * This eliminates serial remainder loops where possible when populating the
+   * cache. */
+
+  /* Is the number of particles to read a multiple of the vector size? */
+  int rem = (ci->count - *first_pi) % VEC_SIZE;
+  if (rem != 0) {
+    int pad = VEC_SIZE - rem;
+
+    /* Decrease first_pi if there are particles in the cell left to read. */
+    if (*first_pi - pad >= 0) *first_pi -= pad;
+  }
+
+  rem = (*last_pj + 1) % VEC_SIZE;
+  if (rem != 0) {
+    int pad = VEC_SIZE - rem;
+
+    /* Increase last_pj if there are particles in the cell left to read. */
+    if (*last_pj + pad < cj->count) *last_pj += pad;
+  }
+
+  /* Get some local pointers */
+  const int first_pi_align = *first_pi;
+  const int last_pj_align = *last_pj;
+  const struct part *restrict parts_i = ci->parts;
+  const struct part *restrict parts_j = cj->parts;
+
+  /* Shift particles to the local frame and account for boundary conditions.*/
+  const double total_ci_shift[3] = {
+      cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
+  const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
+
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h,
+                            SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2,
+                            SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, balsara, ci_cache->balsara,
+                            SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
+                            SWIFT_CACHE_ALIGNMENT);
+
+  int ci_cache_count = ci->count - first_pi_align;
+  /* Shift the particles positions to a local frame (ci frame) so single
+   * precision can be  used instead of double precision.  */
+  for (int i = 0; i < ci_cache_count; i++) {
+
+    const int idx = sort_i[i + first_pi_align].i;
+    x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
+    y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
+    z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
+    h[i] = parts_i[idx].h;
+    m[i] = parts_i[idx].mass;
+    vx[i] = parts_i[idx].v[0];
+    vy[i] = parts_i[idx].v[1];
+    vz[i] = parts_i[idx].v[2];
+    rho[i] = parts_i[idx].rho;
+    grad_h[i] = parts_i[idx].force.f;
+    pOrho2[i] = parts_i[idx].force.P_over_rho2;
+    balsara[i] = parts_i[idx].force.balsara;
+    soundspeed[i] = parts_i[idx].force.soundspeed;
+  }
+
+  /* Pad cache with fake particles that exist outside the cell so will not
+   * interact. We use values of the same magnitude (but negative!) as the real
+   * particles to avoid overflow problems. */
+  const double max_dx = max(ci->dx_max_part, cj->dx_max_part);
+  const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
+                               -(2. * ci->width[1] + max_dx),
+                               -(2. * ci->width[2] + max_dx)};
+  const float h_padded = ci->parts[0].h;
+
+  for (int i = ci->count - first_pi_align;
+       i < ci->count - first_pi_align + VEC_SIZE; i++) {
+    x[i] = pos_padded[0];
+    y[i] = pos_padded[1];
+    z[i] = pos_padded[2];
+    h[i] = h_padded;
+    m[i] = 1.f;
+    vx[i] = 1.f;
+    vy[i] = 1.f;
+    vz[i] = 1.f;
+    rho[i] = 1.f;
+    grad_h[i] = 1.f;
+    pOrho2[i] = 1.f;
+    balsara[i] = 1.f;
+    soundspeed[i] = 1.f;
+  }
+
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, rhoj, cj_cache->rho, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, grad_hj, cj_cache->grad_h,
+                            SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, pOrho2j, cj_cache->pOrho2,
+                            SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, balsaraj, cj_cache->balsara,
+                            SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, soundspeedj, cj_cache->soundspeed,
+                            SWIFT_CACHE_ALIGNMENT);
+
+  for (int i = 0; i <= last_pj_align; i++) {
+    const int idx = sort_j[i].i;
+    xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
+    yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
+    zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
+    hj[i] = parts_j[idx].h;
+    mj[i] = parts_j[idx].mass;
+    vxj[i] = parts_j[idx].v[0];
+    vyj[i] = parts_j[idx].v[1];
+    vzj[i] = parts_j[idx].v[2];
+    rhoj[i] = parts_j[idx].rho;
+    grad_hj[i] = parts_j[idx].force.f;
+    pOrho2j[i] = parts_j[idx].force.P_over_rho2;
+    balsaraj[i] = parts_j[idx].force.balsara;
+    soundspeedj[i] = parts_j[idx].force.soundspeed;
+  }
 
+  /* Pad cache with fake particles that exist outside the cell so will not
+   * interact. We use values of the same magnitude (but negative!) as the real
+   * particles to avoid overflow problems. */
+  const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
+                                 -(2. * cj->width[1] + max_dx),
+                                 -(2. * cj->width[2] + max_dx)};
+  const float h_padded_j = cj->parts[0].h;
+
+  for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
+    xj[i] = pos_padded_j[0];
+    yj[i] = pos_padded_j[1];
+    zj[i] = pos_padded_j[2];
+    hj[i] = h_padded_j;
     mj[i] = 1.f;
     vxj[i] = 1.f;
     vyj[i] = 1.f;
     vzj[i] = 1.f;
+    rhoj[i] = 1.f;
+    grad_hj[i] = 1.f;
+    pOrho2j[i] = 1.f;
+    balsaraj[i] = 1.f;
+    soundspeedj[i] = 1.f;
   }
 }
 
@@ -506,6 +686,11 @@ static INLINE void cache_clean(struct cache *c) {
     free(c->vz);
     free(c->h);
     free(c->max_index);
+    free(c->rho);
+    free(c->grad_h);
+    free(c->pOrho2);
+    free(c->balsara);
+    free(c->soundspeed);
   }
 }
 
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index 3851dd6e894eafff29d35942ab0f364b5919e029..666e0be835f114959f2f2a074900700b383e10fe 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -170,17 +170,15 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
                                  mask_t mask) {
 
   vector r, ri, ui, wi, wi_dx;
-  vector mj;
   vector dvx, dvy, dvz;
-  vector vjx, vjy, vjz;
   vector dvdr;
   vector curlvrx, curlvry, curlvrz;
 
   /* Fill the vectors. */
-  mj.v = vec_load(Mj);
-  vjx.v = vec_load(Vjx);
-  vjy.v = vec_load(Vjy);
-  vjz.v = vec_load(Vjz);
+  const vector mj = vector_load(Mj);
+  const vector vjx = vector_load(Vjx);
+  const vector vjy = vector_load(Vjy);
+  const vector vjz = vector_load(Vjz);
 
   /* Get the radius and inverse radius. */
   ri = vec_reciprocal_sqrt(*r2);
@@ -245,38 +243,34 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
                                  vector *curlvySum, vector *curlvzSum,
                                  mask_t mask, mask_t mask2, short mask_cond) {
 
-  vector r, ri, r2, ui, wi, wi_dx;
-  vector mj;
-  vector dx, dy, dz, dvx, dvy, dvz;
-  vector vjx, vjy, vjz;
+  vector r, ri, ui, wi, wi_dx;
+  vector dvx, dvy, dvz;
   vector dvdr;
   vector curlvrx, curlvry, curlvrz;
-  vector r_2, ri2, r2_2, ui2, wi2, wi_dx2;
-  vector mj2;
-  vector dx2, dy2, dz2, dvx2, dvy2, dvz2;
-  vector vjx2, vjy2, vjz2;
+  vector r_2, ri2, ui2, wi2, wi_dx2;
+  vector dvx2, dvy2, dvz2;
   vector dvdr2;
   vector curlvrx2, curlvry2, curlvrz2;
 
   /* Fill the vectors. */
-  mj.v = vec_load(Mj);
-  mj2.v = vec_load(&Mj[VEC_SIZE]);
-  vjx.v = vec_load(Vjx);
-  vjx2.v = vec_load(&Vjx[VEC_SIZE]);
-  vjy.v = vec_load(Vjy);
-  vjy2.v = vec_load(&Vjy[VEC_SIZE]);
-  vjz.v = vec_load(Vjz);
-  vjz2.v = vec_load(&Vjz[VEC_SIZE]);
-  dx.v = vec_load(Dx);
-  dx2.v = vec_load(&Dx[VEC_SIZE]);
-  dy.v = vec_load(Dy);
-  dy2.v = vec_load(&Dy[VEC_SIZE]);
-  dz.v = vec_load(Dz);
-  dz2.v = vec_load(&Dz[VEC_SIZE]);
+  const vector mj = vector_load(Mj);
+  const vector mj2 = vector_load(&Mj[VEC_SIZE]);
+  const vector vjx = vector_load(Vjx);
+  const vector vjx2 = vector_load(&Vjx[VEC_SIZE]);
+  const vector vjy = vector_load(Vjy);
+  const vector vjy2 = vector_load(&Vjy[VEC_SIZE]);
+  const vector vjz = vector_load(Vjz);
+  const vector vjz2 = vector_load(&Vjz[VEC_SIZE]);
+  const vector dx = vector_load(Dx);
+  const vector dx2 = vector_load(&Dx[VEC_SIZE]);
+  const vector dy = vector_load(Dy);
+  const vector dy2 = vector_load(&Dy[VEC_SIZE]);
+  const vector dz = vector_load(Dz);
+  const vector dz2 = vector_load(&Dz[VEC_SIZE]);
 
   /* Get the radius and inverse radius. */
-  r2.v = vec_load(R2);
-  r2_2.v = vec_load(&R2[VEC_SIZE]);
+  const vector r2 = vector_load(R2);
+  const vector r2_2 = vector_load(&R2[VEC_SIZE]);
   ri = vec_reciprocal_sqrt(r2);
   ri2 = vec_reciprocal_sqrt(r2_2);
   r.v = vec_mul(r2.v, ri.v);
@@ -592,30 +586,29 @@ runner_iact_nonsym_1_vec_force(
 #ifdef WITH_VECTORIZATION
 
   vector r, ri;
-  vector vjx, vjy, vjz, dvx, dvy, dvz;
-  vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj;
+  vector dvx, dvy, dvz;
   vector xi, xj;
   vector hid_inv, hjd_inv;
   vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
   vector piax, piay, piaz;
   vector pih_dt;
   vector v_sig;
-  vector omega_ij, mu_ij, fac_mu, balsara;
+  vector omega_ij, mu_ij, balsara;
   vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
 
   /* Fill vectors. */
-  vjx.v = vec_load(Vjx);
-  vjy.v = vec_load(Vjy);
-  vjz.v = vec_load(Vjz);
-  mj.v = vec_load(Mj);
-
-  pjrho.v = vec_load(Pjrho);
-  grad_hj.v = vec_load(Grad_hj);
-  pjPOrho2.v = vec_load(PjPOrho2);
-  balsara_j.v = vec_load(Balsara_j);
-  cj.v = vec_load(Cj);
-
-  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
+  const vector vjx = vector_load(Vjx);
+  const vector vjy = vector_load(Vjy);
+  const vector vjz = vector_load(Vjz);
+  const vector mj = vector_load(Mj);
+  const vector pjrho = vector_load(Pjrho);
+  const vector grad_hj = vector_load(Grad_hj);
+  const vector pjPOrho2 = vector_load(PjPOrho2);
+  const vector balsara_j = vector_load(Balsara_j);
+  const vector cj = vector_load(Cj);
+
+  const vector fac_mu =
+      vector_set1(1.f); /* Will change with cosmological integration */
 
   /* Load stuff. */
   balsara.v = vec_add(balsara_i.v, balsara_j.v);
@@ -634,7 +627,7 @@ runner_iact_nonsym_1_vec_force(
   hjd_inv = pow_dimension_plus_one_vec(hj_inv);
   xj.v = vec_mul(r.v, hj_inv.v);
 
-  /* Calculate the kernel for two particles. */
+  /* Calculate the kernel. */
   kernel_eval_dWdx_force_vec(&xj, &wj_dx);
 
   wj_dr.v = vec_mul(hjd_inv.v, wj_dx.v);
@@ -720,23 +713,19 @@ runner_iact_nonsym_2_vec_force(
 
 #ifdef WITH_VECTORIZATION
 
-  vector r, r2, ri;
-  vector dx, dy, dz, dvx, dvy, dvz;
-  vector vjx, vjy, vjz;
-  vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv;
+  vector r, ri;
+  vector dvx, dvy, dvz;
   vector ui, uj;
   vector hid_inv, hjd_inv;
   vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
   vector piax, piay, piaz;
   vector pih_dt;
   vector v_sig;
-  vector omega_ij, mu_ij, fac_mu, balsara;
+  vector omega_ij, mu_ij, balsara;
   vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
 
-  vector r_2, r2_2, ri_2;
-  vector dx_2, dy_2, dz_2, dvx_2, dvy_2, dvz_2;
-  vector vjx_2, vjy_2, vjz_2;
-  vector pjrho_2, grad_hj_2, pjPOrho2_2, balsara_j_2, cj_2, mj_2, hj_inv_2;
+  vector r_2, ri_2;
+  vector dvx_2, dvy_2, dvz_2;
   vector ui_2, uj_2;
   vector hjd_inv_2;
   vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2;
@@ -747,44 +736,45 @@ runner_iact_nonsym_2_vec_force(
   vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2;
 
   /* Fill vectors. */
-  mj.v = vec_load(Mj);
-  mj_2.v = vec_load(&Mj[VEC_SIZE]);
-  vjx.v = vec_load(Vjx);
-  vjx_2.v = vec_load(&Vjx[VEC_SIZE]);
-  vjy.v = vec_load(Vjy);
-  vjy_2.v = vec_load(&Vjy[VEC_SIZE]);
-  vjz.v = vec_load(Vjz);
-  vjz_2.v = vec_load(&Vjz[VEC_SIZE]);
-  dx.v = vec_load(Dx);
-  dx_2.v = vec_load(&Dx[VEC_SIZE]);
-  dy.v = vec_load(Dy);
-  dy_2.v = vec_load(&Dy[VEC_SIZE]);
-  dz.v = vec_load(Dz);
-  dz_2.v = vec_load(&Dz[VEC_SIZE]);
+  const vector mj = vector_load(Mj);
+  const vector mj_2 = vector_load(&Mj[VEC_SIZE]);
+  const vector vjx = vector_load(Vjx);
+  const vector vjx_2 = vector_load(&Vjx[VEC_SIZE]);
+  const vector vjy = vector_load(Vjy);
+  const vector vjy_2 = vector_load(&Vjy[VEC_SIZE]);
+  const vector vjz = vector_load(Vjz);
+  const vector vjz_2 = vector_load(&Vjz[VEC_SIZE]);
+  const vector dx = vector_load(Dx);
+  const vector dx_2 = vector_load(&Dx[VEC_SIZE]);
+  const vector dy = vector_load(Dy);
+  const vector dy_2 = vector_load(&Dy[VEC_SIZE]);
+  const vector dz = vector_load(Dz);
+  const vector dz_2 = vector_load(&Dz[VEC_SIZE]);
 
   /* Get the radius and inverse radius. */
-  r2.v = vec_load(R2);
-  r2_2.v = vec_load(&R2[VEC_SIZE]);
+  const vector r2 = vector_load(R2);
+  const vector r2_2 = vector_load(&R2[VEC_SIZE]);
   ri = vec_reciprocal_sqrt(r2);
   ri_2 = vec_reciprocal_sqrt(r2_2);
   r.v = vec_mul(r2.v, ri.v);
   r_2.v = vec_mul(r2_2.v, ri_2.v);
 
   /* Get remaining properties. */
-  pjrho.v = vec_load(Pjrho);
-  pjrho_2.v = vec_load(&Pjrho[VEC_SIZE]);
-  grad_hj.v = vec_load(Grad_hj);
-  grad_hj_2.v = vec_load(&Grad_hj[VEC_SIZE]);
-  pjPOrho2.v = vec_load(PjPOrho2);
-  pjPOrho2_2.v = vec_load(&PjPOrho2[VEC_SIZE]);
-  balsara_j.v = vec_load(Balsara_j);
-  balsara_j_2.v = vec_load(&Balsara_j[VEC_SIZE]);
-  cj.v = vec_load(Cj);
-  cj_2.v = vec_load(&Cj[VEC_SIZE]);
-  hj_inv.v = vec_load(Hj_inv);
-  hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]);
-
-  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
+  const vector pjrho = vector_load(Pjrho);
+  const vector pjrho_2 = vector_load(&Pjrho[VEC_SIZE]);
+  const vector grad_hj = vector_load(Grad_hj);
+  const vector grad_hj_2 = vector_load(&Grad_hj[VEC_SIZE]);
+  const vector pjPOrho2 = vector_load(PjPOrho2);
+  const vector pjPOrho2_2 = vector_load(&PjPOrho2[VEC_SIZE]);
+  const vector balsara_j = vector_load(Balsara_j);
+  const vector balsara_j_2 = vector_load(&Balsara_j[VEC_SIZE]);
+  const vector cj = vector_load(Cj);
+  const vector cj_2 = vector_load(&Cj[VEC_SIZE]);
+  const vector hj_inv = vector_load(Hj_inv);
+  const vector hj_inv_2 = vector_load(&Hj_inv[VEC_SIZE]);
+
+  const vector fac_mu =
+      vector_set1(1.f); /* Will change with cosmological integration */
 
   /* Find the balsara switch. */
   balsara.v = vec_add(balsara_i.v, balsara_j.v);
diff --git a/src/runner.c b/src/runner.c
index b7ebf652738948141f8f3983b56331bad20e5a9d..73d7b5517fa073bef8e6fc3764733c7ec3e1f732 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1885,7 +1885,7 @@ void *runner_main(void *data) {
             runner_dopair1_branch_gradient(r, ci, cj);
 #endif
           else if (t->subtype == task_subtype_force)
-            runner_dopair2_force(r, ci, cj);
+            runner_dopair2_branch_force(r, ci, cj);
           else if (t->subtype == task_subtype_grav)
             runner_dopair_grav(r, ci, cj, 1);
           else
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 3ea935b950a3989fbb79499a81a54f1bf8aca432..0d642b23c46aee7d31b9b3469afcca1a1041dbc6 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -32,6 +32,9 @@
 #define _DOPAIR1(f) PASTE(runner_dopair1, f)
 #define DOPAIR1 _DOPAIR1(FUNCTION)
 
+#define _DOPAIR2_BRANCH(f) PASTE(runner_dopair2_branch, f)
+#define DOPAIR2_BRANCH _DOPAIR2_BRANCH(FUNCTION)
+
 #define _DOPAIR2(f) PASTE(runner_dopair2, f)
 #define DOPAIR2 _DOPAIR2(FUNCTION)
 
@@ -778,37 +781,6 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
   const struct entry *restrict sort_j = cj->sort[sid];
 
 #ifdef SWIFT_DEBUG_CHECKS
-  /* Check that the dx_max_sort values in the cell are indeed an upper
-     bound on particle movement. */
-  for (int pid = 0; pid < ci->count; pid++) {
-    const struct part *p = &ci->parts[sort_i[pid].i];
-    const float d = p->x[0] * runner_shift[sid][0] +
-                    p->x[1] * runner_shift[sid][1] +
-                    p->x[2] * runner_shift[sid][2];
-    if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
-        1.0e-4 * max(fabsf(d), ci->dx_max_sort_old))
-      error(
-          "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d "
-          "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e "
-          "ci->dx_max_sort_old=%e",
-          ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort,
-          ci->dx_max_sort_old);
-  }
-  for (int pjd = 0; pjd < cj->count; pjd++) {
-    const struct part *p = &cj->parts[sort_j[pjd].i];
-    const float d = p->x[0] * runner_shift[sid][0] +
-                    p->x[1] * runner_shift[sid][1] +
-                    p->x[2] * runner_shift[sid][2];
-    if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
-        1.0e-4 * max(fabsf(d), cj->dx_max_sort_old))
-      error(
-          "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d "
-          "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e "
-          "cj->dx_max_sort_old=%e",
-          cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort,
-          cj->dx_max_sort_old);
-  }
-
   /* Some constants used to checks that the parts are in the right frame */
   const float shift_threshold_x =
       2. * ci->width[0] + 2. * max(ci->dx_max_part, cj->dx_max_part);
@@ -816,7 +788,6 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
       2. * ci->width[1] + 2. * max(ci->dx_max_part, cj->dx_max_part);
   const float shift_threshold_z =
       2. * ci->width[2] + 2. * max(ci->dx_max_part, cj->dx_max_part);
-
 #endif /* SWIFT_DEBUG_CHECKS */
 
   /* Get some other useful values. */
@@ -1025,6 +996,43 @@ void DOPAIR1_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) {
       cj->dx_max_sort_old > space_maxreldx * cj->dmin)
     error("Interacting unsorted cells.");
 
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Pick-out the sorted lists. */
+  const struct entry *restrict sort_i = ci->sort[sid];
+  const struct entry *restrict sort_j = cj->sort[sid];
+
+  /* Check that the dx_max_sort values in the cell are indeed an upper
+     bound on particle movement. */
+  for (int pid = 0; pid < ci->count; pid++) {
+    const struct part *p = &ci->parts[sort_i[pid].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
+        1.0e-4 * max(fabsf(d), ci->dx_max_sort_old))
+      error(
+          "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d "
+          "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e "
+          "ci->dx_max_sort_old=%e",
+          ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort,
+          ci->dx_max_sort_old);
+  }
+  for (int pjd = 0; pjd < cj->count; pjd++) {
+    const struct part *p = &cj->parts[sort_j[pjd].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
+        1.0e-4 * max(fabsf(d), cj->dx_max_sort_old))
+      error(
+          "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d "
+          "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e "
+          "cj->dx_max_sort_old=%e",
+          cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort,
+          cj->dx_max_sort_old);
+  }
+#endif /* SWIFT_DEBUG_CHECKS */
+
 #if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) && \
     (DOPAIR1_BRANCH == runner_dopair1_density_branch)
   if (!sort_is_corner(sid))
@@ -1042,8 +1050,11 @@ void DOPAIR1_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) {
  * @param r The #runner.
  * @param ci The first #cell.
  * @param cj The second #cell.
+ * @param sid The direction of the pair
+ * @param shift The shift vector to apply to the particles in ci.
  */
-void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
+void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
+             const double *shift) {
 
   struct engine *restrict e = r->e;
 
@@ -1054,24 +1065,6 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
 
   TIMER_TIC;
 
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
-
-  if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e))
-    error("Interacting undrifted cells.");
-
-  /* Get the shift ID. */
-  double shift[3] = {0.0, 0.0, 0.0};
-  const int sid = space_getsid(e->s, &ci, &cj, shift);
-
-  /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) ||
-      ci->dx_max_sort_old > space_maxreldx * ci->dmin)
-    error("Interacting unsorted cells.");
-  if (!(cj->sorted & (1 << sid)) ||
-      cj->dx_max_sort_old > space_maxreldx * cj->dmin)
-    error("Interacting unsorted cells.");
-
   /* Get the cutoff shift. */
   double rshift = 0.0;
   for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
@@ -1081,37 +1074,6 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   struct entry *restrict sort_j = cj->sort[sid];
 
 #ifdef SWIFT_DEBUG_CHECKS
-  /* Check that the dx_max_sort values in the cell are indeed an upper
-     bound on particle movement. */
-  for (int pid = 0; pid < ci->count; pid++) {
-    const struct part *p = &ci->parts[sort_i[pid].i];
-    const float d = p->x[0] * runner_shift[sid][0] +
-                    p->x[1] * runner_shift[sid][1] +
-                    p->x[2] * runner_shift[sid][2];
-    if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
-        1.0e-4 * max(fabsf(d), ci->dx_max_sort_old))
-      error(
-          "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d "
-          "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e "
-          "ci->dx_max_sort_old=%e",
-          ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort,
-          ci->dx_max_sort_old);
-  }
-  for (int pjd = 0; pjd < cj->count; pjd++) {
-    const struct part *p = &cj->parts[sort_j[pjd].i];
-    const float d = p->x[0] * runner_shift[sid][0] +
-                    p->x[1] * runner_shift[sid][1] +
-                    p->x[2] * runner_shift[sid][2];
-    if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
-        1.0e-4 * max(fabsf(d), cj->dx_max_sort_old))
-      error(
-          "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d "
-          "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e "
-          "cj->dx_max_sort_old=%e",
-          cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort,
-          cj->dx_max_sort_old);
-  }
-
   /* Some constants used to checks that the parts are in the right frame */
   const float shift_threshold_x =
       2. * ci->width[0] + 2. * max(ci->dx_max_part, cj->dx_max_part);
@@ -1119,7 +1081,6 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
       2. * ci->width[1] + 2. * max(ci->dx_max_part, cj->dx_max_part);
   const float shift_threshold_z =
       2. * ci->width[2] + 2. * max(ci->dx_max_part, cj->dx_max_part);
-
 #endif /* SWIFT_DEBUG_CHECKS */
 
   /* Get some other useful values. */
@@ -1490,6 +1451,86 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
   TIMER_TOC(TIMER_DOPAIR);
 }
 
+/**
+ * @brief Determine which version of DOPAIR2 needs to be called depending on the
+ * orientation of the cells or whether DOPAIR2 needs to be called at all.
+ *
+ * @param r #runner
+ * @param ci #cell ci
+ * @param cj #cell cj
+ *
+ */
+void DOPAIR2_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) {
+
+  const struct engine *restrict e = r->e;
+
+  /* Anything to do here? */
+  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
+
+  /* Check that cells are drifted. */
+  if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e))
+    error("Interacting undrifted cells.");
+
+  /* Get the sort ID. */
+  double shift[3] = {0.0, 0.0, 0.0};
+  const int sid = space_getsid(e->s, &ci, &cj, shift);
+
+  /* Have the cells been sorted? */
+  if (!(ci->sorted & (1 << sid)) ||
+      ci->dx_max_sort_old > space_maxreldx * ci->dmin)
+    error("Interacting unsorted cells.");
+  if (!(cj->sorted & (1 << sid)) ||
+      cj->dx_max_sort_old > space_maxreldx * cj->dmin)
+    error("Interacting unsorted cells.");
+
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Pick-out the sorted lists. */
+  const struct entry *restrict sort_i = ci->sort[sid];
+  const struct entry *restrict sort_j = cj->sort[sid];
+
+  /* Check that the dx_max_sort values in the cell are indeed an upper
+     bound on particle movement. */
+  for (int pid = 0; pid < ci->count; pid++) {
+    const struct part *p = &ci->parts[sort_i[pid].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
+        1.0e-4 * max(fabsf(d), ci->dx_max_sort_old))
+      error(
+          "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d "
+          "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e "
+          "ci->dx_max_sort_old=%e",
+          ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort,
+          ci->dx_max_sort_old);
+  }
+  for (int pjd = 0; pjd < cj->count; pjd++) {
+    const struct part *p = &cj->parts[sort_j[pjd].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
+        1.0e-4 * max(fabsf(d), cj->dx_max_sort_old))
+      error(
+          "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d "
+          "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e "
+          "cj->dx_max_sort_old=%e",
+          cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort,
+          cj->dx_max_sort_old);
+  }
+#endif /* SWIFT_DEBUG_CHECKS */
+
+#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) && \
+    (DOPAIR2_BRANCH == runner_dopair2_force_branch)
+  if (!sort_is_corner(sid))
+    runner_dopair2_force_vec(r, ci, cj, sid, shift);
+  else
+    DOPAIR2(r, ci, cj, sid, shift);
+#else
+  DOPAIR2(r, ci, cj, sid, shift);
+#endif
+}
+
 /**
  * @brief Compute the cell self-interaction (non-symmetric).
  *
@@ -2295,7 +2336,7 @@ void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid,
       error("Interacting unsorted cells.");
 
     /* Compute the interactions. */
-    DOPAIR2(r, ci, cj);
+    DOPAIR2_BRANCH(r, ci, cj);
   }
 
   if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index 5e982830c609bcf2ce533c0423449a3e74e96bf4..7e34939ac4e6741dc9fd970f774bba01ca812593 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -35,21 +35,23 @@ static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);
  * @param int_cache (return) secondary #cache of interactions between two
  * particles.
  * @param icount Interaction count.
- * @param rhoSum (return) #vector holding the cumulative sum of the density
+ * @param v_rhoSum (return) #vector holding the cumulative sum of the density
  * update on pi.
- * @param rho_dhSum (return) #vector holding the cumulative sum of the density
+ * @param v_rho_dhSum (return) #vector holding the cumulative sum of the density
  * gradient update on pi.
- * @param wcountSum (return) #vector holding the cumulative sum of the wcount
+ * @param v_wcountSum (return) #vector holding the cumulative sum of the wcount
  * update on pi.
- * @param wcount_dhSum (return) #vector holding the cumulative sum of the wcount
+ * @param v_wcount_dhSum (return) #vector holding the cumulative sum of the
+ * wcount
  * gradient update on pi.
- * @param div_vSum (return) #vector holding the cumulative sum of the divergence
+ * @param v_div_vSum (return) #vector holding the cumulative sum of the
+ * divergence
  * update on pi.
- * @param curlvxSum (return) #vector holding the cumulative sum of the curl of
+ * @param v_curlvxSum (return) #vector holding the cumulative sum of the curl of
  * vx update on pi.
- * @param curlvySum (return) #vector holding the cumulative sum of the curl of
+ * @param v_curlvySum (return) #vector holding the cumulative sum of the curl of
  * vy update on pi.
- * @param curlvzSum (return) #vector holding the cumulative sum of the curl of
+ * @param v_curlvzSum (return) #vector holding the cumulative sum of the curl of
  * vz update on pi.
  * @param v_hi_inv #vector of 1/h for pi.
  * @param v_vix #vector of x velocity of pi.
@@ -59,11 +61,11 @@ static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);
  * interactions have been performed, should be a multiple of the vector length.
  */
 __attribute__((always_inline)) INLINE static void calcRemInteractions(
-    struct c2_cache *const int_cache, const int icount, vector *rhoSum,
-    vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
-    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
-    vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz,
-    int *icount_align) {
+    struct c2_cache *const int_cache, const int icount, vector *v_rhoSum,
+    vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum,
+    vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum,
+    vector *v_curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy,
+    vector v_viz, int *icount_align) {
 
   mask_t int_mask, int_mask2;
 
@@ -107,9 +109,9 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
         &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align],
         v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align],
         &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align],
-        &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum,
-        wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask,
-        int_mask2, 1);
+        &int_cache->mq[*icount_align], v_rhoSum, v_rho_dhSum, v_wcountSum,
+        v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, v_curlvzSum,
+        int_mask, int_mask2, 1);
   }
 }
 
@@ -127,20 +129,25 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
  * @param int_cache (return) secondary #cache of interactions between two
  * particles.
  * @param icount Interaction count.
- * @param rhoSum #vector holding the cumulative sum of the density update on pi.
- * @param rho_dhSum #vector holding the cumulative sum of the density gradient
+ * @param v_rhoSum #vector holding the cumulative sum of the density update on
+ * pi.
+ * @param v_rho_dhSum #vector holding the cumulative sum of the density gradient
  * update on pi.
- * @param wcountSum #vector holding the cumulative sum of the wcount update on
+ * @param v_wcountSum #vector holding the cumulative sum of the wcount update on
  * pi.
- * @param wcount_dhSum #vector holding the cumulative sum of the wcount gradient
+ * @param v_wcount_dhSum #vector holding the cumulative sum of the wcount
+ * gradient
  * update on pi.
- * @param div_vSum #vector holding the cumulative sum of the divergence update
+ * @param v_div_vSum #vector holding the cumulative sum of the divergence update
  * on pi.
- * @param curlvxSum #vector holding the cumulative sum of the curl of vx update
+ * @param v_curlvxSum #vector holding the cumulative sum of the curl of vx
+ * update
  * on pi.
- * @param curlvySum #vector holding the cumulative sum of the curl of vy update
+ * @param v_curlvySum #vector holding the cumulative sum of the curl of vy
+ * update
  * on pi.
- * @param curlvzSum #vector holding the cumulative sum of the curl of vz update
+ * @param v_curlvzSum #vector holding the cumulative sum of the curl of vz
+ * update
  * on pi.
  * @param v_hi_inv #vector of 1/h for pi.
  * @param v_vix #vector of x velocity of pi.
@@ -150,10 +157,11 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
 __attribute__((always_inline)) INLINE static void storeInteractions(
     const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
     vector *v_dz, const struct cache *const cell_cache,
-    struct c2_cache *const int_cache, int *icount, vector *rhoSum,
-    vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
-    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
-    vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) {
+    struct c2_cache *const int_cache, int *icount, vector *v_rhoSum,
+    vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum,
+    vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum,
+    vector *v_curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy,
+    vector v_viz) {
 
 /* Left-pack values needed into the secondary cache using the interaction mask.
  */
@@ -202,9 +210,10 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
     int icount_align = *icount;
 
     /* Peform remainder interactions. */
-    calcRemInteractions(int_cache, *icount, rhoSum, rho_dhSum, wcountSum,
-                        wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum,
-                        v_hi_inv, v_vix, v_viy, v_viz, &icount_align);
+    calcRemInteractions(int_cache, *icount, v_rhoSum, v_rho_dhSum, v_wcountSum,
+                        v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum,
+                        v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
+                        &icount_align);
 
     mask_t int_mask, int_mask2;
     vec_init_mask_true(int_mask);
@@ -215,9 +224,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
       runner_iact_nonsym_2_vec_density(
           &int_cache->r2q[j], &int_cache->dxq[j], &int_cache->dyq[j],
           &int_cache->dzq[j], v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[j],
-          &int_cache->vyq[j], &int_cache->vzq[j], &int_cache->mq[j], rhoSum,
-          rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum,
-          curlvzSum, int_mask, int_mask2, 0);
+          &int_cache->vyq[j], &int_cache->vzq[j], &int_cache->mq[j], v_rhoSum,
+          v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum,
+          v_curlvySum, v_curlvzSum, int_mask, int_mask2, 0);
     }
 
     /* Reset interaction count. */
@@ -288,8 +297,9 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
     const float first_di =
         sort_i[first_pi].d + pi->h * kernel_gamma + dx_max - rshift;
 
-    /* Loop through particles in cell j until they are not in range of pi. */
-    while (temp < cj->count && first_di > sort_j[temp].d) temp++;
+    /* Loop through particles in cell j until they are not in range of pi.
+     * Make sure that temp stays between 0 and cj->count - 1.*/
+    while (temp < cj->count - 1 && first_di > sort_j[temp].d) temp++;
 
     max_index_i[first_pi] = temp;
 
@@ -300,7 +310,8 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
 
       const float di = sort_i[i].d + pi->h * kernel_gamma + dx_max - rshift;
 
-      while (temp < cj->count && di > sort_j[temp].d) temp++;
+      /* Make sure that temp stays between 0 and cj->count - 1.*/
+      while (temp < cj->count - 1 && di > sort_j[temp].d) temp++;
 
       max_index_i[i] = temp;
     }
@@ -357,6 +368,153 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
   *init_pi = first_pi;
   *init_pj = last_pj;
 }
+
+/**
+ * @brief Populates the arrays max_index_i and max_index_j with the maximum
+ * indices of
+ * particles into their neighbouring cells. Also finds the first pi that
+ * interacts with any particle in cj and the last pj that interacts with any
+ * particle in ci.
+ *
+ * @param ci #cell pointer to ci
+ * @param cj #cell pointer to cj
+ * @param sort_i #entry array for particle distance in ci
+ * @param sort_j #entry array for particle distance in cj
+ * @param dx_max maximum particle movement allowed in cell
+ * @param rshift cutoff shift
+ * @param hi_max_raw Maximal smoothing length in cell ci
+ * @param hj_max_raw Maximal smoothing length in cell cj
+ * @param hi_max Maximal smoothing length in cell ci scaled by kernel_gamma
+ * @param hj_max Maximal smoothing length in cell cj scaled by kernel_gamma
+ * @param di_max Maximal position on the axis that can interact in cell ci
+ * @param dj_min Minimal position on the axis that can interact in cell ci
+ * @param max_index_i array to hold the maximum distances of pi particles into
+ * #cell cj
+ * @param max_index_j array to hold the maximum distances of pj particles into
+ * #cell cj
+ * @param init_pi first pi to interact with a pj particle
+ * @param init_pj last pj to interact with a pi particle
+ * @param max_active_bin The largest time-bin active during this step.
+ */
+__attribute__((always_inline)) INLINE static void
+populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj,
+                                  const struct entry *restrict sort_i,
+                                  const struct entry *restrict sort_j,
+                                  const float dx_max, const float rshift,
+                                  const double hi_max_raw,
+                                  const double hj_max_raw, const double hi_max,
+                                  const double hj_max, const double di_max,
+                                  const double dj_min, int *max_index_i,
+                                  int *max_index_j, int *init_pi, int *init_pj,
+                                  const timebin_t max_active_bin) {
+
+  const struct part *restrict parts_i = ci->parts;
+  const struct part *restrict parts_j = cj->parts;
+
+  int first_pi = 0, last_pj = cj->count - 1;
+  int temp;
+
+  /* Find the leftmost active particle in cell i that interacts with any
+   * particle in cell j. */
+  first_pi = ci->count;
+  int active_id = first_pi - 1;
+  while (first_pi > 0 &&
+         sort_i[first_pi - 1].d + dx_max + max(hi_max, hj_max) > dj_min) {
+    first_pi--;
+    /* Store the index of the particle if it is active. */
+    if (part_is_active_no_debug(&parts_i[sort_i[first_pi].i], max_active_bin))
+      active_id = first_pi;
+  }
+
+  /* Set the first active pi in range of any particle in cell j. */
+  first_pi = active_id;
+
+  /* Find the maximum index into cell j for each particle in range in cell i. */
+  if (first_pi < ci->count) {
+
+    /* Start from the first particle in cell j. */
+    temp = 0;
+
+    const struct part *pi = &parts_i[sort_i[first_pi].i];
+    const float first_di = sort_i[first_pi].d +
+                           max(pi->h, hj_max_raw) * kernel_gamma + dx_max -
+                           rshift;
+
+    /* Loop through particles in cell j until they are not in range of pi.
+     * Make sure that temp stays between 0 and cj->count - 1.*/
+    while (temp < cj->count - 1 && first_di > sort_j[temp].d) temp++;
+
+    max_index_i[first_pi] = temp;
+
+    /* Populate max_index_i for remaining particles that are within range. */
+    for (int i = first_pi + 1; i < ci->count; i++) {
+      temp = max_index_i[i - 1];
+      pi = &parts_i[sort_i[i].i];
+
+      const float di =
+          sort_i[i].d + max(pi->h, hj_max_raw) * kernel_gamma + dx_max - rshift;
+
+      /* Make sure that temp stays between 0 and cj->count - 1.*/
+      while (temp < cj->count - 1 && di > sort_j[temp].d) temp++;
+
+      max_index_i[i] = temp;
+    }
+  } else {
+    /* Make sure that max index is set to first particle in cj.*/
+    max_index_i[ci->count - 1] = 0;
+  }
+
+  /* Find the rightmost active particle in cell j that interacts with any
+   * particle in cell i. */
+  last_pj = -1;
+  active_id = last_pj;
+  while (last_pj < cj->count &&
+         sort_j[last_pj + 1].d - max(hj_max, hi_max) - dx_max < di_max) {
+    last_pj++;
+    /* Store the index of the particle if it is active. */
+    if (part_is_active_no_debug(&parts_j[sort_j[last_pj].i], max_active_bin))
+      active_id = last_pj;
+  }
+
+  /* Set the last active pj in range of any particle in cell i. */
+  last_pj = active_id;
+
+  /* Find the maximum index into cell i for each particle in range in cell j. */
+  if (last_pj >= 0) {
+
+    /* Start from the last particle in cell i. */
+    temp = ci->count - 1;
+
+    const struct part *pj = &parts_j[sort_j[last_pj].i];
+    const float last_dj = sort_j[last_pj].d - dx_max -
+                          max(pj->h, hi_max_raw) * kernel_gamma + rshift;
+
+    /* Loop through particles in cell i until they are not in range of pj. */
+    while (temp > 0 && last_dj < sort_i[temp].d) temp--;
+
+    max_index_j[last_pj] = temp;
+
+    /* Populate max_index_j for remaining particles that are within range. */
+    for (int i = last_pj - 1; i >= 0; i--) {
+      temp = max_index_j[i + 1];
+      pj = &parts_j[sort_j[i].i];
+
+      const float dj = sort_j[i].d - dx_max -
+                       (max(pj->h, hi_max_raw) * kernel_gamma) + rshift;
+
+      while (temp > 0 && dj < sort_i[temp].d) temp--;
+
+      max_index_j[i] = temp;
+    }
+  } else {
+    /* Make sure that max index is set to last particle in ci.*/
+    max_index_j[0] = ci->count - 1;
+  }
+
+  *init_pi = first_pi;
+  *init_pj = last_pj;
+}
+
 #endif /* WITH_VECTORIZATION */
 
 /**
@@ -370,20 +528,17 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
     struct runner *r, struct cell *restrict c) {
 
 #ifdef WITH_VECTORIZATION
-  const struct engine *e = r->e;
-  struct part *restrict pi;
-  int count_align;
-  int num_vec_proc = NUM_VEC_PROC;
+  const int num_vec_proc = NUM_VEC_PROC;
 
+  /* Get some local variables */
+  const struct engine *e = r->e;
+  const timebin_t max_active_bin = e->max_active_bin;
   struct part *restrict parts = c->parts;
   const int count = c->count;
 
-  const timebin_t max_active_bin = e->max_active_bin;
-
-  vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;
-
-  TIMER_TIC
+  TIMER_TIC;
 
+  /* Anything to do here? */
   if (!cell_is_active(c, e)) return;
 
   if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell.");
@@ -407,88 +562,78 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
   for (int pid = 0; pid < count; pid++) {
 
     /* Get a pointer to the ith particle. */
-    pi = &parts[pid];
+    struct part *restrict pi = &parts[pid];
 
     /* Is the ith particle active? */
     if (!part_is_active_no_debug(pi, max_active_bin)) continue;
 
-    vector pix, piy, piz;
+    vector v_r2;
 
     const float hi = cell_cache->h[pid];
 
     /* Fill particle pi vectors. */
-    pix.v = vec_set1(cell_cache->x[pid]);
-    piy.v = vec_set1(cell_cache->y[pid]);
-    piz.v = vec_set1(cell_cache->z[pid]);
-    v_hi.v = vec_set1(hi);
-    v_vix.v = vec_set1(cell_cache->vx[pid]);
-    v_viy.v = vec_set1(cell_cache->vy[pid]);
-    v_viz.v = vec_set1(cell_cache->vz[pid]);
+    const vector v_pix = vector_set1(cell_cache->x[pid]);
+    const vector v_piy = vector_set1(cell_cache->y[pid]);
+    const vector v_piz = vector_set1(cell_cache->z[pid]);
+    const vector v_hi = vector_set1(hi);
+    const vector v_vix = vector_set1(cell_cache->vx[pid]);
+    const vector v_viy = vector_set1(cell_cache->vy[pid]);
+    const vector v_viz = vector_set1(cell_cache->vz[pid]);
 
     const float hig2 = hi * hi * kernel_gamma2;
-    v_hig2.v = vec_set1(hig2);
-
-    /* Reset cumulative sums of update vectors. */
-    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-        curlvySum, curlvzSum;
+    const vector v_hig2 = vector_set1(hig2);
 
     /* Get the inverse of hi. */
-    vector v_hi_inv;
-
-    v_hi_inv = vec_reciprocal(v_hi);
+    vector v_hi_inv = vec_reciprocal(v_hi);
 
-    rhoSum.v = vec_setzero();
-    rho_dhSum.v = vec_setzero();
-    wcountSum.v = vec_setzero();
-    wcount_dhSum.v = vec_setzero();
-    div_vSum.v = vec_setzero();
-    curlvxSum.v = vec_setzero();
-    curlvySum.v = vec_setzero();
-    curlvzSum.v = vec_setzero();
+    /* Reset cumulative sums of update vectors. */
+    vector v_rhoSum = vector_setzero();
+    vector v_rho_dhSum = vector_setzero();
+    vector v_wcountSum = vector_setzero();
+    vector v_wcount_dhSum = vector_setzero();
+    vector v_div_vSum = vector_setzero();
+    vector v_curlvxSum = vector_setzero();
+    vector v_curlvySum = vector_setzero();
+    vector v_curlvzSum = vector_setzero();
 
     /* Pad cache if there is a serial remainder. */
-    count_align = count;
-    int rem = count % (num_vec_proc * VEC_SIZE);
+    int count_align = count;
+    const int rem = count % (num_vec_proc * VEC_SIZE);
     if (rem != 0) {
-      int pad = (num_vec_proc * VEC_SIZE) - rem;
-
-      count_align += pad;
+      count_align += (num_vec_proc * VEC_SIZE) - rem;
 
       /* Set positions to the same as particle pi so when the r2 > 0 mask is
        * applied these extra contributions are masked out.*/
       for (int i = count; i < count_align; i++) {
-        cell_cache->x[i] = pix.f[0];
-        cell_cache->y[i] = piy.f[0];
-        cell_cache->z[i] = piz.f[0];
+        cell_cache->x[i] = v_pix.f[0];
+        cell_cache->y[i] = v_piy.f[0];
+        cell_cache->z[i] = v_piz.f[0];
       }
     }
 
-    vector pjx, pjy, pjz;
-    vector pjx2, pjy2, pjz2;
-
     /* Find all of particle pi's interacions and store needed values in the
      * secondary cache.*/
     for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
 
       /* Load 2 sets of vectors from the particle cache. */
-      pjx.v = vec_load(&cell_cache->x[pjd]);
-      pjy.v = vec_load(&cell_cache->y[pjd]);
-      pjz.v = vec_load(&cell_cache->z[pjd]);
+      const vector v_pjx = vector_load(&cell_cache->x[pjd]);
+      const vector v_pjy = vector_load(&cell_cache->y[pjd]);
+      const vector v_pjz = vector_load(&cell_cache->z[pjd]);
 
-      pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
-      pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
-      pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
+      const vector v_pjx2 = vector_load(&cell_cache->x[pjd + VEC_SIZE]);
+      const vector v_pjy2 = vector_load(&cell_cache->y[pjd + VEC_SIZE]);
+      const vector v_pjz2 = vector_load(&cell_cache->z[pjd + VEC_SIZE]);
 
       /* Compute the pairwise distance. */
       vector v_dx, v_dy, v_dz;
       vector v_dx_2, v_dy_2, v_dz_2, v_r2_2;
 
-      v_dx.v = vec_sub(pix.v, pjx.v);
-      v_dx_2.v = vec_sub(pix.v, pjx2.v);
-      v_dy.v = vec_sub(piy.v, pjy.v);
-      v_dy_2.v = vec_sub(piy.v, pjy2.v);
-      v_dz.v = vec_sub(piz.v, pjz.v);
-      v_dz_2.v = vec_sub(piz.v, pjz2.v);
+      v_dx.v = vec_sub(v_pix.v, v_pjx.v);
+      v_dx_2.v = vec_sub(v_pix.v, v_pjx2.v);
+      v_dy.v = vec_sub(v_piy.v, v_pjy.v);
+      v_dy_2.v = vec_sub(v_piy.v, v_pjy2.v);
+      v_dz.v = vec_sub(v_piz.v, v_pjz.v);
+      v_dz_2.v = vec_sub(v_piz.v, v_pjz2.v);
 
       v_r2.v = vec_mul(v_dx.v, v_dx.v);
       v_r2_2.v = vec_mul(v_dx_2.v, v_dx_2.v);
@@ -500,7 +645,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       /* Form a mask from r2 < hig2 and r2 > 0.*/
       mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2,
           v_doi_mask2_self_check;
-      int doi_mask, doi_mask_self_check, doi_mask2, doi_mask2_self_check;
 
       /* Form r2 > 0 mask and r2 < hig2 mask. */
       vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero()));
@@ -511,39 +655,35 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
                       vec_cmp_gt(v_r2_2.v, vec_setzero()));
       vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v));
 
-      /* Form integer masks. */
-      doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check);
-      doi_mask = vec_form_int_mask(v_doi_mask);
-
-      doi_mask2_self_check = vec_form_int_mask(v_doi_mask2_self_check);
-      doi_mask2 = vec_form_int_mask(v_doi_mask2);
-
-      /* Combine the two masks. */
-      doi_mask = doi_mask & doi_mask_self_check;
-      doi_mask2 = doi_mask2 & doi_mask2_self_check;
+      /* Combine two masks and form integer masks. */
+      const int doi_mask = vec_is_mask_true(v_doi_mask) &
+                           vec_is_mask_true(v_doi_mask_self_check);
+      const int doi_mask2 = vec_is_mask_true(v_doi_mask2) &
+                            vec_is_mask_true(v_doi_mask2_self_check);
 
       /* If there are any interactions left pack interaction values into c2
        * cache. */
       if (doi_mask) {
         storeInteractions(doi_mask, pjd, &v_r2, &v_dx, &v_dy, &v_dz, cell_cache,
-                          &int_cache, &icount, &rhoSum, &rho_dhSum, &wcountSum,
-                          &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
-                          &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
+                          &int_cache, &icount, &v_rhoSum, &v_rho_dhSum,
+                          &v_wcountSum, &v_wcount_dhSum, &v_div_vSum,
+                          &v_curlvxSum, &v_curlvySum, &v_curlvzSum, v_hi_inv,
+                          v_vix, v_viy, v_viz);
       }
       if (doi_mask2) {
         storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_2, &v_dy_2,
-                          &v_dz_2, cell_cache, &int_cache, &icount, &rhoSum,
-                          &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
-                          &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix,
-                          v_viy, v_viz);
+                          &v_dz_2, cell_cache, &int_cache, &icount, &v_rhoSum,
+                          &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum,
+                          &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum,
+                          v_hi_inv, v_vix, v_viy, v_viz);
       }
     }
 
     /* Perform padded vector remainder interactions if any are present. */
-    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum,
-                        &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
-                        &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
-                        &icount_align);
+    calcRemInteractions(&int_cache, icount, &v_rhoSum, &v_rho_dhSum,
+                        &v_wcountSum, &v_wcount_dhSum, &v_div_vSum,
+                        &v_curlvxSum, &v_curlvySum, &v_curlvzSum, v_hi_inv,
+                        v_vix, v_viy, v_viz, &icount_align);
 
     /* Initialise masks to true in case remainder interactions have been
      * performed. */
@@ -557,21 +697,21 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
           &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
           &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
           &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
-          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
-          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
-          0);
+          &int_cache.mq[pjd], &v_rhoSum, &v_rho_dhSum, &v_wcountSum,
+          &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, &v_curlvySum,
+          &v_curlvzSum, int_mask, int_mask2, 0);
     }
 
     /* Perform horizontal adds on vector sums and store result in particle pi.
      */
-    VEC_HADD(rhoSum, pi->rho);
-    VEC_HADD(rho_dhSum, pi->density.rho_dh);
-    VEC_HADD(wcountSum, pi->density.wcount);
-    VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
-    VEC_HADD(div_vSum, pi->density.div_v);
-    VEC_HADD(curlvxSum, pi->density.rot_v[0]);
-    VEC_HADD(curlvySum, pi->density.rot_v[1]);
-    VEC_HADD(curlvzSum, pi->density.rot_v[2]);
+    VEC_HADD(v_rhoSum, pi->rho);
+    VEC_HADD(v_rho_dhSum, pi->density.rho_dh);
+    VEC_HADD(v_wcountSum, pi->density.wcount);
+    VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh);
+    VEC_HADD(v_div_vSum, pi->density.div_v);
+    VEC_HADD(v_curlvxSum, pi->density.rot_v[0]);
+    VEC_HADD(v_curlvySum, pi->density.rot_v[1]);
+    VEC_HADD(v_curlvzSum, pi->density.rot_v[2]);
 
     /* Reset interaction count. */
     icount = 0;
@@ -602,9 +742,6 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
   struct part *restrict parts = c->parts;
   const int count = c->count;
 
-  vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;
-  vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci;
-
   TIMER_TIC;
 
   if (!cell_is_active(c, e)) return;
@@ -640,43 +777,36 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
     /* Is the ith particle active? */
     if (!part_is_active_no_debug(pi, max_active_bin)) continue;
 
-    vector pix, piy, piz;
-
     const float hi = cell_cache->h[pid];
 
     /* Fill particle pi vectors. */
-    pix.v = vec_set1(cell_cache->x[pid]);
-    piy.v = vec_set1(cell_cache->y[pid]);
-    piz.v = vec_set1(cell_cache->z[pid]);
-    v_hi.v = vec_set1(hi);
-    v_vix.v = vec_set1(cell_cache->vx[pid]);
-    v_viy.v = vec_set1(cell_cache->vy[pid]);
-    v_viz.v = vec_set1(cell_cache->vz[pid]);
-
-    v_rhoi.v = vec_set1(cell_cache->rho[pid]);
-    v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]);
-    v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]);
-    v_balsara_i.v = vec_set1(cell_cache->balsara[pid]);
-    v_ci.v = vec_set1(cell_cache->soundspeed[pid]);
+    const vector v_pix = vector_set1(cell_cache->x[pid]);
+    const vector v_piy = vector_set1(cell_cache->y[pid]);
+    const vector v_piz = vector_set1(cell_cache->z[pid]);
+    const vector v_hi = vector_set1(hi);
+    const vector v_vix = vector_set1(cell_cache->vx[pid]);
+    const vector v_viy = vector_set1(cell_cache->vy[pid]);
+    const vector v_viz = vector_set1(cell_cache->vz[pid]);
+
+    const vector v_rhoi = vector_set1(cell_cache->rho[pid]);
+    const vector v_grad_hi = vector_set1(cell_cache->grad_h[pid]);
+    const vector v_pOrhoi2 = vector_set1(cell_cache->pOrho2[pid]);
+    const vector v_balsara_i = vector_set1(cell_cache->balsara[pid]);
+    const vector v_ci = vector_set1(cell_cache->soundspeed[pid]);
 
     const float hig2 = hi * hi * kernel_gamma2;
-    v_hig2.v = vec_set1(hig2);
-
-    /* Reset cumulative sums of update vectors. */
-    vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum,
-        entropy_dtSum;
+    const vector v_hig2 = vector_set1(hig2);
 
     /* Get the inverse of hi. */
-    vector v_hi_inv;
-
-    v_hi_inv = vec_reciprocal(v_hi);
+    vector v_hi_inv = vec_reciprocal(v_hi);
 
-    a_hydro_xSum.v = vec_setzero();
-    a_hydro_ySum.v = vec_setzero();
-    a_hydro_zSum.v = vec_setzero();
-    h_dtSum.v = vec_setzero();
-    v_sigSum.v = vec_set1(pi->force.v_sig);
-    entropy_dtSum.v = vec_setzero();
+    /* Reset cumulative sums of update vectors. */
+    vector v_a_hydro_xSum = vector_setzero();
+    vector v_a_hydro_ySum = vector_setzero();
+    vector v_a_hydro_zSum = vector_setzero();
+    vector v_h_dtSum = vector_setzero();
+    vector v_sigSum = vector_set1(pi->force.v_sig);
+    vector v_entropy_dtSum = vector_setzero();
 
     /* Pad cache if there is a serial remainder. */
     count_align = count;
@@ -689,9 +819,9 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
       /* Set positions to the same as particle pi so when the r2 > 0 mask is
        * applied these extra contributions are masked out.*/
       for (int i = count; i < count_align; i++) {
-        cell_cache->x[i] = pix.f[0];
-        cell_cache->y[i] = piy.f[0];
-        cell_cache->z[i] = piz.f[0];
+        cell_cache->x[i] = v_pix.f[0];
+        cell_cache->y[i] = v_piy.f[0];
+        cell_cache->z[i] = v_piz.f[0];
         cell_cache->h[i] = 1.f;
         cell_cache->rho[i] = 1.f;
         cell_cache->grad_h[i] = 1.f;
@@ -701,25 +831,23 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
       }
     }
 
-    vector pjx, pjy, pjz, hj, hjg2;
-
     /* Find all of particle pi's interacions and store needed values in the
      * secondary cache.*/
     for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
 
       /* Load 1 set of vectors from the particle cache. */
-      pjx.v = vec_load(&cell_cache->x[pjd]);
-      pjy.v = vec_load(&cell_cache->y[pjd]);
-      pjz.v = vec_load(&cell_cache->z[pjd]);
-      hj.v = vec_load(&cell_cache->h[pjd]);
+      vector hjg2;
+      const vector v_pjx = vector_load(&cell_cache->x[pjd]);
+      const vector v_pjy = vector_load(&cell_cache->y[pjd]);
+      const vector v_pjz = vector_load(&cell_cache->z[pjd]);
+      const vector hj = vector_load(&cell_cache->h[pjd]);
       hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v);
 
       /* Compute the pairwise distance. */
-      vector v_dx, v_dy, v_dz;
-
-      v_dx.v = vec_sub(pix.v, pjx.v);
-      v_dy.v = vec_sub(piy.v, pjy.v);
-      v_dz.v = vec_sub(piz.v, pjz.v);
+      vector v_dx, v_dy, v_dz, v_r2;
+      v_dx.v = vec_sub(v_pix.v, v_pjx.v);
+      v_dy.v = vec_sub(v_piy.v, v_pjy.v);
+      v_dz.v = vec_sub(v_piz.v, v_pjz.v);
 
       v_r2.v = vec_mul(v_dx.v, v_dx.v);
       v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
@@ -727,7 +855,6 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
 
       /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */
       mask_t v_doi_mask, v_doi_mask_self_check;
-      int doi_mask;
 
       /* Form r2 > 0 mask.*/
       vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero()));
@@ -737,18 +864,15 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
       v_h2.v = vec_fmax(v_hig2.v, hjg2.v);
       vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v));
 
-      /* Combine all 3 masks and form integer mask. */
+      /* Combine all 3 masks. */
       vec_combine_masks(v_doi_mask, v_doi_mask_self_check);
-      doi_mask = vec_form_int_mask(v_doi_mask);
 
       /* If there are any interactions perform them. */
-      if (doi_mask) {
-        vector v_hj_inv;
-        v_hj_inv = vec_reciprocal(hj);
+      if (vec_is_mask_true(v_doi_mask)) {
+        vector v_hj_inv = vec_reciprocal(hj);
 
         /* To stop floating point exceptions for when particle separations are
-         * 0.
-        */
+         * 0. */
         v_r2.v = vec_add(v_r2.v, vec_set1(FLT_MIN));
 
         runner_iact_nonsym_1_vec_force(
@@ -757,19 +881,19 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
             &cell_cache->vy[pjd], &cell_cache->vz[pjd], &cell_cache->rho[pjd],
             &cell_cache->grad_h[pjd], &cell_cache->pOrho2[pjd],
             &cell_cache->balsara[pjd], &cell_cache->soundspeed[pjd],
-            &cell_cache->m[pjd], v_hi_inv, v_hj_inv, &a_hydro_xSum,
-            &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum,
-            v_doi_mask);
+            &cell_cache->m[pjd], v_hi_inv, v_hj_inv, &v_a_hydro_xSum,
+            &v_a_hydro_ySum, &v_a_hydro_zSum, &v_h_dtSum, &v_sigSum,
+            &v_entropy_dtSum, v_doi_mask);
       }
 
     } /* Loop over all other particles. */
 
-    VEC_HADD(a_hydro_xSum, pi->a_hydro[0]);
-    VEC_HADD(a_hydro_ySum, pi->a_hydro[1]);
-    VEC_HADD(a_hydro_zSum, pi->a_hydro[2]);
-    VEC_HADD(h_dtSum, pi->force.h_dt);
+    VEC_HADD(v_a_hydro_xSum, pi->a_hydro[0]);
+    VEC_HADD(v_a_hydro_ySum, pi->a_hydro[1]);
+    VEC_HADD(v_a_hydro_zSum, pi->a_hydro[2]);
+    VEC_HADD(v_h_dtSum, pi->force.h_dt);
     VEC_HMAX(v_sigSum, pi->force.v_sig);
-    VEC_HADD(entropy_dtSum, pi->entropy_dt);
+    VEC_HADD(v_entropy_dtSum, pi->entropy_dt);
 
   } /* loop over all particles. */
 
@@ -795,8 +919,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   const struct engine *restrict e = r->e;
   const timebin_t max_active_bin = e->max_active_bin;
 
-  vector v_hi, v_vix, v_viy, v_viz, v_hig2;
-
   TIMER_TIC;
 
   /* Get the cutoff shift. */
@@ -807,39 +929,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   const struct entry *restrict sort_i = ci->sort[sid];
   const struct entry *restrict sort_j = cj->sort[sid];
 
-#ifdef SWIFT_DEBUG_CHECKS
-  /* Check that the dx_max_sort values in the cell are indeed an upper
-     bound on particle movement. */
-  for (int pid = 0; pid < ci->count; pid++) {
-    const struct part *p = &ci->parts[sort_i[pid].i];
-    const float d = p->x[0] * runner_shift[sid][0] +
-                    p->x[1] * runner_shift[sid][1] +
-                    p->x[2] * runner_shift[sid][2];
-    if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
-        1.0e-4 * max(fabsf(d), ci->dx_max_sort_old))
-      error(
-          "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d "
-          "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e "
-          "ci->dx_max_sort_old=%e",
-          ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort,
-          ci->dx_max_sort_old);
-  }
-  for (int pjd = 0; pjd < cj->count; pjd++) {
-    const struct part *p = &cj->parts[sort_j[pjd].i];
-    const float d = p->x[0] * runner_shift[sid][0] +
-                    p->x[1] * runner_shift[sid][1] +
-                    p->x[2] * runner_shift[sid][2];
-    if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
-        1.0e-4 * max(fabsf(d), cj->dx_max_sort_old))
-      error(
-          "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d "
-          "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e "
-          "cj->dx_max_sort_old=%e",
-          cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort,
-          cj->dx_max_sort_old);
-  }
-#endif /* SWIFT_DEBUG_CHECKS */
-
   /* Get some other useful values. */
   const int count_i = ci->count;
   const int count_j = cj->count;
@@ -853,7 +942,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   const int active_ci = cell_is_active(ci, e);
   const int active_cj = cell_is_active(cj, e);
 
-  /* Check if any particles are active and return if there are not. */
+  /* Count number of particles that are in range and active*/
   int numActive = 0;
 
   if (active_ci) {
@@ -878,6 +967,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
     }
   }
 
+  /* Return if there are no active particles within range */
   if (numActive == 0) return;
 
   /* Get both particle caches from the runner and re-allocate
@@ -900,8 +990,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   max_index_j = r->cj_cache.max_index;
 
   /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[].
-  */
-  /* Also find the first pi that interacts with any particle in cj and the last
+   * Also find the first pi that interacts with any particle in cj and the last
    * pj that interacts with any particle in ci. */
   populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max,
                               hj_max, di_max, dj_min, max_index_i, max_index_j,
@@ -916,15 +1005,12 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   last_pj = max(last_pj, max_index_i[count_i - 1]);
   first_pi = min(first_pi, max_index_j[0]);
 
-  /* Read the needed particles into the two caches. */
-  int first_pi_align = first_pi;
-  int last_pj_align = last_pj;
+  /* Read the required particles into the two caches. */
   cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i,
-                                      sort_j, shift, &first_pi_align,
-                                      &last_pj_align, 1);
+                                      sort_j, shift, &first_pi, &last_pj);
 
   /* Get the number of particles read into the ci cache. */
-  int ci_cache_count = count_i - first_pi_align;
+  const int ci_cache_count = count_i - first_pi;
 
   if (active_ci) {
 
@@ -936,7 +1022,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       if (!part_is_active_no_debug(pi, max_active_bin)) continue;
 
       /* Set the cache index. */
-      int ci_cache_idx = pid - first_pi_align;
+      const int ci_cache_idx = pid - first_pi;
 
       /* Skip this particle if no particle in cj is within range of it. */
       const float hi = ci_cache->h[ci_cache_idx];
@@ -945,113 +1031,101 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       if (di_test < dj_min) continue;
 
       /* Determine the exit iteration of the interaction loop. */
-      int exit_iteration = max_index_i[pid];
-
-      const float hig2 = hi * hi * kernel_gamma2;
-
-      vector pix, piy, piz;
+      const int exit_iteration = max_index_i[pid];
 
       /* Fill particle pi vectors. */
-      pix.v = vec_set1(ci_cache->x[ci_cache_idx]);
-      piy.v = vec_set1(ci_cache->y[ci_cache_idx]);
-      piz.v = vec_set1(ci_cache->z[ci_cache_idx]);
-      v_hi.v = vec_set1(hi);
-      v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]);
-      v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]);
-      v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]);
-
-      v_hig2.v = vec_set1(hig2);
+      const vector v_pix = vector_set1(ci_cache->x[ci_cache_idx]);
+      const vector v_piy = vector_set1(ci_cache->y[ci_cache_idx]);
+      const vector v_piz = vector_set1(ci_cache->z[ci_cache_idx]);
+      const vector v_hi = vector_set1(hi);
+      const vector v_vix = vector_set1(ci_cache->vx[ci_cache_idx]);
+      const vector v_viy = vector_set1(ci_cache->vy[ci_cache_idx]);
+      const vector v_viz = vector_set1(ci_cache->vz[ci_cache_idx]);
 
-      /* Reset cumulative sums of update vectors. */
-      vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-          curlvySum, curlvzSum;
+      const float hig2 = hi * hi * kernel_gamma2;
+      const vector v_hig2 = vector_set1(hig2);
 
       /* Get the inverse of hi. */
-      vector v_hi_inv;
-
-      v_hi_inv = vec_reciprocal(v_hi);
+      vector v_hi_inv = vec_reciprocal(v_hi);
 
-      rhoSum.v = vec_setzero();
-      rho_dhSum.v = vec_setzero();
-      wcountSum.v = vec_setzero();
-      wcount_dhSum.v = vec_setzero();
-      div_vSum.v = vec_setzero();
-      curlvxSum.v = vec_setzero();
-      curlvySum.v = vec_setzero();
-      curlvzSum.v = vec_setzero();
+      /* Reset cumulative sums of update vectors. */
+      vector v_rhoSum = vector_setzero();
+      vector v_rho_dhSum = vector_setzero();
+      vector v_wcountSum = vector_setzero();
+      vector v_wcount_dhSum = vector_setzero();
+      vector v_div_vSum = vector_setzero();
+      vector v_curlvxSum = vector_setzero();
+      vector v_curlvySum = vector_setzero();
+      vector v_curlvzSum = vector_setzero();
 
       /* Pad the exit iteration if there is a serial remainder. */
       int exit_iteration_align = exit_iteration;
-      int rem = exit_iteration % VEC_SIZE;
+      const int rem = exit_iteration % VEC_SIZE;
       if (rem != 0) {
-        int pad = VEC_SIZE - rem;
+        const int pad = VEC_SIZE - rem;
 
-        if (exit_iteration_align + pad <= last_pj_align + 1)
+        if (exit_iteration_align + pad <= last_pj + 1)
           exit_iteration_align += pad;
       }
 
-      vector pjx, pjy, pjz;
-
-      /* Loop over the parts in cj. */
-      for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) {
+      /* Loop over the parts in cj. Making sure to perform an iteration of the
+       * loop even if exit_iteration_align is zero and there is only one
+       * particle to interact with.*/
+      for (int pjd = 0; pjd <= exit_iteration_align; pjd += VEC_SIZE) {
 
         /* Get the cache index to the jth particle. */
-        int cj_cache_idx = pjd;
+        const int cj_cache_idx = pjd;
 
         vector v_dx, v_dy, v_dz, v_r2;
 
 #ifdef SWIFT_DEBUG_CHECKS
         if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 ||
-            cj_cache_idx + (VEC_SIZE - 1) > (last_pj_align + 1 + VEC_SIZE)) {
-          error("Unaligned read!!! cj_cache_idx=%d, last_pj_align=%d",
-                cj_cache_idx, last_pj_align);
+            cj_cache_idx + (VEC_SIZE - 1) > (last_pj + 1 + VEC_SIZE)) {
+          error("Unaligned read!!! cj_cache_idx=%d, last_pj=%d", cj_cache_idx,
+                last_pj);
         }
 #endif
 
         /* Load 2 sets of vectors from the particle cache. */
-        pjx.v = vec_load(&cj_cache->x[cj_cache_idx]);
-        pjy.v = vec_load(&cj_cache->y[cj_cache_idx]);
-        pjz.v = vec_load(&cj_cache->z[cj_cache_idx]);
+        const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]);
+        const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]);
+        const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]);
 
         /* Compute the pairwise distance. */
-        v_dx.v = vec_sub(pix.v, pjx.v);
-        v_dy.v = vec_sub(piy.v, pjy.v);
-        v_dz.v = vec_sub(piz.v, pjz.v);
+        v_dx.v = vec_sub(v_pix.v, v_pjx.v);
+        v_dy.v = vec_sub(v_piy.v, v_pjy.v);
+        v_dz.v = vec_sub(v_piz.v, v_pjz.v);
 
         v_r2.v = vec_mul(v_dx.v, v_dx.v);
         v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
         v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
 
         mask_t v_doi_mask;
-        int doi_mask;
 
         /* Form r2 < hig2 mask. */
         vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
 
-        /* Form integer mask. */
-        doi_mask = vec_form_int_mask(v_doi_mask);
-
         /* If there are any interactions perform them. */
-        if (doi_mask)
+        if (vec_is_mask_true(v_doi_mask))
           runner_iact_nonsym_1_vec_density(
               &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz,
               &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
-              &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum,
-              &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-              &curlvySum, &curlvzSum, v_doi_mask);
+              &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx],
+              &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum,
+              &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum,
+              v_doi_mask);
 
       } /* loop over the parts in cj. */
 
-      /* Perform horizontal adds on vector sums and store result in particle pi.
-      */
-      VEC_HADD(rhoSum, pi->rho);
-      VEC_HADD(rho_dhSum, pi->density.rho_dh);
-      VEC_HADD(wcountSum, pi->density.wcount);
-      VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
-      VEC_HADD(div_vSum, pi->density.div_v);
-      VEC_HADD(curlvxSum, pi->density.rot_v[0]);
-      VEC_HADD(curlvySum, pi->density.rot_v[1]);
-      VEC_HADD(curlvzSum, pi->density.rot_v[2]);
+      /* Perform horizontal adds on vector sums and store result in pi. */
+      VEC_HADD(v_rhoSum, pi->rho);
+      VEC_HADD(v_rho_dhSum, pi->density.rho_dh);
+      VEC_HADD(v_wcountSum, pi->density.wcount);
+      VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh);
+      VEC_HADD(v_div_vSum, pi->density.div_v);
+      VEC_HADD(v_curlvxSum, pi->density.rot_v[0]);
+      VEC_HADD(v_curlvySum, pi->density.rot_v[1]);
+      VEC_HADD(v_curlvzSum, pi->density.rot_v[2]);
 
     } /* loop over the parts in ci. */
   }
@@ -1066,7 +1140,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       if (!part_is_active_no_debug(pj, max_active_bin)) continue;
 
       /* Set the cache index. */
-      int cj_cache_idx = pjd;
+      const int cj_cache_idx = pjd;
 
       /* Skip this particle if no particle in ci is within range of it. */
       const float hj = cj_cache->h[cj_cache_idx];
@@ -1074,49 +1148,38 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       if (dj_test > di_max) continue;
 
       /* Determine the exit iteration of the interaction loop. */
-      int exit_iteration = max_index_j[pjd];
-
-      const float hjg2 = hj * hj * kernel_gamma2;
-
-      vector pjx, pjy, pjz;
-      vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2;
+      const int exit_iteration = max_index_j[pjd];
 
       /* Fill particle pi vectors. */
-      pjx.v = vec_set1(cj_cache->x[cj_cache_idx]);
-      pjy.v = vec_set1(cj_cache->y[cj_cache_idx]);
-      pjz.v = vec_set1(cj_cache->z[cj_cache_idx]);
-      v_hj.v = vec_set1(hj);
-      v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]);
-      v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]);
-      v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]);
+      const vector v_pjx = vector_set1(cj_cache->x[cj_cache_idx]);
+      const vector v_pjy = vector_set1(cj_cache->y[cj_cache_idx]);
+      const vector v_pjz = vector_set1(cj_cache->z[cj_cache_idx]);
+      const vector v_hj = vector_set1(hj);
+      const vector v_vjx = vector_set1(cj_cache->vx[cj_cache_idx]);
+      const vector v_vjy = vector_set1(cj_cache->vy[cj_cache_idx]);
+      const vector v_vjz = vector_set1(cj_cache->vz[cj_cache_idx]);
 
-      v_hjg2.v = vec_set1(hjg2);
-
-      /* Reset cumulative sums of update vectors. */
-      vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-          curlvySum, curlvzSum;
+      const float hjg2 = hj * hj * kernel_gamma2;
+      const vector v_hjg2 = vector_set1(hjg2);
 
       /* Get the inverse of hj. */
-      vector v_hj_inv;
+      vector v_hj_inv = vec_reciprocal(v_hj);
 
-      v_hj_inv = vec_reciprocal(v_hj);
-
-      rhoSum.v = vec_setzero();
-      rho_dhSum.v = vec_setzero();
-      wcountSum.v = vec_setzero();
-      wcount_dhSum.v = vec_setzero();
-      div_vSum.v = vec_setzero();
-      curlvxSum.v = vec_setzero();
-      curlvySum.v = vec_setzero();
-      curlvzSum.v = vec_setzero();
-
-      vector pix, piy, piz;
+      /* Reset cumulative sums of update vectors. */
+      vector v_rhoSum = vector_setzero();
+      vector v_rho_dhSum = vector_setzero();
+      vector v_wcountSum = vector_setzero();
+      vector v_wcount_dhSum = vector_setzero();
+      vector v_div_vSum = vector_setzero();
+      vector v_curlvxSum = vector_setzero();
+      vector v_curlvySum = vector_setzero();
+      vector v_curlvzSum = vector_setzero();
 
       /* Convert exit iteration to cache indices. */
-      int exit_iteration_align = exit_iteration - first_pi_align;
+      int exit_iteration_align = exit_iteration - first_pi;
 
       /* Pad the exit iteration align so cache reads are aligned. */
-      int rem = exit_iteration_align % VEC_SIZE;
+      const int rem = exit_iteration_align % VEC_SIZE;
       if (exit_iteration_align < VEC_SIZE) {
         exit_iteration_align = 0;
       } else
@@ -1128,61 +1191,56 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
 #ifdef SWIFT_DEBUG_CHECKS
         if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0 ||
-            ci_cache_idx + (VEC_SIZE - 1) >
-                (count_i - first_pi_align + VEC_SIZE)) {
+            ci_cache_idx + (VEC_SIZE - 1) > (count_i - first_pi + VEC_SIZE)) {
           error(
-              "Unaligned read!!! ci_cache_idx=%d, first_pi_align=%d, "
+              "Unaligned read!!! ci_cache_idx=%d, first_pi=%d, "
               "count_i=%d",
-              ci_cache_idx, first_pi_align, count_i);
+              ci_cache_idx, first_pi, count_i);
         }
 #endif
 
         vector v_dx, v_dy, v_dz, v_r2;
 
         /* Load 2 sets of vectors from the particle cache. */
-        pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
-        piy.v = vec_load(&ci_cache->y[ci_cache_idx]);
-        piz.v = vec_load(&ci_cache->z[ci_cache_idx]);
+        const vector v_pix = vector_load(&ci_cache->x[ci_cache_idx]);
+        const vector v_piy = vector_load(&ci_cache->y[ci_cache_idx]);
+        const vector v_piz = vector_load(&ci_cache->z[ci_cache_idx]);
 
         /* Compute the pairwise distance. */
-        v_dx.v = vec_sub(pjx.v, pix.v);
-        v_dy.v = vec_sub(pjy.v, piy.v);
-        v_dz.v = vec_sub(pjz.v, piz.v);
+        v_dx.v = vec_sub(v_pjx.v, v_pix.v);
+        v_dy.v = vec_sub(v_pjy.v, v_piy.v);
+        v_dz.v = vec_sub(v_pjz.v, v_piz.v);
 
         v_r2.v = vec_mul(v_dx.v, v_dx.v);
         v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
         v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
 
         mask_t v_doj_mask;
-        int doj_mask;
 
         /* Form r2 < hig2 mask. */
         vec_create_mask(v_doj_mask, vec_cmp_lt(v_r2.v, v_hjg2.v));
 
-        /* Form integer mask. */
-        doj_mask = vec_form_int_mask(v_doj_mask);
-
         /* If there are any interactions perform them. */
-        if (doj_mask)
+        if (vec_is_mask_true(v_doj_mask))
           runner_iact_nonsym_1_vec_density(
               &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz,
               &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx],
-              &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum,
-              &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-              &curlvySum, &curlvzSum, v_doj_mask);
+              &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx],
+              &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum,
+              &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum,
+              v_doj_mask);
 
       } /* loop over the parts in ci. */
 
-      /* Perform horizontal adds on vector sums and store result in particle pj.
-      */
-      VEC_HADD(rhoSum, pj->rho);
-      VEC_HADD(rho_dhSum, pj->density.rho_dh);
-      VEC_HADD(wcountSum, pj->density.wcount);
-      VEC_HADD(wcount_dhSum, pj->density.wcount_dh);
-      VEC_HADD(div_vSum, pj->density.div_v);
-      VEC_HADD(curlvxSum, pj->density.rot_v[0]);
-      VEC_HADD(curlvySum, pj->density.rot_v[1]);
-      VEC_HADD(curlvzSum, pj->density.rot_v[2]);
+      /* Perform horizontal adds on vector sums and store result in pj. */
+      VEC_HADD(v_rhoSum, pj->rho);
+      VEC_HADD(v_rho_dhSum, pj->density.rho_dh);
+      VEC_HADD(v_wcountSum, pj->density.wcount);
+      VEC_HADD(v_wcount_dhSum, pj->density.wcount_dh);
+      VEC_HADD(v_div_vSum, pj->density.div_v);
+      VEC_HADD(v_curlvxSum, pj->density.rot_v[0]);
+      VEC_HADD(v_curlvySum, pj->density.rot_v[1]);
+      VEC_HADD(v_curlvzSum, pj->density.rot_v[2]);
 
     } /* loop over the parts in cj. */
   }
@@ -1191,3 +1249,381 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
 #endif /* WITH_VECTORIZATION */
 }
+
+/**
+ * @brief Compute the force interactions between a cell pair (non-symmetric)
+ * using vector intrinsics.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ * @param sid The direction of the pair
+ * @param shift The shift vector to apply to the particles in ci.
+ */
+void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
+                              struct cell *cj, const int sid,
+                              const double *shift) {
+
+#ifdef WITH_VECTORIZATION
+  const struct engine *restrict e = r->e;
+  const timebin_t max_active_bin = e->max_active_bin;
+
+  TIMER_TIC;
+
+  /* Get the cutoff shift. */
+  double rshift = 0.0;
+  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
+
+  /* Pick-out the sorted lists. */
+  const struct entry *restrict sort_i = ci->sort[sid];
+  const struct entry *restrict sort_j = cj->sort[sid];
+
+  /* Get some other useful values. */
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  const double hi_max = ci->h_max * kernel_gamma - rshift;
+  const double hj_max = cj->h_max * kernel_gamma;
+  const double hi_max_raw = ci->h_max;
+  const double hj_max_raw = cj->h_max;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+  const double di_max = sort_i[count_i - 1].d - rshift;
+  const double dj_min = sort_j[0].d;
+  const float dx_max = (ci->dx_max_sort + cj->dx_max_sort);
+  const int active_ci = cell_is_active(ci, e);
+  const int active_cj = cell_is_active(cj, e);
+
+  /* Check if any particles are active and in range */
+  int numActive = 0;
+
+  /* Use the largest smoothing length to make sure that no interactions are
+   * missed. */
+  const double h_max = max(hi_max, hj_max);
+
+  if (active_ci) {
+    for (int pid = count_i - 1;
+         pid >= 0 && sort_i[pid].d + h_max + dx_max > dj_min; pid--) {
+      struct part *restrict pi = &parts_i[sort_i[pid].i];
+      if (part_is_active(pi, e)) {
+        numActive++;
+        break;
+      }
+    }
+  }
+
+  if (!numActive && active_cj) {
+    for (int pjd = 0; pjd < count_j && sort_j[pjd].d - h_max - dx_max < di_max;
+         pjd++) {
+      struct part *restrict pj = &parts_j[sort_j[pjd].i];
+      if (part_is_active_no_debug(pj, max_active_bin)) {
+        numActive++;
+        break;
+      }
+    }
+  }
+
+  /* Return if no active particle in range */
+  if (numActive == 0) return;
+
+  /* Get both particle caches from the runner and re-allocate
+   * them if they are not big enough for the cells. */
+  struct cache *restrict ci_cache = &r->ci_cache;
+  struct cache *restrict cj_cache = &r->cj_cache;
+
+  if (ci_cache->count < count_i) {
+    cache_init(ci_cache, count_i);
+  }
+  if (cj_cache->count < count_j) {
+    cache_init(cj_cache, count_j);
+  }
+
+  int first_pi, last_pj;
+  int *max_index_i SWIFT_CACHE_ALIGN;
+  int *max_index_j SWIFT_CACHE_ALIGN;
+
+  max_index_i = r->ci_cache.max_index;
+  max_index_j = r->cj_cache.max_index;
+
+  /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
+  /* Also find the first pi that interacts with any particle in cj and the last
+   * pj that interacts with any particle in ci. */
+  populate_max_index_no_cache_force(ci, cj, sort_i, sort_j, dx_max, rshift,
+                                    hi_max_raw, hj_max_raw, hi_max, hj_max,
+                                    di_max, dj_min, max_index_i, max_index_j,
+                                    &first_pi, &last_pj, max_active_bin);
+
+  /* Limits of the outer loops. */
+  const int first_pi_loop = first_pi;
+  const int last_pj_loop = last_pj;
+
+  /* Take the max/min of both values calculated to work out how many particles
+   * to read into the cache. */
+  last_pj = max(last_pj, max_index_i[count_i - 1]);
+  first_pi = min(first_pi, max_index_j[0]);
+
+  /* Read the required particles into the two caches. */
+  cache_read_two_partial_cells_sorted_force(ci, cj, ci_cache, cj_cache, sort_i,
+                                            sort_j, shift, &first_pi, &last_pj);
+
+  /* Get the number of particles read into the ci cache. */
+  const int ci_cache_count = count_i - first_pi;
+
+  if (active_ci) {
+
+    /* Loop over the parts in ci until nothing is within range in cj. */
+    for (int pid = count_i - 1; pid >= first_pi_loop; pid--) {
+
+      /* Get a hold of the ith part in ci. */
+      struct part *restrict pi = &parts_i[sort_i[pid].i];
+      if (!part_is_active(pi, e)) continue;
+
+      /* Set the cache index. */
+      const int ci_cache_idx = pid - first_pi;
+
+      /* Skip this particle if no particle in cj is within range of it. */
+      const float hi = ci_cache->h[ci_cache_idx];
+      const double di_test =
+          sort_i[pid].d + max(hi, hj_max_raw) * kernel_gamma + dx_max - rshift;
+      if (di_test < dj_min) continue;
+
+      /* Determine the exit iteration of the interaction loop. */
+      const int exit_iteration = max_index_i[pid];
+
+      /* Fill particle pi vectors. */
+      const vector v_pix = vector_set1(ci_cache->x[ci_cache_idx]);
+      const vector v_piy = vector_set1(ci_cache->y[ci_cache_idx]);
+      const vector v_piz = vector_set1(ci_cache->z[ci_cache_idx]);
+      const vector v_hi = vector_set1(hi);
+      const vector v_vix = vector_set1(ci_cache->vx[ci_cache_idx]);
+      const vector v_viy = vector_set1(ci_cache->vy[ci_cache_idx]);
+      const vector v_viz = vector_set1(ci_cache->vz[ci_cache_idx]);
+      const vector v_rhoi = vector_set1(ci_cache->rho[ci_cache_idx]);
+      const vector v_grad_hi = vector_set1(ci_cache->grad_h[ci_cache_idx]);
+      const vector v_pOrhoi2 = vector_set1(ci_cache->pOrho2[ci_cache_idx]);
+      const vector v_balsara_i = vector_set1(ci_cache->balsara[ci_cache_idx]);
+      const vector v_ci = vector_set1(ci_cache->soundspeed[ci_cache_idx]);
+
+      const float hig2 = hi * hi * kernel_gamma2;
+      const vector v_hig2 = vector_set1(hig2);
+
+      /* Get the inverse of hi. */
+      vector v_hi_inv = vec_reciprocal(v_hi);
+
+      /* Reset cumulative sums of update vectors. */
+      vector v_a_hydro_xSum = vector_setzero();
+      vector v_a_hydro_ySum = vector_setzero();
+      vector v_a_hydro_zSum = vector_setzero();
+      vector v_h_dtSum = vector_setzero();
+      vector v_sigSum = vector_set1(pi->force.v_sig);
+      vector v_entropy_dtSum = vector_setzero();
+
+      /* Pad the exit iteration if there is a serial remainder. */
+      int exit_iteration_align = exit_iteration;
+      const int rem = exit_iteration % VEC_SIZE;
+      if (rem != 0) {
+        int pad = VEC_SIZE - rem;
+
+        if (exit_iteration_align + pad <= last_pj + 1)
+          exit_iteration_align += pad;
+      }
+
+      /* Loop over the parts in cj. Making sure to perform an iteration of the
+       * loop even if exit_iteration_align is zero and there is only one
+       * particle to interact with.*/
+      for (int pjd = 0; pjd <= exit_iteration_align; pjd += VEC_SIZE) {
+
+        /* Get the cache index to the jth particle. */
+        const int cj_cache_idx = pjd;
+
+        vector v_dx, v_dy, v_dz, v_r2;
+        vector v_hjg2;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 ||
+            cj_cache_idx + (VEC_SIZE - 1) > (last_pj + 1 + VEC_SIZE)) {
+          error("Unaligned read!!! cj_cache_idx=%d, last_pj=%d", cj_cache_idx,
+                last_pj);
+        }
+#endif
+
+        /* Load 2 sets of vectors from the particle cache. */
+        const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]);
+        const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]);
+        const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]);
+        const vector v_hj = vector_load(&cj_cache->h[cj_cache_idx]);
+        v_hjg2.v = vec_mul(vec_mul(v_hj.v, v_hj.v), kernel_gamma2_vec.v);
+
+        /* Compute the pairwise distance. */
+        v_dx.v = vec_sub(v_pix.v, v_pjx.v);
+        v_dy.v = vec_sub(v_piy.v, v_pjy.v);
+        v_dz.v = vec_sub(v_piz.v, v_pjz.v);
+
+        v_r2.v = vec_mul(v_dx.v, v_dx.v);
+        v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
+        v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
+
+        mask_t v_doi_mask;
+
+        /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */
+        vector v_h2;
+        v_h2.v = vec_fmax(v_hig2.v, v_hjg2.v);
+        vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v));
+
+        /* If there are any interactions perform them. */
+        if (vec_is_mask_true(v_doi_mask)) {
+          vector v_hj_inv = vec_reciprocal(v_hj);
+
+          runner_iact_nonsym_1_vec_force(
+              &v_r2, &v_dx, &v_dy, &v_dz, v_vix, v_viy, v_viz, v_rhoi,
+              v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci,
+              &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
+              &cj_cache->vz[cj_cache_idx], &cj_cache->rho[cj_cache_idx],
+              &cj_cache->grad_h[cj_cache_idx], &cj_cache->pOrho2[cj_cache_idx],
+              &cj_cache->balsara[cj_cache_idx],
+              &cj_cache->soundspeed[cj_cache_idx], &cj_cache->m[cj_cache_idx],
+              v_hi_inv, v_hj_inv, &v_a_hydro_xSum, &v_a_hydro_ySum,
+              &v_a_hydro_zSum, &v_h_dtSum, &v_sigSum, &v_entropy_dtSum,
+              v_doi_mask);
+        }
+
+      } /* loop over the parts in cj. */
+
+      /* Perform horizontal adds on vector sums and store result in pi. */
+      VEC_HADD(v_a_hydro_xSum, pi->a_hydro[0]);
+      VEC_HADD(v_a_hydro_ySum, pi->a_hydro[1]);
+      VEC_HADD(v_a_hydro_zSum, pi->a_hydro[2]);
+      VEC_HADD(v_h_dtSum, pi->force.h_dt);
+      VEC_HMAX(v_sigSum, pi->force.v_sig);
+      VEC_HADD(v_entropy_dtSum, pi->entropy_dt);
+
+    } /* loop over the parts in ci. */
+  }
+
+  if (active_cj) {
+
+    /* Loop over the parts in cj until nothing is within range in ci. */
+    for (int pjd = 0; pjd <= last_pj_loop; pjd++) {
+
+      /* Get a hold of the jth part in cj. */
+      struct part *restrict pj = &parts_j[sort_j[pjd].i];
+      if (!part_is_active(pj, e)) continue;
+
+      /* Set the cache index. */
+      const int cj_cache_idx = pjd;
+
+      /* Skip this particle if no particle in ci is within range of it. */
+      const float hj = cj_cache->h[cj_cache_idx];
+      const double dj_test =
+          sort_j[pjd].d - max(hj, hi_max_raw) * kernel_gamma - dx_max;
+      if (dj_test > di_max) continue;
+
+      /* Determine the exit iteration of the interaction loop. */
+      const int exit_iteration = max_index_j[pjd];
+
+      /* Fill particle pi vectors. */
+      const vector v_pjx = vector_set1(cj_cache->x[cj_cache_idx]);
+      const vector v_pjy = vector_set1(cj_cache->y[cj_cache_idx]);
+      const vector v_pjz = vector_set1(cj_cache->z[cj_cache_idx]);
+      const vector v_hj = vector_set1(hj);
+      const vector v_vjx = vector_set1(cj_cache->vx[cj_cache_idx]);
+      const vector v_vjy = vector_set1(cj_cache->vy[cj_cache_idx]);
+      const vector v_vjz = vector_set1(cj_cache->vz[cj_cache_idx]);
+      const vector v_rhoj = vector_set1(cj_cache->rho[cj_cache_idx]);
+      const vector v_grad_hj = vector_set1(cj_cache->grad_h[cj_cache_idx]);
+      const vector v_pOrhoj2 = vector_set1(cj_cache->pOrho2[cj_cache_idx]);
+      const vector v_balsara_j = vector_set1(cj_cache->balsara[cj_cache_idx]);
+      const vector v_cj = vector_set1(cj_cache->soundspeed[cj_cache_idx]);
+
+      const float hjg2 = hj * hj * kernel_gamma2;
+      const vector v_hjg2 = vector_set1(hjg2);
+
+      /* Get the inverse of hj. */
+      vector v_hj_inv = vec_reciprocal(v_hj);
+
+      /* Reset cumulative sums of update vectors. */
+      vector v_a_hydro_xSum = vector_setzero();
+      vector v_a_hydro_ySum = vector_setzero();
+      vector v_a_hydro_zSum = vector_setzero();
+      vector v_h_dtSum = vector_setzero();
+      vector v_sigSum = vector_set1(pj->force.v_sig);
+      vector v_entropy_dtSum = vector_setzero();
+
+      /* Convert exit iteration to cache indices. */
+      int exit_iteration_align = exit_iteration - first_pi;
+
+      /* Pad the exit iteration align so cache reads are aligned. */
+      const int rem = exit_iteration_align % VEC_SIZE;
+      if (exit_iteration_align < VEC_SIZE) {
+        exit_iteration_align = 0;
+      } else
+        exit_iteration_align -= rem;
+
+      /* Loop over the parts in ci. */
+      for (int ci_cache_idx = exit_iteration_align;
+           ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) {
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0) {
+          error("Unaligned read!!! ci_cache_idx=%d", ci_cache_idx);
+        }
+#endif
+
+        vector v_hig2;
+        vector v_dx, v_dy, v_dz, v_r2;
+
+        /* Load 2 sets of vectors from the particle cache. */
+        const vector v_pix = vector_load(&ci_cache->x[ci_cache_idx]);
+        const vector v_piy = vector_load(&ci_cache->y[ci_cache_idx]);
+        const vector v_piz = vector_load(&ci_cache->z[ci_cache_idx]);
+        const vector v_hi = vector_load(&ci_cache->h[ci_cache_idx]);
+        v_hig2.v = vec_mul(vec_mul(v_hi.v, v_hi.v), kernel_gamma2_vec.v);
+
+        /* Compute the pairwise distance. */
+        v_dx.v = vec_sub(v_pjx.v, v_pix.v);
+        v_dy.v = vec_sub(v_pjy.v, v_piy.v);
+        v_dz.v = vec_sub(v_pjz.v, v_piz.v);
+
+        v_r2.v = vec_mul(v_dx.v, v_dx.v);
+        v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
+        v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
+
+        mask_t v_doj_mask;
+
+        /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */
+        vector v_h2;
+        v_h2.v = vec_fmax(v_hjg2.v, v_hig2.v);
+        vec_create_mask(v_doj_mask, vec_cmp_lt(v_r2.v, v_h2.v));
+
+        /* If there are any interactions perform them. */
+        if (vec_is_mask_true(v_doj_mask)) {
+          vector v_hi_inv = vec_reciprocal(v_hi);
+
+          runner_iact_nonsym_1_vec_force(
+              &v_r2, &v_dx, &v_dy, &v_dz, v_vjx, v_vjy, v_vjz, v_rhoj,
+              v_grad_hj, v_pOrhoj2, v_balsara_j, v_cj,
+              &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx],
+              &ci_cache->vz[ci_cache_idx], &ci_cache->rho[ci_cache_idx],
+              &ci_cache->grad_h[ci_cache_idx], &ci_cache->pOrho2[ci_cache_idx],
+              &ci_cache->balsara[ci_cache_idx],
+              &ci_cache->soundspeed[ci_cache_idx], &ci_cache->m[ci_cache_idx],
+              v_hj_inv, v_hi_inv, &v_a_hydro_xSum, &v_a_hydro_ySum,
+              &v_a_hydro_zSum, &v_h_dtSum, &v_sigSum, &v_entropy_dtSum,
+              v_doj_mask);
+        }
+      } /* loop over the parts in ci. */
+
+      /* Perform horizontal adds on vector sums and store result in pj. */
+      VEC_HADD(v_a_hydro_xSum, pj->a_hydro[0]);
+      VEC_HADD(v_a_hydro_ySum, pj->a_hydro[1]);
+      VEC_HADD(v_a_hydro_zSum, pj->a_hydro[2]);
+      VEC_HADD(v_h_dtSum, pj->force.h_dt);
+      VEC_HMAX(v_sigSum, pj->force.v_sig);
+      VEC_HADD(v_entropy_dtSum, pj->entropy_dt);
+
+    } /* loop over the parts in cj. */
+
+    TIMER_TOC(timer_dopair_density);
+  }
+
+#endif /* WITH_VECTORIZATION */
+}
diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h
index 09dc76ef04df5d29ea32f4af24efdc09e433aa73..9d60c75b9545bfbee8023224e34d16d362560a3f 100644
--- a/src/runner_doiact_vec.h
+++ b/src/runner_doiact_vec.h
@@ -39,5 +39,8 @@ void runner_doself2_force_vec(struct runner *r, struct cell *restrict c);
 void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci,
                                 struct cell *restrict cj, const int sid,
                                 const double *shift);
+void runner_dopair2_force_vec(struct runner *r, struct cell *restrict ci,
+                              struct cell *restrict cj, const int sid,
+                              const double *shift);
 
 #endif /* SWIFT_RUNNER_VEC_H */
diff --git a/src/tools.c b/src/tools.c
index 3ee55db3d5f5348699372d2620b6d15af38b23d0..89d89e62ea092ba6c6ec661e423d3e0ee44eb7fe 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -254,6 +254,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) {
   float r2, hi, hj, hig2, hjg2, dx[3];
   struct part *pi, *pj;
   const double dim[3] = {r->e->s->dim[0], r->e->s->dim[1], r->e->s->dim[2]};
+  const struct engine *e = r->e;
 
   /* Implements a double-for loop and checks every interaction */
   for (int i = 0; i < ci->count; ++i) {
@@ -262,6 +263,9 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) {
     hi = pi->h;
     hig2 = hi * hi * kernel_gamma2;
 
+    /* Skip inactive particles. */
+    if (!part_is_active(pi, e)) continue;
+
     for (int j = 0; j < cj->count; ++j) {
 
       pj = &cj->parts[j];
@@ -292,6 +296,9 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) {
     hj = pj->h;
     hjg2 = hj * hj * kernel_gamma2;
 
+    /* Skip inactive particles. */
+    if (!part_is_active(pj, e)) continue;
+
     for (int i = 0; i < ci->count; ++i) {
 
       pi = &ci->parts[i];
@@ -510,9 +517,7 @@ void shuffle_particles(struct part *parts, const int count) {
 
       parts[i] = particle;
     }
-
-  } else
-    error("Array not big enough to shuffle!");
+  }
 }
 
 /**
diff --git a/src/vector.h b/src/vector.h
index 3222a8a9b6a1bf7f5dc02e914a168d7c5c911b68..15a246d9b6a4d991638f7eb801e6173d977b1273 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -20,9 +20,6 @@
 #ifndef SWIFT_VECTOR_H
 #define SWIFT_VECTOR_H
 
-/* Have I already read this file? */
-#ifndef VEC_MACRO
-
 /* Config parameters. */
 #include "../config.h"
 
@@ -84,7 +81,7 @@
 #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
 #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
 #define vec_cmp_result(a) ({ a; })
-#define vec_form_int_mask(a) ({ a; })
+#define vec_is_mask_true(a) ({ a; })
 #define vec_and(a, b) _mm512_and_ps(a, b)
 #define vec_mask_and(a, b) _mm512_kand(a, b)
 #define vec_and_mask(a, mask) _mm512_maskz_mov_ps(mask, a)
@@ -182,7 +179,7 @@
 #define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
 #define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
 #define vec_cmp_result(a) _mm256_movemask_ps(a)
-#define vec_form_int_mask(a) _mm256_movemask_ps(a.v)
+#define vec_is_mask_true(a) _mm256_movemask_ps(a.v)
 #define vec_and(a, b) _mm256_and_ps(a, b)
 #define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v)
 #define vec_and_mask(a, mask) _mm256_and_ps(a, mask.v)
@@ -431,11 +428,48 @@ __attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) {
   return x_inv;
 }
 
+/**
+ * @brief Returns a new vector with data loaded from a memory address.
+ *
+ * @param x memory address to load from.
+ * @return Loaded #vector.
+ */
+__attribute__((always_inline)) INLINE vector vector_load(float *const x) {
+
+  vector temp;
+  temp.v = vec_load(x);
+  return temp;
+}
+
+/**
+ * @brief Returns a vector filled with one value.
+ *
+ * @param x value to set each element.
+ * @return A #vector filled with a given constant.
+ */
+__attribute__((always_inline)) INLINE vector vector_set1(const float x) {
+
+  vector temp;
+  temp.v = vec_set1(x);
+  return temp;
+}
+
+/**
+ * @brief Returns a new vector filled with zeros.
+ *
+ * @return temp set #vector.
+ * @return A #vector filled with zeros.
+ */
+__attribute__((always_inline)) INLINE vector vector_setzero() {
+
+  vector temp;
+  temp.v = vec_setzero();
+  return temp;
+}
+
 #else
 /* Needed for cache alignment. */
 #define VEC_SIZE 8
 #endif /* WITH_VECTORIZATION */
 
-#endif /* VEC_MACRO */
-
 #endif /* SWIFT_VECTOR_H */
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 553980a93e907e83b65bb4539ca49c8bc1b7207b..f6f88b51cbdc5a7c5ab1d331b295418af66ade53 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -102,5 +102,5 @@ EXTRA_DIST = testReading.sh makeInput.py testActivePair.sh \
 	     testPeriodicBCPerturbed.sh test125cells.sh test125cellsPerturbed.sh testParserInput.yaml \
 	     difffloat.py tolerance_125_normal.dat tolerance_125_perturbed.dat \
              tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat tolerance_27_perturbed_h2.dat \
-	     tolerance_testInteractions.dat tolerance_pair_active.dat \
+	     tolerance_testInteractions.dat tolerance_pair_active.dat tolerance_pair_force_active.dat \
 	     fft_params.yml tolerance_periodic_BC_normal.dat tolerance_periodic_BC_perturbed.dat
diff --git a/tests/test125cells.c b/tests/test125cells.c
index dbb3a7c3f5b052a3585aaf04b2cd8b24ded9adba..62ded5c1df47c9d40759dbc7aa239bb4a8a47dc4 100644
--- a/tests/test125cells.c
+++ b/tests/test125cells.c
@@ -33,13 +33,18 @@
 
 #if defined(WITH_VECTORIZATION)
 #define DOSELF2 runner_doself2_force_vec
+#define DOPAIR2 runner_dopair2_branch_force
 #define DOSELF2_NAME "runner_doself2_force_vec"
-#define DOPAIR2_NAME "runner_dopair2_force"
+#define DOPAIR2_NAME "runner_dopair2_force_vec"
 #endif
 
 #ifndef DOSELF2
 #define DOSELF2 runner_doself2_force
 #define DOSELF2_NAME "runner_doself2_density"
+#endif
+
+#ifndef DOPAIR2
+#define DOPAIR2 runner_dopair2_branch_force
 #define DOPAIR2_NAME "runner_dopair2_force"
 #endif
 
@@ -445,11 +450,11 @@ void dump_particle_fields(char *fileName, struct cell *main_cell,
 }
 
 /* Just a forward declaration... */
-void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
 void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
                                    struct cell *cj);
 void runner_doself1_density(struct runner *r, struct cell *ci);
-void runner_dopair2_force(struct runner *r, struct cell *ci, struct cell *cj);
+void runner_dopair2_branch_force(struct runner *r, struct cell *ci,
+                                 struct cell *cj);
 void runner_doself2_force(struct runner *r, struct cell *ci);
 void runner_doself2_force_vec(struct runner *r, struct cell *ci);
 
@@ -625,7 +630,6 @@ int main(int argc, char *argv[]) {
 
   /* Start the test */
   ticks time = 0;
-  ticks self_force_time = 0;
   for (size_t n = 0; n < runs; ++n) {
 
     const ticks tic = getticks();
@@ -696,6 +700,14 @@ int main(int argc, char *argv[]) {
 /* Do the force calculation */
 #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION))
 
+#ifdef WITH_VECTORIZATION
+    /* Initialise the cache. */
+    runner.ci_cache.count = 0;
+    runner.cj_cache.count = 0;
+    cache_init(&runner.ci_cache, 512);
+    cache_init(&runner.cj_cache, 512);
+#endif
+
     int ctr = 0;
     /* Do the pairs (for the central 27 cells) */
     for (int i = 1; i < 4; i++) {
@@ -708,27 +720,19 @@ int main(int argc, char *argv[]) {
 
             const ticks sub_tic = getticks();
 
-            runner_dopair2_force(&runner, main_cell, cj);
+            DOPAIR2(&runner, main_cell, cj);
 
-            const ticks sub_toc = getticks();
-            timings[ctr++] += sub_toc - sub_tic;
+            timings[ctr++] += getticks() - sub_tic;
           }
         }
       }
     }
 
-#ifdef WITH_VECTORIZATION
-    /* Initialise the cache. */
-    runner.ci_cache.count = 0;
-    cache_init(&runner.ci_cache, 512);
-#endif
-
     ticks self_tic = getticks();
 
     /* And now the self-interaction for the main cell */
     DOSELF2(&runner, main_cell);
 
-    self_force_time += getticks() - self_tic;
     timings[26] += getticks() - self_tic;
 #endif
 
@@ -761,11 +765,12 @@ int main(int argc, char *argv[]) {
   ticks face_time = timings[4] + timings[10] + timings[12] + timings[13] +
                     timings[15] + timings[21];
 
+  ticks self_time = timings[26];
+
   message("Corner calculations took       : %15lli ticks.", corner_time / runs);
   message("Edge calculations took         : %15lli ticks.", edge_time / runs);
   message("Face calculations took         : %15lli ticks.", face_time / runs);
-  message("Self calculations took         : %15lli ticks.",
-          self_force_time / runs);
+  message("Self calculations took         : %15lli ticks.", self_time / runs);
   message("SWIFT calculation took         : %15lli ticks.", time / runs);
 
   for (int j = 0; j < 125; ++j)
diff --git a/tests/testActivePair.c b/tests/testActivePair.c
index 1e0111b4f0e480d0f66463b4c2264cdd89bd28c8..c8e0233c2c317be321ed8b923a46a98e0905b36c 100644
--- a/tests/testActivePair.c
+++ b/tests/testActivePair.c
@@ -29,6 +29,9 @@
 /* Local headers. */
 #include "swift.h"
 
+/* Typdef function pointer for interaction function. */
+typedef void (*interaction_func)(struct runner *, struct cell *, struct cell *);
+
 /**
  * @brief Constructs a cell and all of its particle in a valid state prior to
  * a DOPAIR or DOSELF calcuation.
@@ -108,6 +111,12 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
         else
           part->time_bin = num_time_bins + 1;
 
+        part->rho = 1.f;
+        part->force.f = 1.f;
+        part->force.P_over_rho2 = 1.f;
+        part->force.balsara = 1.f;
+        part->force.soundspeed = 1.f;
+
 #ifdef SWIFT_DEBUG_CHECKS
         part->ti_drift = 8;
         part->ti_kick = 8;
@@ -133,7 +142,7 @@ struct cell *make_cell(size_t n, double *offset, double size, double h,
 
   cell->ti_old_part = 8;
   cell->ti_end_min = 8;
-  cell->ti_end_max = 8;
+  cell->ti_end_max = 10;
 
   shuffle_particles(cell->parts, cell->count);
 
@@ -156,6 +165,11 @@ void clean_up(struct cell *ci) {
 void zero_particle_fields(struct cell *c) {
   for (int pid = 0; pid < c->count; pid++) {
     hydro_init_part(&c->parts[pid], NULL);
+    c->parts[pid].rho = 1.f;
+    c->parts[pid].force.f = 1.f;
+    c->parts[pid].force.P_over_rho2 = 1.f;
+    c->parts[pid].force.balsara = 1.f;
+    c->parts[pid].force.soundspeed = 1.f;
   }
 }
 
@@ -179,20 +193,20 @@ void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
   FILE *file = fopen(fileName, "a");
 
   /* Write header */
-  fprintf(file, "# %4s %13s\n", "ID", "wcount");
+  fprintf(file, "# %4s %13s %13s\n", "ID", "wcount", "h_dt");
 
   fprintf(file, "# ci --------------------------------------------\n");
 
   for (int pid = 0; pid < ci->count; pid++) {
-    fprintf(file, "%6llu %13e\n", ci->parts[pid].id,
-            ci->parts[pid].density.wcount);
+    fprintf(file, "%6llu %13e %13e\n", ci->parts[pid].id,
+            ci->parts[pid].density.wcount, ci->parts[pid].force.h_dt);
   }
 
   fprintf(file, "# cj --------------------------------------------\n");
 
   for (int pjd = 0; pjd < cj->count; pjd++) {
-    fprintf(file, "%6llu %13e\n", cj->parts[pjd].id,
-            cj->parts[pjd].density.wcount);
+    fprintf(file, "%6llu %13e %13e\n", cj->parts[pjd].id,
+            cj->parts[pjd].density.wcount, cj->parts[pjd].force.h_dt);
   }
 
   fclose(file);
@@ -200,9 +214,13 @@ void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) {
 
 /* Just a forward declaration... */
 void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
+void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
+                              struct cell *cj);
 void runner_doself1_density_vec(struct runner *r, struct cell *ci);
 void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
                                    struct cell *cj);
+void runner_dopair2_branch_force(struct runner *r, struct cell *ci,
+                                 struct cell *cj);
 
 /**
  * @brief Computes the pair interactions of two cells using SWIFT and a brute
@@ -210,7 +228,9 @@ void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
  */
 void test_pair_interactions(struct runner *runner, struct cell **ci,
                             struct cell **cj, char *swiftOutputFileName,
-                            char *bruteForceOutputFileName) {
+                            char *bruteForceOutputFileName,
+                            interaction_func serial_interaction,
+                            interaction_func vec_interaction) {
 
   runner_do_sort(runner, *ci, 0x1FFF, 0, 0);
   runner_do_sort(runner, *cj, 0x1FFF, 0, 0);
@@ -220,7 +240,7 @@ void test_pair_interactions(struct runner *runner, struct cell **ci,
   zero_particle_fields(*cj);
 
   /* Run the test */
-  runner_dopair1_branch_density(runner, *ci, *cj);
+  vec_interaction(runner, *ci, *cj);
 
   /* Let's get physical ! */
   end_calculation(*ci);
@@ -236,7 +256,7 @@ void test_pair_interactions(struct runner *runner, struct cell **ci,
   zero_particle_fields(*cj);
 
   /* Run the brute-force test */
-  pairs_all_density(runner, *ci, *cj);
+  serial_interaction(runner, *ci, *cj);
 
   /* Let's get physical ! */
   end_calculation(*ci);
@@ -248,16 +268,26 @@ void test_pair_interactions(struct runner *runner, struct cell **ci,
 /**
  * @brief Computes the pair interactions of two cells in various configurations.
  */
-void test_all_pair_interactions(struct runner *runner, double *offset2,
-                                size_t particles, double size, double h,
-                                double rho, long long *partId,
-                                double perturbation, double h_pert,
-                                char *swiftOutputFileName,
-                                char *bruteForceOutputFileName) {
+void test_all_pair_interactions(
+    struct runner *runner, double *offset2, size_t particles, double size,
+    double h, double rho, long long *partId, double perturbation, double h_pert,
+    char *swiftOutputFileName, char *bruteForceOutputFileName,
+    interaction_func serial_interaction, interaction_func vec_interaction) {
 
   double offset1[3] = {0, 0, 0};
   struct cell *ci, *cj;
 
+  /* Only one particle in each cell. */
+  ci = make_cell(1, offset1, size, h, rho, partId, perturbation, h_pert, 1.);
+  cj = make_cell(1, offset2, size, h, rho, partId, perturbation, h_pert, 1.);
+
+  test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
+                         bruteForceOutputFileName, serial_interaction,
+                         vec_interaction);
+
+  clean_up(ci);
+  clean_up(cj);
+
   /* All active particles. */
   ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert,
                  1.);
@@ -265,7 +295,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2,
                  1.);
 
   test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
-                         bruteForceOutputFileName);
+                         bruteForceOutputFileName, serial_interaction,
+                         vec_interaction);
 
   clean_up(ci);
   clean_up(cj);
@@ -277,7 +308,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2,
                  0.5);
 
   test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
-                         bruteForceOutputFileName);
+                         bruteForceOutputFileName, serial_interaction,
+                         vec_interaction);
 
   clean_up(ci);
   clean_up(cj);
@@ -289,7 +321,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2,
                  0.);
 
   test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
-                         bruteForceOutputFileName);
+                         bruteForceOutputFileName, serial_interaction,
+                         vec_interaction);
 
   clean_up(ci);
   clean_up(cj);
@@ -301,7 +334,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2,
                  0.1);
 
   test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
-                         bruteForceOutputFileName);
+                         bruteForceOutputFileName, serial_interaction,
+                         vec_interaction);
 
   clean_up(ci);
   clean_up(cj);
@@ -313,7 +347,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2,
                  0.);
 
   test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
-                         bruteForceOutputFileName);
+                         bruteForceOutputFileName, serial_interaction,
+                         vec_interaction);
 
   clean_up(ci);
   clean_up(cj);
@@ -325,7 +360,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2,
                  1.0);
 
   test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
-                         bruteForceOutputFileName);
+                         bruteForceOutputFileName, serial_interaction,
+                         vec_interaction);
 
   clean_up(ci);
   clean_up(cj);
@@ -335,7 +371,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2,
   cj = make_cell(2, offset2, size, h, rho, partId, perturbation, h_pert, 1.0);
 
   test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
-                         bruteForceOutputFileName);
+                         bruteForceOutputFileName, serial_interaction,
+                         vec_interaction);
 
   clean_up(ci);
   clean_up(cj);
@@ -345,7 +382,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2,
   cj = make_cell(3, offset2, size, h, rho, partId, perturbation, h_pert, 0.75);
 
   test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
-                         bruteForceOutputFileName);
+                         bruteForceOutputFileName, serial_interaction,
+                         vec_interaction);
 
   clean_up(ci);
   clean_up(cj);
@@ -357,7 +395,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2,
                  0.);
 
   test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
-                         bruteForceOutputFileName);
+                         bruteForceOutputFileName, serial_interaction,
+                         vec_interaction);
 
   clean_up(ci);
   clean_up(cj);
@@ -369,7 +408,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2,
                  0.5);
 
   test_pair_interactions(runner, &ci, &cj, swiftOutputFileName,
-                         bruteForceOutputFileName);
+                         bruteForceOutputFileName, serial_interaction,
+                         vec_interaction);
 
   /* Clean things to make the sanitizer happy ... */
   clean_up(ci);
@@ -470,7 +510,7 @@ int main(int argc, char *argv[]) {
 
   /* Create output file names. */
   sprintf(swiftOutputFileName, "swift_dopair_%s.dat", outputFileNameExtension);
-  sprintf(bruteForceOutputFileName, "brute_force_%s.dat",
+  sprintf(bruteForceOutputFileName, "brute_force_pair_%s.dat",
           outputFileNameExtension);
 
   /* Delete files if they already exist. */
@@ -486,10 +526,56 @@ int main(int argc, char *argv[]) {
 
   double offset[3] = {1., 0., 0.};
 
+  /* Define which interactions to call */
+  interaction_func serial_inter_func = &pairs_all_density;
+  interaction_func vec_inter_func = &runner_dopair1_branch_density;
+
   /* Test a pair of cells face-on. */
   test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
                              perturbation, h_pert, swiftOutputFileName,
-                             bruteForceOutputFileName);
+                             bruteForceOutputFileName, serial_inter_func,
+                             vec_inter_func);
+
+  /* Test a pair of cells edge-on. */
+  offset[0] = 1.;
+  offset[1] = 1.;
+  offset[2] = 0.;
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName, serial_inter_func,
+                             vec_inter_func);
+
+  /* Test a pair of cells corner-on. */
+  offset[0] = 1.;
+  offset[1] = 1.;
+  offset[2] = 1.;
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName, serial_inter_func,
+                             vec_inter_func);
+
+  /* Re-assign function pointers. */
+  serial_inter_func = &pairs_all_force;
+  vec_inter_func = &runner_dopair2_branch_force;
+
+  /* Create new output file names. */
+  sprintf(swiftOutputFileName, "swift_dopair2_force_%s.dat",
+          outputFileNameExtension);
+  sprintf(bruteForceOutputFileName, "brute_force_dopair2_%s.dat",
+          outputFileNameExtension);
+
+  /* Delete files if they already exist. */
+  remove(swiftOutputFileName);
+  remove(bruteForceOutputFileName);
+
+  /* Test a pair of cells face-on. */
+  offset[0] = 1.;
+  offset[1] = 0.;
+  offset[2] = 0.;
+  test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
+                             perturbation, h_pert, swiftOutputFileName,
+                             bruteForceOutputFileName, serial_inter_func,
+                             vec_inter_func);
 
   /* Test a pair of cells edge-on. */
   offset[0] = 1.;
@@ -497,7 +583,8 @@ int main(int argc, char *argv[]) {
   offset[2] = 0.;
   test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
                              perturbation, h_pert, swiftOutputFileName,
-                             bruteForceOutputFileName);
+                             bruteForceOutputFileName, serial_inter_func,
+                             vec_inter_func);
 
   /* Test a pair of cells corner-on. */
   offset[0] = 1.;
@@ -505,6 +592,7 @@ int main(int argc, char *argv[]) {
   offset[2] = 1.;
   test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId,
                              perturbation, h_pert, swiftOutputFileName,
-                             bruteForceOutputFileName);
+                             bruteForceOutputFileName, serial_inter_func,
+                             vec_inter_func);
   return 0;
 }
diff --git a/tests/testActivePair.sh.in b/tests/testActivePair.sh.in
index 108da19c5c667a6a46a81707a7508a27f39f6b38..87d09cad7118c45b41c5a41058c331cbda92b613 100755
--- a/tests/testActivePair.sh.in
+++ b/tests/testActivePair.sh.in
@@ -8,7 +8,9 @@ echo "Running ./testActivePair -n 6 -r 1 -d 0 -f active"
 
 ./testActivePair -n 6 -r 1 -d 0 -f active
 
-python @srcdir@/difffloat.py brute_force_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat
+python @srcdir@/difffloat.py brute_force_pair_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat
+
+python @srcdir@/difffloat.py brute_force_dopair2_active.dat swift_dopair2_force_active.dat @srcdir@/tolerance_pair_force_active.dat
 
 rm -f brute_force_pair_active.dat swift_dopair_active.dat
 
@@ -17,6 +19,8 @@ echo "Running ./testActivePair -n 6 -r 1 -d 0 -f active -s 1506434777"
 
 ./testActivePair -n 6 -r 1 -d 0 -f active -s 1506434777
 
-python @srcdir@/difffloat.py brute_force_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat
+python @srcdir@/difffloat.py brute_force_pair_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat
+
+python @srcdir@/difffloat.py brute_force_dopair2_active.dat swift_dopair2_force_active.dat @srcdir@/tolerance_pair_force_active.dat
 
 exit $?
diff --git a/tests/tolerance_27_perturbed_h.dat b/tests/tolerance_27_perturbed_h.dat
index e25c3373afe6f2a8e7788dfc2d9eba0b74efdbee..5d6710c3946c0515f94c58eb3e0ecbfe14135b71 100644
--- a/tests/tolerance_27_perturbed_h.dat
+++ b/tests/tolerance_27_perturbed_h.dat
@@ -1,4 +1,4 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         2.4e-6       1e-4	        5e-4       1.2e-2		         1.1e-5	       3e-6	         3e-6		       8e-6
-    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1.2e-6	      1.4e-2	      1e-5       2e-3		           2.5e-4	     3e-3	         3e-3	 	       3e-3
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         3e-6       1e-4	        5e-4       1.4e-2		         1.1e-5	       3e-6	         3e-6		       8e-6
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1.5e-6	      1.4e-2	      1e-5     2e-3		           2.5e-4	     3e-3	         3e-3	 	       3e-3
     0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1e-6	      1e-6	        1e-6       1e0		           1e-6	       4e-6	         4e-6		       4e-6
diff --git a/tests/tolerance_27_perturbed_h2.dat b/tests/tolerance_27_perturbed_h2.dat
index d83f36a887d92b55c90b21adae65ffe734a2f188..9ae7dfe979bce3d047c810be128521de52f8a8a9 100644
--- a/tests/tolerance_27_perturbed_h2.dat
+++ b/tests/tolerance_27_perturbed_h2.dat
@@ -1,4 +1,4 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
     0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         3e-6       1e-4	        5e-4       1.5e-2		         1.4e-5	       3e-6	         3e-6		       1e-5
-    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1.5e-6	    1.57e-2	      1e-5       5.86e-3		       4.96e-4	     3e-3	         3e-3	 	       3e-3
+    0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1.5e-6	    2.5e-2	      1e-5       5.86e-3		       4.96e-4	     3e-3	         3.7e-3	 	       3e-3
     0	      1e-6       1e-6	      1e-6 	       1e-6 	    1e-6	     1e-6	         1e-6	      1e-6	        1e-6       1e0		           1e-6	         4e-6	         4e-6		       4e-6
diff --git a/tests/tolerance_pair_active.dat b/tests/tolerance_pair_active.dat
index b07697a686eb7801326ceaf77cf93fb3a1491c2e..087c33fb8fa73c6125231b3f58cc9412a96c771b 100644
--- a/tests/tolerance_pair_active.dat
+++ b/tests/tolerance_pair_active.dat
@@ -1,4 +1,4 @@
-#   ID     wcount
-    0	     1e-2
-    0	     1e-2
-    0	     1e-2
+#   ID     wcount   h_dt
+    0	     1e-2      1
+    0	     1e-2      1
+    0	     1e-2      1
diff --git a/tests/tolerance_pair_force_active.dat b/tests/tolerance_pair_force_active.dat
new file mode 100644
index 0000000000000000000000000000000000000000..c5b5620338b848f1b8f4658049e53a6424e93a60
--- /dev/null
+++ b/tests/tolerance_pair_force_active.dat
@@ -0,0 +1,4 @@
+#   ID  wcount   h_dt
+    0	   1      2.4e-3
+    0	   1      2.4e-3
+    0	   1      1e-5