diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index 4cc0ac273941664be6aa43c92c1d7c30e7d3db30..0104868879aaffd3e9bd6549a82199fadfd9f755 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -275,56 +275,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
   pi->density.rot_v[2] += fac * curlvr[2];
 }
 
-__attribute__((always_inline)) INLINE static void
-runner_iact_nonsym_density_jsw(
-    const float r2, const float hig2, const float dx, const float dy,
-    const float dz, const float h_inv, const float hj, const float vi_x,
-    const float vi_y, const float vi_z, const float vj_x, const float vj_y,
-    const float vj_z, const float mj, float *const restrict rho,
-    float *const restrict rho_dh, float *const restrict wcount,
-    float *const restrict wcount_dh, float *const restrict div_v,
-    float *const restrict curl_vx, float *const restrict curl_vy,
-    float *const restrict curl_vz) {
-
-  if (r2 < hig2) {
-
-    float wi, wi_dx;
-
-    /* Get r and r inverse. */
-    const float r = sqrtf(r2);
-    const float ri = 1.0f / r;
-
-    /* Compute kernel function */
-    const float u = r * h_inv;
-    kernel_deval(u, &wi, &wi_dx);
-
-    const float fac = mj * wi_dx * ri;
-
-    /* Compute dv dot r */
-    const float dv_x = vi_x - vj_x;
-    const float dv_y = vi_y - vj_y;
-    const float dv_z = vi_z - vj_z;
-    const float dvdr = dv_x * dx + dv_y * dy + dv_z * dz;
-    *div_v -= fac * dvdr;
-
-    /* Compute dv cross r */
-    const float curlvr_x = dv_y * dz - dv_z * dy;
-    const float curlvr_y = dv_z * dx - dv_x * dz;
-    const float curlvr_z = dv_x * dy - dv_y * dx;
-
-    /* Compute contribution to the density */
-    *rho += mj * wi;
-    *rho_dh -= mj * (3.0f * wi + u * wi_dx);
-
-    /* Compute contribution to the number of neighbours */
-    *wcount += wi;
-    *wcount_dh -= u * wi_dx;
-    *curl_vx += fac * curlvr_x;
-    *curl_vy += fac * curlvr_y;
-    *curl_vz += fac * curlvr_z;
-  }
-}
-
 /**
  * @brief Density loop (non-symmetric vectorized version)
  */
@@ -425,8 +375,13 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
 }
 
 #ifdef WITH_VECTORIZATION
+
+/**
+ * @brief Density interaction computed using 1 vector
+ * (non-symmetric vectorized version).
+ */
 __attribute__((always_inline)) INLINE static void
-runner_iact_nonsym_intrinsic_vec_density(
+runner_iact_nonsym_1_vec_density(
     vector *r2, vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix,
     vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
     vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
@@ -518,220 +473,6 @@ runner_iact_nonsym_intrinsic_vec_density(
 #endif
 }
 
-__attribute__((always_inline)) INLINE static void
-runner_iact_nonsym_intrinsic_vec_2_density(
-    const struct cache *const cj_cache, const int *const indices, vector *r2,
-    vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix, vector viy,
-    vector viz, vector *rhoSum, vector *rho_dhSum, vector *wcountSum,
-    vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,
-    vector *curlvySum, vector *curlvzSum, vector mask, int knlMask) {
-
-  // vector r, ri, r2, xi, wi, wi_dx;
-  vector r, ri, xi, wi, wi_dx;
-  vector mj;
-  // vector dx, dy, dz, dvx, dvy, dvz;
-  vector dvx, dvy, dvz;
-  vector vjx, vjy, vjz;
-  vector dvdr;
-  vector curlvrx, curlvry, curlvrz;
-
-  /* Fill the vectors. */
-  mj.v = vec_set(cj_cache->m[indices[0]], cj_cache->m[indices[1]],
-                 cj_cache->m[indices[2]], cj_cache->m[indices[3]],
-                 cj_cache->m[indices[4]], cj_cache->m[indices[5]],
-                 cj_cache->m[indices[6]], cj_cache->m[indices[7]]);
-  vjx.v = vec_set(cj_cache->vx[indices[0]], cj_cache->vx[indices[1]],
-                  cj_cache->vx[indices[2]], cj_cache->vx[indices[3]],
-                  cj_cache->vx[indices[4]], cj_cache->vx[indices[5]],
-                  cj_cache->vx[indices[6]], cj_cache->vx[indices[7]]);
-  vjy.v = vec_set(cj_cache->vy[indices[0]], cj_cache->vy[indices[1]],
-                  cj_cache->vy[indices[2]], cj_cache->vy[indices[3]],
-                  cj_cache->vy[indices[4]], cj_cache->vy[indices[5]],
-                  cj_cache->vy[indices[6]], cj_cache->vy[indices[7]]);
-  vjz.v = vec_set(cj_cache->vz[indices[0]], cj_cache->vz[indices[1]],
-                  cj_cache->vz[indices[2]], cj_cache->vz[indices[3]],
-                  cj_cache->vz[indices[4]], cj_cache->vz[indices[5]],
-                  cj_cache->vz[indices[6]], cj_cache->vz[indices[7]]);
-  // dx.v = vec_load(Dx);
-  // dy.v = vec_load(Dy);
-  // dz.v = vec_load(Dz);
-
-  /* Get the radius and inverse radius. */
-  // r2.v = vec_load(R2);
-  ri = vec_reciprocal_sqrt(*r2);
-  r.v = vec_mul(r2->v, ri.v);
-
-  xi.v = vec_mul(r.v, hi_inv.v);
-
-  /* Calculate the kernel for two particles. */
-  kernel_deval_1_vec(&xi, &wi, &wi_dx);
-
-  /* Compute dv. */
-  dvx.v = vec_sub(vix.v, vjx.v);
-  dvy.v = vec_sub(viy.v, vjy.v);
-  dvz.v = vec_sub(viz.v, vjz.v);
-
-  /* Compute dv dot r */
-  dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v)));
-  dvdr.v = vec_mul(dvdr.v, ri.v);
-
-  /* Compute dv cross r */
-  curlvrx.v =
-      vec_fma(dvy.v, dz->v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy->v)));
-  curlvry.v =
-      vec_fma(dvz.v, dx->v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz->v)));
-  curlvrz.v =
-      vec_fma(dvx.v, dy->v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx->v)));
-  curlvrx.v = vec_mul(curlvrx.v, ri.v);
-  curlvry.v = vec_mul(curlvry.v, ri.v);
-  curlvrz.v = vec_mul(curlvrz.v, ri.v);
-
-/* Mask updates to intermediate vector sums for particle pi. */
-#ifdef HAVE_AVX512_F
-  rhoSum->v =
-      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
-
-  rho_dhSum->v =
-      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
-                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                               vec_mul(xi.v, wi_dx.v))));
-
-  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
-
-  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
-                                       wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));
-
-  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
-                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
-
-  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
-                                    curlvxSum->v);
-
-  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
-                                    curlvySum->v);
-
-  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
-                                    curlvzSum->v);
-#else
-  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
-  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                                vec_mul(xi.v, wi_dx.v))),
-                          mask.v);
-  wcountSum->v += vec_and(wi.v, mask.v);
-  wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
-  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
-  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
-  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
-  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
-#endif
-}
-
-/**
- * @brief Density interaction computed using 2 interleaved vectors
- * (non-symmetric vectorized version).
- */
-__attribute__((always_inline)) INLINE static void
-runner_iact_nonsym_1_vec_density(
-    float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
-    vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
-    vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
-    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
-    vector mask, vector mask2, int knlMask, int knlMask2) {
-
-  vector r, ri, r2, xi, wi, wi_dx;
-  vector mj;
-  vector dx, dy, dz, dvx, dvy, dvz;
-  vector vjx, vjy, vjz;
-  vector dvdr;
-  vector curlvrx, curlvry, curlvrz;
-
-  /* Fill the vectors. */
-  mj.v = vec_load(Mj);
-  vjx.v = vec_load(Vjx);
-  vjy.v = vec_load(Vjy);
-  vjz.v = vec_load(Vjz);
-  dx.v = vec_load(Dx);
-  dy.v = vec_load(Dy);
-  dz.v = vec_load(Dz);
-
-  /* Get the radius and inverse radius. */
-  r2.v = vec_load(R2);
-  ri = vec_reciprocal_sqrt(r2);
-  r.v = vec_mul(r2.v, ri.v);
-
-  xi.v = vec_mul(r.v, hi_inv.v);
-
-  /* Calculate the kernel for two particles. */
-  kernel_deval_1_vec(&xi, &wi, &wi_dx);
-
-  /* Compute dv. */
-  dvx.v = vec_sub(vix.v, vjx.v);
-  dvy.v = vec_sub(viy.v, vjy.v);
-  dvz.v = vec_sub(viz.v, vjz.v);
-
-  /* Compute dv dot r */
-  dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v)));
-  dvdr.v = vec_mul(dvdr.v, ri.v);
-
-  /* Compute dv cross r */
-  curlvrx.v =
-      vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy.v)));
-  curlvry.v =
-      vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v)));
-  curlvrz.v =
-      vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v)));
-  curlvrx.v = vec_mul(curlvrx.v, ri.v);
-  curlvry.v = vec_mul(curlvry.v, ri.v);
-  curlvrz.v = vec_mul(curlvrz.v, ri.v);
-
-/* Mask updates to intermediate vector sums for particle pi. */
-#ifdef HAVE_AVX512_F
-  rhoSum->v =
-      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
-
-  rho_dhSum->v =
-      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
-                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                               vec_mul(xi.v, wi_dx.v))));
-
-  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
-
-  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
-                                       wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));
-
-  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
-                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
-
-  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
-                                    curlvxSum->v);
-
-  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
-                                    curlvySum->v);
-
-  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
-                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
-                                    curlvzSum->v);
-#else
-  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
-  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
-                                                vec_mul(xi.v, wi_dx.v))),
-                          mask.v);
-  wcountSum->v += vec_and(wi.v, mask.v);
-  wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
-  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
-  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
-  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
-  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
-#endif
-}
-#endif
-
-#ifdef WITH_VECTORIZATION
 /**
  * @brief Density interaction computed using 2 interleaved vectors
  * (non-symmetric vectorized version).
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index 9f9a9c49a2a18c4905713b5472fa3c6d423c9259..33e6fc06b4537a72bf361fc2e4599e5f3dce9657 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -260,52 +260,6 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
   }
 }
 
-/* @brief Populates the arrays max_di and max_dj with the maximum distances of
- * particles into their neighbouring cells.
- * @param ci #cell pointer to ci
- * @param cj #cell pointer to cj
- * @param sort_i #entry array for particle distance in ci
- * @param sort_j #entry array for particle distance in cj
- * @param ci_cache #cache for cell ci
- * @param cj_cache #cache for cell cj
- * @param dx_max maximum particle movement allowed in cell
- * @param rshift cutoff shift
- * @param max_di array to hold the maximum distances of pi particles into cell
- * cj
- * @param max_dj array to hold the maximum distances of pj particles into cell
- * cj
- */
-__attribute__((always_inline)) INLINE static void populate_max_d(
-    const struct cell *ci, const struct cell *cj,
-    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
-    const struct cache *ci_cache, const struct cache *cj_cache,
-    const float dx_max, const float rshift, float *max_di, float *max_dj) {
-
-  float h = ci_cache->h[0];
-  float d;
-
-  /* For particles in ci */
-  max_di[0] = sort_i[0].d + h * kernel_gamma + dx_max - rshift;
-
-  for (int k = 1; k < ci->count; k++) {
-    h = ci_cache->h[k];
-    d = sort_i[k].d + h * kernel_gamma + dx_max - rshift;
-
-    max_di[k] = fmaxf(max_di[k - 1], d);
-  }
-
-  /* For particles in cj */
-  h = cj_cache->h[0];
-  max_dj[0] = sort_j[0].d - h * kernel_gamma - dx_max - rshift;
-
-  for (int k = 1; k < cj->count; k++) {
-    h = cj_cache->h[k];
-    d = sort_j[k].d - h * kernel_gamma - dx_max - rshift;
-
-    max_dj[k] = fmaxf(max_dj[k - 1], d);
-  }
-}
-
 /* @brief Populates the arrays max_di and max_dj with the maximum distances of
  * particles into their neighbouring cells. Also finds the first pi that
  * interacts with any particle in cj and the last pj that interacts with any
@@ -1224,7 +1178,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
       /* If there are any interactions perform them. */
       if (doi_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
+        runner_iact_nonsym_1_vec_density(
             &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz,
             &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
             &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum,
@@ -1353,7 +1307,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
       /* If there are any interactions perform them. */
       if (doj_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
+        runner_iact_nonsym_1_vec_density(
             &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz,
             &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx],
             &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum,
@@ -1384,2439 +1338,3 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
 #endif /* WITH_VECTORIZATION */
 }
-
-FILE *faceIntFile;
-FILE *edgeIntFile;
-FILE *cornerIntFile;
-
-/** C2_CACHE VERSION
- * @brief Compute the interactions between a cell pair (non-symmetric).
- *
- * @param r The #runner.
- * @param ci The first #cell.
- * @param cj The second #cell.
- */
-void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci,
-                                  struct cell *cj) {
-
-#ifdef WITH_VECTORIZATION
-  const struct engine *restrict e = r->e;
-
-  static int faceIntCount = 0;
-  static int faceCtr = 0;
-  static int edgeIntCount = 0;
-  static int edgeCtr = 0;
-  static int cornerIntCount = 0;
-  static int cornerCtr = 0;
-  static int numFaceTested = 0;
-  static int numEdgeTested = 0;
-  static int numCornerTested = 0;
-  int icount = 0, icount_align = 0;
-  struct c2_cache int_cache;
-  int num_vec_proc = 1;
-
-  vector v_hi, v_vix, v_viy, v_viz, v_hig2;
-
-  TIMER_TIC;
-
-  if (faceCtr + edgeCtr + cornerCtr == 0) {
-    faceIntFile = fopen("particle_interactions_face.dat", "w");
-    edgeIntFile = fopen("particle_interactions_edge.dat", "w");
-    cornerIntFile = fopen("particle_interactions_corner.dat", "w");
-  }
-
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
-
-#ifdef SWIFT_DEBUG_CHECKS
-  cell_is_drifted(ci, e);
-  cell_is_drifted(cj, e);
-#endif
-
-  /* Get the sort ID. */
-  double shift[3] = {0.0, 0.0, 0.0};
-  const int sid = space_getsid(e->s, &ci, &cj, shift);
-
-  int face = (sid == 4 || sid == 10 || sid == 12);
-  int edge =
-      (sid == 1 || sid == 3 || sid == 5 || sid == 7 || sid == 9 || sid == 11);
-  int corner = (sid == 0 || sid == 2 || sid == 6 || sid == 8);
-
-  /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
-    error("Trying to interact unsorted cells.");
-
-  /* Get the cutoff shift. */
-  double rshift = 0.0;
-  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
-
-  /* Pick-out the sorted lists. */
-  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
-  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
-
-  /* Get some other useful values. */
-  const int count_i = ci->count;
-  const int count_j = cj->count;
-  struct part *restrict parts_i = ci->parts;
-  struct part *restrict parts_j = cj->parts;
-  const double di_max = sort_i[count_i - 1].d - rshift;
-  const double dj_min = sort_j[0].d;
-  const float dx_max = (ci->dx_max + cj->dx_max);
-
-  /* Get the particle cache from the runner and re-allocate
-   * the cache if it is not big enough for the cell. */
-  struct cache *restrict ci_cache = &r->ci_cache;
-  struct cache *restrict cj_cache = &r->cj_cache;
-
-  if (ci_cache->count < count_i) {
-    cache_init(ci_cache, count_i);
-  }
-  if (cj_cache->count < count_j) {
-    cache_init(cj_cache, count_j);
-  }
-
-  cache_read_two_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i, sort_j,
-                              shift);
-
-  float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-  float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-
-  max_di = r->ci_cache.max_d;
-  max_dj = r->cj_cache.max_d;
-
-  /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
-  /* For particles in ci */
-  populate_max_d(ci, cj, sort_i, sort_j, ci_cache, cj_cache, dx_max, rshift,
-                 max_di, max_dj);
-
-  float di, dj;
-
-  int max_ind_j = count_j - 1;
-
-  /* Loop over the parts in ci. */
-  for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) {
-
-    /* Get a hold of the ith part in ci. */
-    struct part *restrict pi = &parts_i[sort_i[pid].i];
-    if (!part_is_active(pi, e)) continue;
-
-    dj = sort_j[max_ind_j].d;
-    while (max_ind_j > 0 && max_di[pid] < dj) {
-      max_ind_j--;
-
-      dj = sort_j[max_ind_j].d;
-    }
-    int exit_iteration = max_ind_j + 1;
-
-    int ci_cache_idx = pid;  // sort_i[pid].i;
-
-    const float hi = ci_cache->h[ci_cache_idx];
-    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
-    if (di < dj_min) continue;
-
-    const float hig2 = hi * hi * kernel_gamma2;
-
-    vector pix, piy, piz;
-
-    /* Fill particle pi vectors. */
-    pix.v = vec_set1(ci_cache->x[ci_cache_idx]);
-    piy.v = vec_set1(ci_cache->y[ci_cache_idx]);
-    piz.v = vec_set1(ci_cache->z[ci_cache_idx]);
-    v_hi.v = vec_set1(hi);
-    v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]);
-    v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]);
-    v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]);
-
-    v_hig2.v = vec_set1(hig2);
-
-    /* Reset cumulative sums of update vectors. */
-    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-        curlvySum, curlvzSum;
-
-    /* Get the inverse of hi. */
-    vector v_hi_inv;
-
-    v_hi_inv = vec_reciprocal(v_hi);
-
-    rhoSum.v = vec_setzero();
-    rho_dhSum.v = vec_setzero();
-    wcountSum.v = vec_setzero();
-    wcount_dhSum.v = vec_setzero();
-    div_vSum.v = vec_setzero();
-    curlvxSum.v = vec_setzero();
-    curlvySum.v = vec_setzero();
-    curlvzSum.v = vec_setzero();
-
-    /* Pad cache if there is a serial remainder. */
-    int exit_iteration_align = exit_iteration;
-    int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
-    if (rem != 0) {
-      int pad = (num_vec_proc * VEC_SIZE) - rem;
-
-      exit_iteration_align += pad;
-    }
-
-    vector pjx, pjy, pjz;
-    vector pjvx, pjvy, pjvz, mj;
-
-    /* Loop over the parts in cj. */
-    for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) {
-
-      /* Get the cache index to the jth particle. */
-      int cj_cache_idx = pjd;  // sort_j[pjd].i;
-
-      vector v_dx, v_dy, v_dz, v_r2;
-
-      /* Load 2 sets of vectors from the particle cache. */
-      pjx.v = vec_load(&cj_cache->x[cj_cache_idx]);
-      pjy.v = vec_load(&cj_cache->y[cj_cache_idx]);
-      pjz.v = vec_load(&cj_cache->z[cj_cache_idx]);
-      pjvx.v = vec_load(&cj_cache->vx[cj_cache_idx]);
-      pjvy.v = vec_load(&cj_cache->vy[cj_cache_idx]);
-      pjvz.v = vec_load(&cj_cache->vz[cj_cache_idx]);
-      mj.v = vec_load(&cj_cache->m[cj_cache_idx]);
-
-      /* Compute the pairwise distance. */
-      v_dx.v = vec_sub(pix.v, pjx.v);
-      v_dy.v = vec_sub(piy.v, pjy.v);
-      v_dz.v = vec_sub(piz.v, pjz.v);
-
-      v_r2.v = vec_mul(v_dx.v, v_dx.v);
-      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
-      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
-
-      vector v_doi_mask;
-      int doi_mask;
-
-      /* Form r2 < hig2 mask. */
-      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
-
-      /* Form integer mask. */
-      doi_mask = vec_cmp_result(v_doi_mask.v);
-
-      /* If there are any interactions left pack interaction values into c2
-       * cache. */
-      if (doi_mask)
-        storeInteractions(doi_mask, cj_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz,
-                          &mj, &pjvx, &pjvy, &pjvz, cj_cache, &int_cache,
-                          &icount, &rhoSum, &rho_dhSum, &wcountSum,
-                          &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
-                          &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
-
-    } /* loop over the parts in cj. */
-
-    /* Perform padded vector remainder interactions if any are present. */
-    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum,
-                        &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
-                        &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
-                        &icount_align);
-
-    /* Initialise masks to true in case remainder interactions have been
-     * performed. */
-    vector int_mask, int_mask2;
-#ifdef HAVE_AVX512_F
-    KNL_MASK_16 knl_mask = 0xFFFF;
-    KNL_MASK_16 knl_mask2 = 0xFFFF;
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#else
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#endif
-
-    /* Perform interaction with 2 vectors. */
-    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
-      runner_iact_nonsym_2_vec_density(
-          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
-          &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
-          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
-          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
-          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
-#ifdef HAVE_AVX512_F
-          knl_mask, knl_mask2);
-#else
-          0, 0);
-#endif
-    }
-
-    /* Perform horizontal adds on vector sums and store result in particle pi.
-     */
-    VEC_HADD(rhoSum, pi->rho);
-    VEC_HADD(rho_dhSum, pi->density.rho_dh);
-    VEC_HADD(wcountSum, pi->density.wcount);
-    VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
-    VEC_HADD(div_vSum, pi->density.div_v);
-    VEC_HADD(curlvxSum, pi->density.rot_v[0]);
-    VEC_HADD(curlvySum, pi->density.rot_v[1]);
-    VEC_HADD(curlvzSum, pi->density.rot_v[2]);
-
-    if (face) {
-      faceIntCount += icount;
-      fprintf(faceIntFile, "%d\n", icount);
-      numFaceTested++;
-    } else if (edge) {
-      edgeIntCount += icount;
-      fprintf(edgeIntFile, "%d\n", icount);
-      numEdgeTested++;
-    } else if (corner) {
-      cornerIntCount += icount;
-      fprintf(cornerIntFile, "%d\n", icount);
-      numCornerTested++;
-    }
-
-    icount = 0;
-
-  } /* loop over the parts in ci. */
-
-  if (face) {
-    faceCtr++;
-    message(
-        "Total number of face interactions: %d, average per particle: %f, "
-        "number tested: %d.",
-        faceIntCount, ((float)faceIntCount) / ((float)numFaceTested),
-        numFaceTested);
-  } else if (edge) {
-    edgeCtr++;
-    message(
-        "Total number of edge interactions: %d, average per particle: %f, "
-        "number tested: %d",
-        edgeIntCount, ((float)edgeIntCount) / ((float)numEdgeTested),
-        numEdgeTested);
-  } else if (corner) {
-    cornerCtr++;
-    message(
-        "Total number of corner interactions: %d, average per particle: %f, "
-        "number tested: %d",
-        cornerIntCount, ((float)cornerIntCount) / ((float)numCornerTested),
-        numCornerTested);
-  }
-
-  int max_ind_i = 0;
-  /* Loop over the parts in cj. */
-  for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) {
-
-    /* Get a hold of the jth part in cj. */
-    struct part *restrict pj = &parts_j[sort_j[pjd].i];
-    if (!part_is_active(pj, e)) continue;
-
-    di = sort_i[max_ind_i].d;
-    while (max_ind_i < count_i - 1 && max_dj[pjd] > di) {
-      max_ind_i++;
-
-      di = sort_i[max_ind_i].d;
-    }
-    int exit_iteration = max_ind_i - 1;
-
-    int cj_cache_idx = pjd;
-
-    const float hj = cj_cache->h[cj_cache_idx];
-    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
-    if (dj > di_max) continue;
-
-    const float hjg2 = hj * hj * kernel_gamma2;
-
-    vector pjx, pjy, pjz;
-    vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2;
-
-    /* Fill particle pi vectors. */
-    pjx.v = vec_set1(cj_cache->x[cj_cache_idx]);
-    pjy.v = vec_set1(cj_cache->y[cj_cache_idx]);
-    pjz.v = vec_set1(cj_cache->z[cj_cache_idx]);
-    v_hj.v = vec_set1(hj);
-    v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]);
-    v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]);
-    v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]);
-
-    v_hjg2.v = vec_set1(hjg2);
-
-    /* Reset cumulative sums of update vectors. */
-    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-        curlvySum, curlvzSum;
-
-    /* Get the inverse of hj. */
-    vector v_hj_inv;
-
-    v_hj_inv = vec_reciprocal(v_hj);
-
-    rhoSum.v = vec_setzero();
-    rho_dhSum.v = vec_setzero();
-    wcountSum.v = vec_setzero();
-    wcount_dhSum.v = vec_setzero();
-    div_vSum.v = vec_setzero();
-    curlvxSum.v = vec_setzero();
-    curlvySum.v = vec_setzero();
-    curlvzSum.v = vec_setzero();
-
-    /* Pad cache if there is a serial remainder. */
-    int exit_iteration_align = exit_iteration;
-    int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
-    if (rem != 0) {
-      int pad = (num_vec_proc * VEC_SIZE) - rem;
-
-      exit_iteration_align -= pad;
-    }
-
-    vector pix, piy, piz;
-    vector pivx, pivy, pivz, mi;
-
-    /* Loop over the parts in ci. */
-    // for (int pid = count_i - 1; pid >= 0; pid -= VEC_SIZE) {
-    for (int pid = exit_iteration_align; pid < count_i;
-         pid += (num_vec_proc * VEC_SIZE)) {
-
-      /* Get the cache index to the ith particle. */
-      int ci_cache_idx = pid;  // sort_i[pid].i;
-
-      vector v_dx, v_dy, v_dz, v_r2;
-
-      /* Load 2 sets of vectors from the particle cache. */
-      pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
-      piy.v = vec_load(&ci_cache->y[ci_cache_idx]);
-      piz.v = vec_load(&ci_cache->z[ci_cache_idx]);
-      pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]);
-      pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]);
-      pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]);
-      mi.v = vec_load(&ci_cache->m[ci_cache_idx]);
-
-      /* Compute the pairwise distance. */
-      v_dx.v = vec_sub(pjx.v, pix.v);
-      v_dy.v = vec_sub(pjy.v, piy.v);
-      v_dz.v = vec_sub(pjz.v, piz.v);
-
-      v_r2.v = vec_mul(v_dx.v, v_dx.v);
-      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
-      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
-
-      vector v_doj_mask;
-      int doj_mask;
-
-      /* Form r2 < hig2 mask. */
-      v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v);
-
-      /* Form integer mask. */
-      doj_mask = vec_cmp_result(v_doj_mask.v);
-
-      /* If there are any interactions left pack interaction values into c2
-       * cache. */
-      if (doj_mask)
-        storeInteractions(doj_mask, ci_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz,
-                          &mi, &pivx, &pivy, &pivz, ci_cache, &int_cache,
-                          &icount, &rhoSum, &rho_dhSum, &wcountSum,
-                          &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
-                          &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz);
-
-    } /* loop over the parts in cj. */
-
-    /* Perform padded vector remainder interactions if any are present. */
-    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum,
-                        &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
-                        &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz,
-                        &icount_align);
-
-    /* Initialise masks to true in case remainder interactions have been
-     * performed. */
-    vector int_mask, int_mask2;
-#ifdef HAVE_AVX512_F
-    KNL_MASK_16 knl_mask = 0xFFFF;
-    KNL_MASK_16 knl_mask2 = 0xFFFF;
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#else
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#endif
-
-    /* Perform interaction with 2 vectors. */
-    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
-      runner_iact_nonsym_2_vec_density(
-          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
-          &int_cache.dzq[pjd], v_hj_inv, v_vjx, v_vjy, v_vjz,
-          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
-          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
-          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
-#ifdef HAVE_AVX512_F
-          knl_mask, knl_mask2);
-#else
-          0, 0);
-#endif
-    }
-
-    /* Perform horizontal adds on vector sums and store result in particle pi.
-     */
-    VEC_HADD(rhoSum, pj->rho);
-    VEC_HADD(rho_dhSum, pj->density.rho_dh);
-    VEC_HADD(wcountSum, pj->density.wcount);
-    VEC_HADD(wcount_dhSum, pj->density.wcount_dh);
-    VEC_HADD(div_vSum, pj->density.div_v);
-    VEC_HADD(curlvxSum, pj->density.rot_v[0]);
-    VEC_HADD(curlvySum, pj->density.rot_v[1]);
-    VEC_HADD(curlvzSum, pj->density.rot_v[2]);
-
-    icount = 0;
-
-  } /* loop over the parts in ci. */
-
-  if (face) {
-    faceCtr++;
-    message(
-        "Total number of face interactions: %d, average per particle: %f, "
-        "number tested: %d.",
-        faceIntCount, ((float)faceIntCount) / ((float)numFaceTested),
-        numFaceTested);
-  } else if (edge) {
-    edgeCtr++;
-    message(
-        "Total number of edge interactions: %d, average per particle: %f, "
-        "number tested: %d",
-        edgeIntCount, ((float)edgeIntCount) / ((float)numEdgeTested),
-        numEdgeTested);
-  } else if (corner) {
-    cornerCtr++;
-    message(
-        "Total number of corner interactions: %d, average per particle: %f, "
-        "number tested: %d",
-        cornerIntCount, ((float)cornerIntCount) / ((float)numCornerTested),
-        numCornerTested);
-  }
-  TIMER_TOC(timer_dopair_density);
-
-#endif /* WITH_VECTORIZATION */
-}
-
-/* Similar to AUTO-VEC but process 2 pi at a time and use two vectors in
- * interaction loop. */
-void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci,
-                                  struct cell *cj) {
-
-#ifdef WITH_VECTORIZATION
-  const struct engine *restrict e = r->e;
-
-  const int num_vec_proc = 2;
-
-  vector v_hi, v_vix, v_viy, v_viz, v_hig2;
-  vector v_hi_2, v_vix2, v_viy2, v_viz2, v_hig2_2;
-
-  TIMER_TIC;
-
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
-
-#ifdef SWIFT_DEBUG_CHECKS
-  cell_is_drifted(ci, e);
-  cell_is_drifted(cj, e);
-#endif
-
-  /* Get the sort ID. */
-  double shift[3] = {0.0, 0.0, 0.0};
-  const int sid = space_getsid(e->s, &ci, &cj, shift);
-
-  /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
-    error("Trying to interact unsorted cells.");
-
-  /* Get the cutoff shift. */
-  double rshift = 0.0;
-  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
-
-  /* Pick-out the sorted lists. */
-  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
-  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
-
-  /* Get some other useful values. */
-  const int count_i = ci->count;
-  const int count_j = cj->count;
-  struct part *restrict parts_i = ci->parts;
-  struct part *restrict parts_j = cj->parts;
-  const double di_max = sort_i[count_i - 1].d - rshift;
-  const double dj_min = sort_j[0].d;
-  const float dx_max = (ci->dx_max + cj->dx_max);
-
-  /* Get the particle cache from the runner and re-allocate
-   * the cache if it is not big enough for the cell. */
-  struct cache *restrict ci_cache = &r->ci_cache;
-  struct cache *restrict cj_cache = &r->cj_cache;
-
-  if (ci_cache->count < count_i) {
-    cache_init(ci_cache, count_i);
-  }
-  if (cj_cache->count < count_j) {
-    cache_init(cj_cache, count_j);
-  }
-
-  float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-  float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-
-  max_di = r->ci_cache.max_d;
-  max_dj = r->cj_cache.max_d;
-
-  int first_pi, last_pj;
-  /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
-  /* For particles in ci */
-  populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di,
-                          max_dj, &first_pi, &last_pj);
-
-  float di, dj;
-
-  int max_ind_j = count_j - 1;
-  int max_ind_i = 0;
-
-  dj = sort_j[max_ind_j].d;
-  while (max_ind_j > 0 && max_di[count_i - 1] < dj) {
-    max_ind_j--;
-
-    dj = sort_j[max_ind_j].d;
-  }
-
-  di = sort_i[max_ind_i].d;
-  while (max_ind_i < count_i - 1 && max_dj[0] > di) {
-    max_ind_i++;
-
-    di = sort_i[max_ind_i].d;
-  }
-
-  last_pj = max(last_pj, max_ind_j);
-  first_pi = min(first_pi, max_ind_i);
-
-  cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i,
-                                      sort_j, shift, &first_pi, &last_pj,
-                                      num_vec_proc);
-
-  /* Loop over the parts in ci. */
-  for (int pid = count_i - 1; pid >= first_pi && max_ind_j >= 0; pid -= 2) {
-
-    /* Get a hold of the ith part in ci. */
-    struct part *restrict pi = &parts_i[sort_i[pid].i];
-    struct part *restrict pi2 = &parts_i[sort_i[pid - 1].i];
-    if (!part_is_active(pi, e)) continue;
-
-    dj = sort_j[max_ind_j].d;
-    while (max_ind_j > 0 && max_di[pid] < dj) {
-      max_ind_j--;
-
-      dj = sort_j[max_ind_j].d;
-    }
-    int exit_iteration = max_ind_j;
-
-    int ci_cache_idx = pid;  // sort_i[pid].i;
-
-    const float hi = ci_cache->h[ci_cache_idx];
-    const float hi_2 = ci_cache->h[ci_cache_idx - 1];
-    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
-    if (di < dj_min) continue;
-
-    const float hig2 = hi * hi * kernel_gamma2;
-    const float hig2_2 = hi_2 * hi_2 * kernel_gamma2;
-
-    vector pix, piy, piz;
-    vector pix2, piy2, piz2;
-
-    /* Fill particle pi vectors. */
-    pix.v = vec_set1(ci_cache->x[ci_cache_idx]);
-    piy.v = vec_set1(ci_cache->y[ci_cache_idx]);
-    piz.v = vec_set1(ci_cache->z[ci_cache_idx]);
-    v_hi.v = vec_set1(hi);
-    v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]);
-    v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]);
-    v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]);
-
-    v_hig2.v = vec_set1(hig2);
-
-    pix2.v = vec_set1(ci_cache->x[ci_cache_idx - 1]);
-    piy2.v = vec_set1(ci_cache->y[ci_cache_idx - 1]);
-    piz2.v = vec_set1(ci_cache->z[ci_cache_idx - 1]);
-    v_hi_2.v = vec_set1(hi_2);
-    v_vix2.v = vec_set1(ci_cache->vx[ci_cache_idx - 1]);
-    v_viy2.v = vec_set1(ci_cache->vy[ci_cache_idx - 1]);
-    v_viz2.v = vec_set1(ci_cache->vz[ci_cache_idx - 1]);
-
-    v_hig2_2.v = vec_set1(hig2_2);
-
-    /* Reset cumulative sums of update vectors. */
-    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-        curlvySum, curlvzSum;
-
-    vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2,
-        curlvxSum2, curlvySum2, curlvzSum2;
-
-    /* Get the inverse of hi. */
-    vector v_hi_inv;
-    vector v_hi_inv_2;
-
-    v_hi_inv = vec_reciprocal(v_hi);
-    v_hi_inv_2 = vec_reciprocal(v_hi_2);
-
-    rhoSum.v = vec_setzero();
-    rho_dhSum.v = vec_setzero();
-    wcountSum.v = vec_setzero();
-    wcount_dhSum.v = vec_setzero();
-    div_vSum.v = vec_setzero();
-    curlvxSum.v = vec_setzero();
-    curlvySum.v = vec_setzero();
-    curlvzSum.v = vec_setzero();
-
-    rhoSum2.v = vec_setzero();
-    rho_dhSum2.v = vec_setzero();
-    wcountSum2.v = vec_setzero();
-    wcount_dhSum2.v = vec_setzero();
-    div_vSum2.v = vec_setzero();
-    curlvxSum2.v = vec_setzero();
-    curlvySum2.v = vec_setzero();
-    curlvzSum2.v = vec_setzero();
-
-    /* Pad cache if there is a serial remainder. */
-    int exit_iteration_align = exit_iteration;
-    int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
-    if (rem != 0) {
-      int pad = (num_vec_proc * VEC_SIZE) - rem;
-
-      exit_iteration_align += pad;
-    }
-
-    vector pjx, pjy, pjz;
-    vector pjx2, pjy2, pjz2;
-
-    /* Loop over the parts in cj. */
-    for (int pjd = 0; pjd < exit_iteration_align;
-         pjd += (num_vec_proc * VEC_SIZE)) {
-
-      /* Get the cache index to the jth particle. */
-      int cj_cache_idx = pjd;  // sort_j[pjd].i;
-
-      vector v_dx, v_dy, v_dz, v_r2;
-      vector v_dx2, v_dy2, v_dz2, v_r2_2;
-      vector v2_dx, v2_dy, v2_dz, v2_r2;
-      vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2;
-
-      /* Load 2 sets of vectors from the particle cache. */
-      pjx.v = vec_load(&cj_cache->x[cj_cache_idx]);
-      pjx2.v = vec_load(&cj_cache->x[cj_cache_idx + VEC_SIZE]);
-      pjy.v = vec_load(&cj_cache->y[cj_cache_idx]);
-      pjy2.v = vec_load(&cj_cache->y[cj_cache_idx + VEC_SIZE]);
-      pjz.v = vec_load(&cj_cache->z[cj_cache_idx]);
-      pjz2.v = vec_load(&cj_cache->z[cj_cache_idx + VEC_SIZE]);
-      // pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]);
-      // pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]);
-      // pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]);
-      // mj.v = vec_load(&cj_cache.m[cj_cache_idx]);
-
-      /* Compute the pairwise distance. */
-      v_dx.v = vec_sub(pix.v, pjx.v);
-      v_dx2.v = vec_sub(pix.v, pjx2.v);
-      v2_dx.v = vec_sub(pix2.v, pjx.v);
-      v2_dx2.v = vec_sub(pix2.v, pjx2.v);
-
-      v_dy.v = vec_sub(piy.v, pjy.v);
-      v_dy2.v = vec_sub(piy.v, pjy2.v);
-      v2_dy.v = vec_sub(piy2.v, pjy.v);
-      v2_dy2.v = vec_sub(piy2.v, pjy2.v);
-
-      v_dz.v = vec_sub(piz.v, pjz.v);
-      v_dz2.v = vec_sub(piz.v, pjz2.v);
-      v2_dz.v = vec_sub(piz2.v, pjz.v);
-      v2_dz2.v = vec_sub(piz2.v, pjz2.v);
-
-      v_r2.v = vec_mul(v_dx.v, v_dx.v);
-      v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v);
-      v2_r2.v = vec_mul(v2_dx.v, v2_dx.v);
-      v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v);
-
-      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v);
-      v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v);
-      v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v);
-
-      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v);
-      v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v);
-      v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v);
-
-      vector v_doi_mask, v_doi_mask2;
-      int doi_mask, doi_mask2;
-
-      vector v2_doi_mask, v2_doi_mask2;
-      int doi2_mask, doi2_mask2;
-
-      /* Form r2 < hig2 mask. */
-      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
-      v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
-      v2_doi_mask.v = vec_cmp_lt(v2_r2.v, v_hig2_2.v);
-      v2_doi_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v);
-
-      /* Form integer mask. */
-      doi_mask = vec_cmp_result(v_doi_mask.v);
-      doi_mask2 = vec_cmp_result(v_doi_mask2.v);
-      doi2_mask = vec_cmp_result(v2_doi_mask.v);
-      doi2_mask2 = vec_cmp_result(v2_doi_mask2.v);
-
-      if (doi_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz,
-            &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
-            &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum,
-            &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-            &curlvySum, &curlvzSum, v_doi_mask,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doi_mask2)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v_r2_2, &v_dx2, &v_dy2, &v_dz2, v_hi_inv, v_vix, v_viy, v_viz,
-            &cj_cache->vx[cj_cache_idx + VEC_SIZE],
-            &cj_cache->vy[cj_cache_idx + VEC_SIZE],
-            &cj_cache->vz[cj_cache_idx + VEC_SIZE],
-            &cj_cache->m[cj_cache_idx + VEC_SIZE], &rhoSum, &rho_dhSum,
-            &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
-            &curlvzSum, v_doi_mask2,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doi2_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hi_inv_2, v_vix2, v_viy2, v_viz2,
-            &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
-            &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum2,
-            &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2,
-            &curlvySum2, &curlvzSum2, v2_doi_mask,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doi2_mask2)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hi_inv_2, v_vix2, v_viy2,
-            v_viz2, &cj_cache->vx[cj_cache_idx + VEC_SIZE],
-            &cj_cache->vy[cj_cache_idx + VEC_SIZE],
-            &cj_cache->vz[cj_cache_idx + VEC_SIZE],
-            &cj_cache->m[cj_cache_idx + VEC_SIZE], &rhoSum2, &rho_dhSum2,
-            &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, &curlvySum2,
-            &curlvzSum2, v2_doi_mask2,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-
-    } /* loop over the parts in cj. */
-
-    /* Perform horizontal adds on vector sums and store result in particle pi.
-     */
-    VEC_HADD(rhoSum, pi->rho);
-    VEC_HADD(rho_dhSum, pi->density.rho_dh);
-    VEC_HADD(wcountSum, pi->density.wcount);
-    VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
-    VEC_HADD(div_vSum, pi->density.div_v);
-    VEC_HADD(curlvxSum, pi->density.rot_v[0]);
-    VEC_HADD(curlvySum, pi->density.rot_v[1]);
-    VEC_HADD(curlvzSum, pi->density.rot_v[2]);
-
-    VEC_HADD(rhoSum2, pi2->rho);
-    VEC_HADD(rho_dhSum2, pi2->density.rho_dh);
-    VEC_HADD(wcountSum2, pi2->density.wcount);
-    VEC_HADD(wcount_dhSum2, pi2->density.wcount_dh);
-    VEC_HADD(div_vSum2, pi2->density.div_v);
-    VEC_HADD(curlvxSum2, pi2->density.rot_v[0]);
-    VEC_HADD(curlvySum2, pi2->density.rot_v[1]);
-    VEC_HADD(curlvzSum2, pi2->density.rot_v[2]);
-
-  } /* loop over the parts in ci. */
-
-  /* Loop over the parts in cj. */
-  for (int pjd = 0; pjd <= last_pj && max_ind_i < count_i; pjd += 2) {
-
-    /* Get a hold of the jth part in cj. */
-    struct part *restrict pj = &parts_j[sort_j[pjd].i];
-    struct part *restrict pj2 = &parts_j[sort_j[pjd + 1].i];
-    if (!part_is_active(pj, e)) continue;
-
-    di = sort_i[max_ind_i].d;
-    while (max_ind_i < count_i - 1 && max_dj[pjd] > di) {
-      max_ind_i++;
-
-      di = sort_i[max_ind_i].d;
-    }
-    int exit_iteration = max_ind_i;
-
-    int cj_cache_idx = pjd;
-
-    const float hj = cj_cache->h[cj_cache_idx];
-    const float hj_2 = cj_cache->h[cj_cache_idx + 1];
-    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
-    if (dj > di_max) continue;
-
-    const float hjg2 = hj * hj * kernel_gamma2;
-    const float hjg2_2 = hj_2 * hj_2 * kernel_gamma2;
-
-    vector pjx, pjy, pjz;
-    vector pjx2, pjy2, pjz2;
-    vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2;
-    vector v_hj_2, v_vjx2, v_vjy2, v_vjz2, v_hjg2_2;
-
-    /* Fill particle pi vectors. */
-    pjx.v = vec_set1(cj_cache->x[cj_cache_idx]);
-    pjy.v = vec_set1(cj_cache->y[cj_cache_idx]);
-    pjz.v = vec_set1(cj_cache->z[cj_cache_idx]);
-    v_hj.v = vec_set1(hj);
-    v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]);
-    v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]);
-    v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]);
-
-    v_hjg2.v = vec_set1(hjg2);
-
-    pjx2.v = vec_set1(cj_cache->x[cj_cache_idx + 1]);
-    pjy2.v = vec_set1(cj_cache->y[cj_cache_idx + 1]);
-    pjz2.v = vec_set1(cj_cache->z[cj_cache_idx + 1]);
-    v_hj_2.v = vec_set1(hj_2);
-    v_vjx2.v = vec_set1(cj_cache->vx[cj_cache_idx + 1]);
-    v_vjy2.v = vec_set1(cj_cache->vy[cj_cache_idx + 1]);
-    v_vjz2.v = vec_set1(cj_cache->vz[cj_cache_idx + 1]);
-
-    v_hjg2_2.v = vec_set1(hjg2_2);
-
-    /* Reset cumulative sums of update vectors. */
-    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-        curlvySum, curlvzSum;
-
-    vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2,
-        curlvxSum2, curlvySum2, curlvzSum2;
-
-    /* Get the inverse of hj. */
-    vector v_hj_inv;
-    vector v_hj_inv_2;
-
-    v_hj_inv = vec_reciprocal(v_hj);
-    v_hj_inv_2 = vec_reciprocal(v_hj_2);
-
-    rhoSum.v = vec_setzero();
-    rho_dhSum.v = vec_setzero();
-    wcountSum.v = vec_setzero();
-    wcount_dhSum.v = vec_setzero();
-    div_vSum.v = vec_setzero();
-    curlvxSum.v = vec_setzero();
-    curlvySum.v = vec_setzero();
-    curlvzSum.v = vec_setzero();
-
-    rhoSum2.v = vec_setzero();
-    rho_dhSum2.v = vec_setzero();
-    wcountSum2.v = vec_setzero();
-    wcount_dhSum2.v = vec_setzero();
-    div_vSum2.v = vec_setzero();
-    curlvxSum2.v = vec_setzero();
-    curlvySum2.v = vec_setzero();
-    curlvzSum2.v = vec_setzero();
-
-    /* Pad cache if there is a serial remainder. */
-    int exit_iteration_align = exit_iteration;
-    int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
-    if (rem != 0) {
-      int pad = (num_vec_proc * VEC_SIZE) - rem;
-
-      exit_iteration_align -= pad;
-    }
-
-    vector pix, piy, piz;
-    vector pix2, piy2, piz2;
-    // vector pivx, pivy, pivz, mi;
-
-    /* Loop over the parts in ci. */
-    for (int pid = exit_iteration_align; pid < count_i;
-         pid += (num_vec_proc * VEC_SIZE)) {
-
-      /* Get the cache index to the ith particle. */
-      int ci_cache_idx = pid;  // sort_i[pid].i;
-      int ci2_cache_idx = pid + VEC_SIZE;
-
-      vector v_dx, v_dy, v_dz, v_r2;
-      vector v_dx2, v_dy2, v_dz2, v_r2_2;
-      vector v2_dx, v2_dy, v2_dz, v2_r2;
-      vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2;
-
-      /* Load 2 sets of vectors from the particle cache. */
-      pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
-      pix2.v = vec_load(&ci_cache->x[ci2_cache_idx]);
-      piy.v = vec_load(&ci_cache->y[ci_cache_idx]);
-      piy2.v = vec_load(&ci_cache->y[ci2_cache_idx]);
-      piz.v = vec_load(&ci_cache->z[ci_cache_idx]);
-      piz2.v = vec_load(&ci_cache->z[ci2_cache_idx]);
-      // pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]);
-      // pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]);
-      // pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]);
-      // mi.v = vec_load(&ci_cache->m[ci_cache_idx]);
-
-      /* Compute the pairwise distance. */
-      v_dx.v = vec_sub(pjx.v, pix.v);
-      v_dx2.v = vec_sub(pjx.v, pix2.v);
-      v2_dx.v = vec_sub(pjx2.v, pix.v);
-      v2_dx2.v = vec_sub(pjx2.v, pix2.v);
-
-      v_dy.v = vec_sub(pjy.v, piy.v);
-      v_dy2.v = vec_sub(pjy.v, piy2.v);
-      v2_dy.v = vec_sub(pjy2.v, piy.v);
-      v2_dy2.v = vec_sub(pjy2.v, piy2.v);
-
-      v_dz.v = vec_sub(pjz.v, piz.v);
-      v_dz2.v = vec_sub(pjz.v, piz2.v);
-      v2_dz.v = vec_sub(pjz2.v, piz.v);
-      v2_dz2.v = vec_sub(pjz2.v, piz2.v);
-
-      v_r2.v = vec_mul(v_dx.v, v_dx.v);
-      v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v);
-      v2_r2.v = vec_mul(v2_dx.v, v2_dx.v);
-      v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v);
-
-      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v);
-      v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v);
-      v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v);
-
-      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v);
-      v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v);
-      v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v);
-
-      vector v_doj_mask, v_doj_mask2;
-      int doj_mask, doj_mask2;
-
-      vector v2_doj_mask, v2_doj_mask2;
-      int doj2_mask, doj2_mask2;
-
-      /* Form r2 < hig2 mask. */
-      v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v);
-      v_doj_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2.v);
-      v2_doj_mask.v = vec_cmp_lt(v2_r2.v, v_hjg2_2.v);
-      v2_doj_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hjg2_2.v);
-
-      /* Form integer mask. */
-      doj_mask = vec_cmp_result(v_doj_mask.v);
-      doj_mask2 = vec_cmp_result(v_doj_mask2.v);
-      doj2_mask = vec_cmp_result(v2_doj_mask.v);
-      doj2_mask2 = vec_cmp_result(v2_doj_mask2.v);
-
-      /* Perform interaction with 2 vectors. */
-      if (doj_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz,
-            &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx],
-            &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum,
-            &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-            &curlvySum, &curlvzSum, v_doj_mask,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doj_mask2)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v_r2_2, &v_dx2, &v_dy2, &v_dz2, v_hj_inv, v_vjx, v_vjy, v_vjz,
-            &ci_cache->vx[ci2_cache_idx], &ci_cache->vy[ci2_cache_idx],
-            &ci_cache->vz[ci2_cache_idx], &ci_cache->m[ci2_cache_idx], &rhoSum,
-            &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-            &curlvySum, &curlvzSum, v_doj_mask2,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doj2_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hj_inv_2, v_vjx2, v_vjy2, v_vjz2,
-            &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx],
-            &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum2,
-            &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2,
-            &curlvySum2, &curlvzSum2, v2_doj_mask,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doj2_mask2)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hj_inv_2, v_vjx2, v_vjy2,
-            v_vjz2, &ci_cache->vx[ci2_cache_idx], &ci_cache->vy[ci2_cache_idx],
-            &ci_cache->vz[ci2_cache_idx], &ci_cache->m[ci2_cache_idx], &rhoSum2,
-            &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2,
-            &curlvySum2, &curlvzSum2, v2_doj_mask2,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-    } /* loop over the parts in cj. */
-
-    /* Perform horizontal adds on vector sums and store result in particle pi.
-     */
-    VEC_HADD(rhoSum, pj->rho);
-    VEC_HADD(rho_dhSum, pj->density.rho_dh);
-    VEC_HADD(wcountSum, pj->density.wcount);
-    VEC_HADD(wcount_dhSum, pj->density.wcount_dh);
-    VEC_HADD(div_vSum, pj->density.div_v);
-    VEC_HADD(curlvxSum, pj->density.rot_v[0]);
-    VEC_HADD(curlvySum, pj->density.rot_v[1]);
-    VEC_HADD(curlvzSum, pj->density.rot_v[2]);
-
-    VEC_HADD(rhoSum2, pj2->rho);
-    VEC_HADD(rho_dhSum2, pj2->density.rho_dh);
-    VEC_HADD(wcountSum2, pj2->density.wcount);
-    VEC_HADD(wcount_dhSum2, pj2->density.wcount_dh);
-    VEC_HADD(div_vSum2, pj2->density.div_v);
-    VEC_HADD(curlvxSum2, pj2->density.rot_v[0]);
-    VEC_HADD(curlvySum2, pj2->density.rot_v[1]);
-    VEC_HADD(curlvzSum2, pj2->density.rot_v[2]);
-
-  } /* loop over the parts in ci. */
-
-  TIMER_TOC(timer_dopair_density);
-
-#endif /* WITH_VECTORIZATION */
-}
-
-/* Use unsorted cache. */
-void runner_dopair1_density_vec_3(struct runner *r, struct cell *ci,
-                                  struct cell *cj) {
-
-#ifdef WITH_VECTORIZATION
-  const struct engine *restrict e = r->e;
-
-  int num_vec_proc = 2;
-
-  vector v_hi, v_vix, v_viy, v_viz, v_hig2;
-  // vector v_hi_2, v_vix2, v_viy2, v_viz2, v_hig2_2;
-
-  TIMER_TIC;
-
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
-
-#ifdef SWIFT_DEBUG_CHECKS
-  cell_is_drifted(ci, e);
-  cell_is_drifted(cj, e);
-#endif
-
-  /* Get the sort ID. */
-  double shift[3] = {0.0, 0.0, 0.0};
-  const int sid = space_getsid(e->s, &ci, &cj, shift);
-
-  /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
-    error("Trying to interact unsorted cells.");
-
-  /* Get the cutoff shift. */
-  double rshift = 0.0;
-  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
-
-  /* Pick-out the sorted lists. */
-  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
-  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
-
-  /* Get some other useful values. */
-  const int count_i = ci->count;
-  const int count_j = cj->count;
-  struct part *restrict parts_i = ci->parts;
-  struct part *restrict parts_j = cj->parts;
-  const double di_max = sort_i[count_i - 1].d - rshift;
-  const double dj_min = sort_j[0].d;
-  const float dx_max = (ci->dx_max + cj->dx_max);
-
-  /* Get the particle cache from the runner and re-allocate
-   * the cache if it is not big enough for the cell. */
-  struct cache *restrict ci_cache = &r->ci_cache;
-  struct cache *restrict cj_cache = &r->cj_cache;
-
-  if (ci_cache->count < count_i) {
-    cache_init(ci_cache, count_i);
-  }
-  if (cj_cache->count < count_j) {
-    cache_init(cj_cache, count_j);
-  }
-
-  cache_read_two_cells(ci, cj, ci_cache, cj_cache, shift);
-  // cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j,
-  // shift);
-
-  float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-  float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-
-  max_di = r->ci_cache.max_d;
-  max_dj = r->cj_cache.max_d;
-
-  /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
-  /* For particles in ci */
-  populate_max_d(ci, cj, sort_i, sort_j, ci_cache, cj_cache, dx_max, rshift,
-                 max_di, max_dj);
-
-  float di, dj;
-
-  int max_ind_j = count_j - 1;
-
-  /* Loop over the parts in ci. */
-  for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) {
-    // for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid-=2) {
-
-    /* Get a hold of the ith part in ci. */
-    struct part *restrict pi = &parts_i[sort_i[pid].i];
-    // struct part *restrict pi2 = &parts_i[sort_i[pid - 1].i];
-    if (!part_is_active(pi, e)) continue;
-
-    dj = sort_j[max_ind_j].d;
-    while (max_ind_j > 0 && max_di[pid] < dj) {
-      max_ind_j--;
-
-      dj = sort_j[max_ind_j].d;
-    }
-    int exit_iteration = max_ind_j;
-
-    int ci_cache_idx = sort_i[pid].i;
-
-    const float hi = ci_cache->h[ci_cache_idx];
-    // const float hi_2 = ci_cache->h[ci_cache_idx - 1];
-    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
-    if (di < dj_min) continue;
-
-    const float hig2 = hi * hi * kernel_gamma2;
-    // const float hig2_2 = hi_2 * hi_2 * kernel_gamma2;
-
-    vector pix, piy, piz;
-    // vector pix2, piy2, piz2;
-
-    /* Fill particle pi vectors. */
-    pix.v = vec_set1(ci_cache->x[ci_cache_idx]);
-    piy.v = vec_set1(ci_cache->y[ci_cache_idx]);
-    piz.v = vec_set1(ci_cache->z[ci_cache_idx]);
-    v_hi.v = vec_set1(hi);
-    v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]);
-    v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]);
-    v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]);
-
-    v_hig2.v = vec_set1(hig2);
-
-    // pix2.v = vec_set1(ci_cache->x[ci_cache_idx - 1]);
-    // piy2.v = vec_set1(ci_cache->y[ci_cache_idx - 1]);
-    // piz2.v = vec_set1(ci_cache->z[ci_cache_idx - 1]);
-    // v_hi_2.v = vec_set1(hi_2);
-    // v_vix2.v = vec_set1(ci_cache->vx[ci_cache_idx - 1]);
-    // v_viy2.v = vec_set1(ci_cache->vy[ci_cache_idx - 1]);
-    // v_viz2.v = vec_set1(ci_cache->vz[ci_cache_idx - 1]);
-
-    // v_hig2_2.v = vec_set1(hig2_2);
-
-    /* Reset cumulative sums of update vectors. */
-    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-        curlvySum, curlvzSum;
-
-    // vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2,
-    // curlvxSum2,
-    //    curlvySum2, curlvzSum2;
-
-    /* Get the inverse of hi. */
-    vector v_hi_inv;
-    // vector v_hi_inv_2;
-
-    v_hi_inv = vec_reciprocal(v_hi);
-    // v_hi_inv_2 = vec_reciprocal(v_hi_2);
-
-    rhoSum.v = vec_setzero();
-    rho_dhSum.v = vec_setzero();
-    wcountSum.v = vec_setzero();
-    wcount_dhSum.v = vec_setzero();
-    div_vSum.v = vec_setzero();
-    curlvxSum.v = vec_setzero();
-    curlvySum.v = vec_setzero();
-    curlvzSum.v = vec_setzero();
-
-    // rhoSum2.v = vec_setzero();
-    // rho_dhSum2.v = vec_setzero();
-    // wcountSum2.v = vec_setzero();
-    // wcount_dhSum2.v = vec_setzero();
-    // div_vSum2.v = vec_setzero();
-    // curlvxSum2.v = vec_setzero();
-    // curlvySum2.v = vec_setzero();
-    // curlvzSum2.v = vec_setzero();
-
-    /* Pad cache if there is a serial remainder. */
-    int exit_iteration_align = exit_iteration;
-    int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
-    if (rem != 0) {
-      int pad = (num_vec_proc * VEC_SIZE) - rem;
-
-      exit_iteration_align += pad;
-    }
-
-    vector pjx, pjy, pjz;
-    vector pjx2, pjy2, pjz2;
-
-    /* Loop over the parts in cj. */
-    for (int pjd = 0; pjd < exit_iteration_align;
-         pjd += (num_vec_proc * VEC_SIZE)) {
-
-      /* Get the cache index to the jth particle. */
-      // int cj_cache_idx = sort_j[pjd].i;
-      int indices[2 * VEC_SIZE];
-
-      for (int i = 0; i < 2 * VEC_SIZE; i++) indices[i] = sort_j[pjd + i].i;
-
-      vector v_dx, v_dy, v_dz, v_r2;
-      vector v_dx2, v_dy2, v_dz2, v_r2_2;
-      // vector v2_dx, v2_dy, v2_dz, v2_r2;
-      // vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2;
-
-      /* Load 2 sets of vectors from the particle cache. */
-      // pjx.v = vec_load(&cj_cache.x[cj_cache_idx]);
-      // pjx2.v = vec_load(&cj_cache.x[cj_cache_idx + VEC_SIZE]);
-      // pjy.v = vec_load(&cj_cache.y[cj_cache_idx]);
-      // pjy2.v = vec_load(&cj_cache.y[cj_cache_idx + VEC_SIZE]);
-      // pjz.v = vec_load(&cj_cache.z[cj_cache_idx]);
-      // pjz2.v = vec_load(&cj_cache.z[cj_cache_idx + VEC_SIZE]);
-      // pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]);
-      // pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]);
-      // pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]);
-      // mj.v = vec_load(&cj_cache.m[cj_cache_idx]);
-
-      pjx.v = vec_set(cj_cache->x[indices[0]], cj_cache->x[indices[1]],
-                      cj_cache->x[indices[2]], cj_cache->x[indices[3]],
-                      cj_cache->x[indices[4]], cj_cache->x[indices[5]],
-                      cj_cache->x[indices[6]], cj_cache->x[indices[7]]);
-      pjx2.v = vec_set(cj_cache->x[indices[8]], cj_cache->x[indices[9]],
-                       cj_cache->x[indices[10]], cj_cache->x[indices[11]],
-                       cj_cache->x[indices[12]], cj_cache->x[indices[13]],
-                       cj_cache->x[indices[14]], cj_cache->x[indices[15]]);
-      pjy.v = vec_set(cj_cache->y[indices[0]], cj_cache->y[indices[1]],
-                      cj_cache->y[indices[2]], cj_cache->y[indices[3]],
-                      cj_cache->y[indices[4]], cj_cache->y[indices[5]],
-                      cj_cache->y[indices[6]], cj_cache->y[indices[7]]);
-      pjy2.v = vec_set(cj_cache->y[indices[8]], cj_cache->y[indices[9]],
-                       cj_cache->y[indices[10]], cj_cache->y[indices[11]],
-                       cj_cache->y[indices[12]], cj_cache->y[indices[13]],
-                       cj_cache->y[indices[14]], cj_cache->y[indices[15]]);
-      pjz.v = vec_set(cj_cache->z[indices[0]], cj_cache->z[indices[1]],
-                      cj_cache->z[indices[2]], cj_cache->z[indices[3]],
-                      cj_cache->z[indices[4]], cj_cache->z[indices[5]],
-                      cj_cache->z[indices[6]], cj_cache->z[indices[7]]);
-      pjz2.v = vec_set(cj_cache->z[indices[8]], cj_cache->z[indices[9]],
-                       cj_cache->z[indices[10]], cj_cache->z[indices[11]],
-                       cj_cache->z[indices[12]], cj_cache->z[indices[13]],
-                       cj_cache->z[indices[14]], cj_cache->z[indices[15]]);
-
-      /* Compute the pairwise distance. */
-      v_dx.v = vec_sub(pix.v, pjx.v);
-      v_dx2.v = vec_sub(pix.v, pjx2.v);
-      // v2_dx.v = vec_sub(pix2.v, pjx.v);
-      // v2_dx2.v = vec_sub(pix2.v, pjx2.v);
-
-      v_dy.v = vec_sub(piy.v, pjy.v);
-      v_dy2.v = vec_sub(piy.v, pjy2.v);
-      // v2_dy.v = vec_sub(piy2.v, pjy.v);
-      // v2_dy2.v = vec_sub(piy2.v, pjy2.v);
-
-      v_dz.v = vec_sub(piz.v, pjz.v);
-      v_dz2.v = vec_sub(piz.v, pjz2.v);
-      // v2_dz.v = vec_sub(piz2.v, pjz.v);
-      // v2_dz2.v = vec_sub(piz2.v, pjz2.v);
-
-      v_r2.v = vec_mul(v_dx.v, v_dx.v);
-      v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v);
-      // v2_r2.v = vec_mul(v2_dx.v, v2_dx.v);
-      // v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v);
-
-      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v);
-      // v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v);
-      // v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v);
-
-      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v);
-      // v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v);
-      // v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v);
-
-      vector v_doi_mask, v_doi_mask2;
-      int doi_mask, doi_mask2;
-
-      // vector v2_doi_mask, v2_doi_mask2;
-      // int doi2_mask, doi2_mask2;
-
-      /* Form r2 < hig2 mask. */
-      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
-      v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
-      // v2_doi_mask.v = vec_cmp_lt(v2_r2.v, v_hig2_2.v);
-      // v2_doi_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v);
-
-      /* Form integer mask. */
-      doi_mask = vec_cmp_result(v_doi_mask.v);
-      doi_mask2 = vec_cmp_result(v_doi_mask2.v);
-      // doi2_mask = vec_cmp_result(v2_doi_mask.v);
-      // doi2_mask2 = vec_cmp_result(v2_doi_mask2.v);
-
-      if (doi_mask)
-        runner_iact_nonsym_intrinsic_vec_2_density(
-            cj_cache, &indices[0], &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix,
-            v_viy, v_viz, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
-            &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doi_mask,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doi_mask2)
-        runner_iact_nonsym_intrinsic_vec_2_density(
-            cj_cache, &indices[VEC_SIZE], &v_r2_2, &v_dx2, &v_dy2, &v_dz2,
-            v_hi_inv, v_vix, v_viy, v_viz, &rhoSum, &rho_dhSum, &wcountSum,
-            &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum,
-            v_doi_mask2,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      //       if(doi2_mask)
-      //        runner_iact_nonsym_intrinsic_vec_density(
-      //          &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hi_inv_2, v_vix2, v_viy2,
-      //          v_viz2,
-      //          &cj_cache.vx[cj_cache_idx], &cj_cache.vy[cj_cache_idx],
-      //          &cj_cache.vz[cj_cache_idx],
-      //          &cj_cache.m[cj_cache_idx], &rhoSum2, &rho_dhSum2, &wcountSum2,
-      //          &wcount_dhSum2,
-      //          &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2,
-      //          v2_doi_mask,
-      //#ifdef HAVE_AVX512_F
-      //          knl_mask);
-      //#else
-      //          0);
-      //#endif
-      //      if(doi2_mask2)
-      //        runner_iact_nonsym_intrinsic_vec_density(
-      //          &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hi_inv_2, v_vix2,
-      //          v_viy2, v_viz2,
-      //          &cj_cache.vx[cj_cache_idx + VEC_SIZE],
-      //          &cj_cache.vy[cj_cache_idx + VEC_SIZE],
-      //          &cj_cache.vz[cj_cache_idx + VEC_SIZE],
-      //          &cj_cache.m[cj_cache_idx + VEC_SIZE], &rhoSum2, &rho_dhSum2,
-      //          &wcountSum2, &wcount_dhSum2,
-      //          &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2,
-      //          v2_doi_mask2,
-      //#ifdef HAVE_AVX512_F
-      //          knl_mask);
-      //#else
-      //          0);
-      //#endif
-
-    } /* loop over the parts in cj. */
-
-    /* Perform horizontal adds on vector sums and store result in particle pi.
-     */
-    VEC_HADD(rhoSum, pi->rho);
-    VEC_HADD(rho_dhSum, pi->density.rho_dh);
-    VEC_HADD(wcountSum, pi->density.wcount);
-    VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
-    VEC_HADD(div_vSum, pi->density.div_v);
-    VEC_HADD(curlvxSum, pi->density.rot_v[0]);
-    VEC_HADD(curlvySum, pi->density.rot_v[1]);
-    VEC_HADD(curlvzSum, pi->density.rot_v[2]);
-
-    // VEC_HADD(rhoSum2, pi2->rho);
-    // VEC_HADD(rho_dhSum2, pi2->density.rho_dh);
-    // VEC_HADD(wcountSum2, pi2->density.wcount);
-    // VEC_HADD(wcount_dhSum2, pi2->density.wcount_dh);
-    // VEC_HADD(div_vSum2, pi2->density.div_v);
-    // VEC_HADD(curlvxSum2, pi2->density.rot_v[0]);
-    // VEC_HADD(curlvySum2, pi2->density.rot_v[1]);
-    // VEC_HADD(curlvzSum2, pi2->density.rot_v[2]);
-
-  } /* loop over the parts in ci. */
-
-  int max_ind_i = 0;
-  /* Loop over the parts in cj. */
-  for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) {
-
-    /* Get a hold of the jth part in cj. */
-    struct part *restrict pj = &parts_j[sort_j[pjd].i];
-    if (!part_is_active(pj, e)) continue;
-
-    di = sort_i[max_ind_i].d;
-    while (max_ind_i < count_i - 1 && max_dj[pjd] > di) {
-      max_ind_i++;
-
-      di = sort_i[max_ind_i].d;
-    }
-    int exit_iteration = max_ind_i;
-
-    int cj_cache_idx = pjd;
-
-    const float hj = cj_cache->h[cj_cache_idx];
-    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
-    if (dj > di_max) continue;
-
-    const float hjg2 = hj * hj * kernel_gamma2;
-
-    vector pjx, pjy, pjz;
-    vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2;
-
-    /* Fill particle pi vectors. */
-    pjx.v = vec_set1(cj_cache->x[cj_cache_idx]);
-    pjy.v = vec_set1(cj_cache->y[cj_cache_idx]);
-    pjz.v = vec_set1(cj_cache->z[cj_cache_idx]);
-    v_hj.v = vec_set1(hj);
-    v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]);
-    v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]);
-    v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]);
-
-    v_hjg2.v = vec_set1(hjg2);
-
-    /* Reset cumulative sums of update vectors. */
-    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-        curlvySum, curlvzSum;
-
-    /* Get the inverse of hj. */
-    vector v_hj_inv;
-
-    v_hj_inv = vec_reciprocal(v_hj);
-
-    rhoSum.v = vec_setzero();
-    rho_dhSum.v = vec_setzero();
-    wcountSum.v = vec_setzero();
-    wcount_dhSum.v = vec_setzero();
-    div_vSum.v = vec_setzero();
-    curlvxSum.v = vec_setzero();
-    curlvySum.v = vec_setzero();
-    curlvzSum.v = vec_setzero();
-
-    /* Pad cache if there is a serial remainder. */
-    int exit_iteration_align = exit_iteration;
-    int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
-    if (rem != 0) {
-      int pad = (num_vec_proc * VEC_SIZE) - rem;
-
-      exit_iteration_align -= pad;
-    }
-
-    vector pix, piy, piz;
-    vector pix2, piy2, piz2;
-    // vector pivx, pivy, pivz, mi;
-
-    /* Loop over the parts in ci. */
-    for (int pid = count_i - 1; pid >= 0; pid -= (num_vec_proc * VEC_SIZE)) {
-
-      int indices[2 * VEC_SIZE];
-
-      for (int i = (2 * VEC_SIZE) - 1; i >= 0; i--)
-        indices[i] = sort_j[pid - i].i;
-
-      /* Get the cache index to the ith particle. */
-      // int ci_cache_idx = sort_i[pid].i;
-
-      vector v_dx, v_dy, v_dz, v_r2;
-      vector v_dx2, v_dy2, v_dz2, v_r2_2;
-
-      /* Load 2 sets of vectors from the particle cache. */
-      // pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
-      // pix2.v = vec_load(&ci_cache->x[ci_cache_idx - VEC_SIZE]);
-      // piy.v = vec_load(&ci_cache->y[ci_cache_idx]);
-      // piy2.v = vec_load(&ci_cache->y[ci_cache_idx - VEC_SIZE]);
-      // piz.v = vec_load(&ci_cache->z[ci_cache_idx]);
-      // piz2.v = vec_load(&ci_cache->z[ci_cache_idx - VEC_SIZE]);
-      // pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]);
-      // pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]);
-      // pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]);
-      // mi.v = vec_load(&ci_cache->m[ci_cache_idx]);
-
-      pix.v = vec_set(ci_cache->x[indices[0]], ci_cache->x[indices[1]],
-                      ci_cache->x[indices[2]], ci_cache->x[indices[3]],
-                      ci_cache->x[indices[4]], ci_cache->x[indices[5]],
-                      ci_cache->x[indices[6]], ci_cache->x[indices[7]]);
-      pix2.v = vec_set(ci_cache->x[indices[8]], ci_cache->x[indices[9]],
-                       ci_cache->x[indices[10]], ci_cache->x[indices[11]],
-                       ci_cache->x[indices[12]], ci_cache->x[indices[13]],
-                       ci_cache->x[indices[14]], ci_cache->x[indices[15]]);
-      piy.v = vec_set(ci_cache->y[indices[0]], ci_cache->y[indices[1]],
-                      ci_cache->y[indices[2]], ci_cache->y[indices[3]],
-                      ci_cache->y[indices[4]], ci_cache->y[indices[5]],
-                      ci_cache->y[indices[6]], ci_cache->y[indices[7]]);
-      piy2.v = vec_set(ci_cache->y[indices[8]], ci_cache->y[indices[9]],
-                       ci_cache->y[indices[10]], ci_cache->y[indices[11]],
-                       ci_cache->y[indices[12]], ci_cache->y[indices[13]],
-                       ci_cache->y[indices[14]], ci_cache->y[indices[15]]);
-      piz.v = vec_set(ci_cache->z[indices[0]], ci_cache->z[indices[1]],
-                      ci_cache->z[indices[2]], ci_cache->z[indices[3]],
-                      ci_cache->z[indices[4]], ci_cache->z[indices[5]],
-                      ci_cache->z[indices[6]], ci_cache->z[indices[7]]);
-      piz2.v = vec_set(ci_cache->z[indices[8]], ci_cache->z[indices[9]],
-                       ci_cache->z[indices[10]], ci_cache->z[indices[11]],
-                       ci_cache->z[indices[12]], ci_cache->z[indices[13]],
-                       ci_cache->z[indices[14]], ci_cache->z[indices[15]]);
-
-      /* Compute the pairwise distance. */
-      v_dx.v = vec_sub(pjx.v, pix.v);
-      v_dx2.v = vec_sub(pjx.v, pix2.v);
-      v_dy.v = vec_sub(pjy.v, piy.v);
-      v_dy2.v = vec_sub(pjy.v, piy2.v);
-      v_dz.v = vec_sub(pjz.v, piz.v);
-      v_dz2.v = vec_sub(pjz.v, piz2.v);
-
-      v_r2.v = vec_mul(v_dx.v, v_dx.v);
-      v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v);
-      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v);
-      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v);
-
-      vector v_doj_mask, v_doj_mask2;
-      int doj_mask, doj_mask2;
-
-      /* Form r2 < hig2 mask. */
-      v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v);
-      v_doj_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2.v);
-
-      /* Form integer mask. */
-      doj_mask = vec_cmp_result(v_doj_mask.v);
-      doj_mask2 = vec_cmp_result(v_doj_mask2.v);
-
-      /* Perform interaction with 2 vectors. */
-      if (doj_mask)
-        runner_iact_nonsym_intrinsic_vec_2_density(
-            ci_cache, &indices[0], &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx,
-            v_vjy, v_vjz, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
-            &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doj_mask,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doj_mask2)
-        runner_iact_nonsym_intrinsic_vec_2_density(
-            ci_cache, &indices[VEC_SIZE], &v_r2_2, &v_dx2, &v_dy2, &v_dz2,
-            v_hj_inv, v_vjx, v_vjy, v_vjz, &rhoSum, &rho_dhSum, &wcountSum,
-            &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum,
-            v_doj_mask2,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-    } /* loop over the parts in cj. */
-
-    /* Perform horizontal adds on vector sums and store result in particle pi.
-     */
-    VEC_HADD(rhoSum, pj->rho);
-    VEC_HADD(rho_dhSum, pj->density.rho_dh);
-    VEC_HADD(wcountSum, pj->density.wcount);
-    VEC_HADD(wcount_dhSum, pj->density.wcount_dh);
-    VEC_HADD(div_vSum, pj->density.div_v);
-    VEC_HADD(curlvxSum, pj->density.rot_v[0]);
-    VEC_HADD(curlvySum, pj->density.rot_v[1]);
-    VEC_HADD(curlvzSum, pj->density.rot_v[2]);
-
-  } /* loop over the parts in ci. */
-
-  TIMER_TOC(timer_dopair_density);
-
-#endif /* WITH_VECTORIZATION */
-}
-
-/* Read one cell at a time. */
-void runner_dopair1_density_vec_4(struct runner *r, struct cell *ci,
-                                  struct cell *cj) {
-
-#ifdef WITH_VECTORIZATION
-  const struct engine *restrict e = r->e;
-
-  int num_vec_proc = 2;
-
-  vector v_hi, v_vix, v_viy, v_viz, v_hig2;
-  vector v_hi_2, v_vix2, v_viy2, v_viz2, v_hig2_2;
-
-  TIMER_TIC;
-
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
-
-#ifdef SWIFT_DEBUG_CHECKS
-  cell_is_drifted(ci, e);
-  cell_is_drifted(cj, e);
-#endif
-
-  /* Get the sort ID. */
-  double shift[3] = {0.0, 0.0, 0.0};
-  const int sid = space_getsid(e->s, &ci, &cj, shift);
-
-  /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
-    error("Trying to interact unsorted cells.");
-
-  /* Get the cutoff shift. */
-  double rshift = 0.0;
-  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
-
-  /* Pick-out the sorted lists. */
-  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
-  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
-
-  /* Get some other useful values. */
-  const int count_i = ci->count;
-  const int count_j = cj->count;
-  struct part *restrict parts_i = ci->parts;
-  struct part *restrict parts_j = cj->parts;
-  const double di_max = sort_i[count_i - 1].d - rshift;
-  const double dj_min = sort_j[0].d;
-  const float dx_max = (ci->dx_max + cj->dx_max);
-
-  /* Get the particle cache from the runner and re-allocate
-   * the cache if it is not big enough for the cell. */
-  struct cache *restrict ci_cache = &r->ci_cache;
-
-  if (ci_cache->count < count_i) {
-    cache_init(ci_cache, count_i);
-  }
-
-  double loc[3];
-  loc[0] = ci->loc[0];
-  loc[1] = ci->loc[1];
-  loc[2] = ci->loc[2];
-
-  double shift_cj[3] = {0.0, 0.0, 0.0};
-
-  cache_read_cell_sorted(cj, ci_cache, sort_j, loc, shift_cj);
-
-  float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-  float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-
-  max_di = r->ci_cache.max_d;
-  max_dj = r->cj_cache.max_d;
-
-  /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
-  /* For particles in ci */
-  float h = parts_i[sort_i[0].i].h;
-  float d;
-
-  /* For particles in ci */
-  max_di[0] = sort_i[0].d + h * kernel_gamma + dx_max - rshift;
-
-  for (int k = 1; k < ci->count; k++) {
-    h = parts_i[sort_i[k].i].h;
-    d = sort_i[k].d + h * kernel_gamma + dx_max - rshift;
-
-    max_di[k] = fmaxf(max_di[k - 1], d);
-  }
-
-  float di, dj;
-
-  int max_ind_j = count_j - 1;
-
-  /* Loop over the parts in ci. */
-  for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid -= 2) {
-
-    /* Get a hold of the ith part in ci. */
-    struct part *restrict pi = &parts_i[sort_i[pid].i];
-    struct part *restrict pi2 = &parts_i[sort_i[pid - 1].i];
-    if (!part_is_active(pi, e)) continue;
-
-    dj = sort_j[max_ind_j].d;
-    while (max_ind_j > 0 && max_di[pid] < dj) {
-      max_ind_j--;
-
-      dj = sort_j[max_ind_j].d;
-    }
-    int exit_iteration = max_ind_j;
-
-    const float hi = pi->h;
-    const float hi_2 = pi2->h;
-    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
-    if (di < dj_min) continue;
-
-    const float hig2 = hi * hi * kernel_gamma2;
-    const float hig2_2 = hi_2 * hi_2 * kernel_gamma2;
-
-    vector pix, piy, piz;
-    vector pix2, piy2, piz2;
-
-    /* Fill particle pi vectors. */
-    pix.v = vec_set1(pi->x[0] - loc[0] - shift[0]);
-    piy.v = vec_set1(pi->x[1] - loc[1] - shift[1]);
-    piz.v = vec_set1(pi->x[2] - loc[2] - shift[2]);
-    v_hi.v = vec_set1(hi);
-    v_vix.v = vec_set1(pi->v[0]);
-    v_viy.v = vec_set1(pi->v[1]);
-    v_viz.v = vec_set1(pi->v[2]);
-
-    v_hig2.v = vec_set1(hig2);
-
-    pix2.v = vec_set1(pi2->x[0] - loc[0] - shift[0]);
-    piy2.v = vec_set1(pi2->x[1] - loc[1] - shift[1]);
-    piz2.v = vec_set1(pi2->x[2] - loc[2] - shift[2]);
-    v_hi_2.v = vec_set1(hi_2);
-    v_vix2.v = vec_set1(pi2->v[0]);
-    v_viy2.v = vec_set1(pi2->v[1]);
-    v_viz2.v = vec_set1(pi2->v[2]);
-
-    v_hig2_2.v = vec_set1(hig2_2);
-
-    /* Reset cumulative sums of update vectors. */
-    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-        curlvySum, curlvzSum;
-
-    vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2,
-        curlvxSum2, curlvySum2, curlvzSum2;
-
-    /* Get the inverse of hi. */
-    vector v_hi_inv;
-    vector v_hi_inv_2;
-
-    v_hi_inv = vec_reciprocal(v_hi);
-    v_hi_inv_2 = vec_reciprocal(v_hi_2);
-
-    rhoSum.v = vec_setzero();
-    rho_dhSum.v = vec_setzero();
-    wcountSum.v = vec_setzero();
-    wcount_dhSum.v = vec_setzero();
-    div_vSum.v = vec_setzero();
-    curlvxSum.v = vec_setzero();
-    curlvySum.v = vec_setzero();
-    curlvzSum.v = vec_setzero();
-
-    rhoSum2.v = vec_setzero();
-    rho_dhSum2.v = vec_setzero();
-    wcountSum2.v = vec_setzero();
-    wcount_dhSum2.v = vec_setzero();
-    div_vSum2.v = vec_setzero();
-    curlvxSum2.v = vec_setzero();
-    curlvySum2.v = vec_setzero();
-    curlvzSum2.v = vec_setzero();
-
-    // exit_iteration = count_j;
-    /* Pad cache if there is a serial remainder. */
-    int exit_iteration_align = exit_iteration;
-    int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
-    if (rem != 0) {
-      int pad = (num_vec_proc * VEC_SIZE) - rem;
-
-      exit_iteration_align += pad;
-    }
-
-    vector pjx, pjy, pjz;
-    vector pjx2, pjy2, pjz2;
-
-    /* Loop over the parts in cj. */
-    for (int pjd = 0; pjd < exit_iteration_align;
-         pjd += (num_vec_proc * VEC_SIZE)) {
-
-      /* Get the cache index to the jth particle. */
-      int cj_cache_idx = pjd;  // sort_j[pjd].i;
-
-      vector v_dx, v_dy, v_dz, v_r2;
-      vector v_dx2, v_dy2, v_dz2, v_r2_2;
-      vector v2_dx, v2_dy, v2_dz, v2_r2;
-      vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2;
-
-      /* Load 2 sets of vectors from the particle cache. */
-      pjx.v = vec_load(&ci_cache->x[cj_cache_idx]);
-      pjx2.v = vec_load(&ci_cache->x[cj_cache_idx + VEC_SIZE]);
-      pjy.v = vec_load(&ci_cache->y[cj_cache_idx]);
-      pjy2.v = vec_load(&ci_cache->y[cj_cache_idx + VEC_SIZE]);
-      pjz.v = vec_load(&ci_cache->z[cj_cache_idx]);
-      pjz2.v = vec_load(&ci_cache->z[cj_cache_idx + VEC_SIZE]);
-      // pjvx.v = vec_load(&ci_cache->vx[cj_cache_idx]);
-      // pjvy.v = vec_load(&ci_cache->vy[cj_cache_idx]);
-      // pjvz.v = vec_load(&ci_cache->vz[cj_cache_idx]);
-      // mj.v = vec_load(&ci_cache->m[cj_cache_idx]);
-
-      /* Compute the pairwise distance. */
-      v_dx.v = vec_sub(pix.v, pjx.v);
-      v_dx2.v = vec_sub(pix.v, pjx2.v);
-      v2_dx.v = vec_sub(pix2.v, pjx.v);
-      v2_dx2.v = vec_sub(pix2.v, pjx2.v);
-
-      v_dy.v = vec_sub(piy.v, pjy.v);
-      v_dy2.v = vec_sub(piy.v, pjy2.v);
-      v2_dy.v = vec_sub(piy2.v, pjy.v);
-      v2_dy2.v = vec_sub(piy2.v, pjy2.v);
-
-      v_dz.v = vec_sub(piz.v, pjz.v);
-      v_dz2.v = vec_sub(piz.v, pjz2.v);
-      v2_dz.v = vec_sub(piz2.v, pjz.v);
-      v2_dz2.v = vec_sub(piz2.v, pjz2.v);
-
-      v_r2.v = vec_mul(v_dx.v, v_dx.v);
-      v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v);
-      v2_r2.v = vec_mul(v2_dx.v, v2_dx.v);
-      v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v);
-
-      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v);
-      v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v);
-      v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v);
-
-      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v);
-      v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v);
-      v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v);
-
-      vector v_doi_mask, v_doi_mask2;
-      int doi_mask, doi_mask2;
-
-      vector v2_doi_mask, v2_doi_mask2;
-      int doi2_mask, doi2_mask2;
-
-      /* Form r2 < hig2 mask. */
-      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
-      v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
-      v2_doi_mask.v = vec_cmp_lt(v2_r2.v, v_hig2_2.v);
-      v2_doi_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v);
-
-      /* Form integer mask. */
-      doi_mask = vec_cmp_result(v_doi_mask.v);
-      doi_mask2 = vec_cmp_result(v_doi_mask2.v);
-      doi2_mask = vec_cmp_result(v2_doi_mask.v);
-      doi2_mask2 = vec_cmp_result(v2_doi_mask2.v);
-
-      if (doi_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz,
-            &ci_cache->vx[cj_cache_idx], &ci_cache->vy[cj_cache_idx],
-            &ci_cache->vz[cj_cache_idx], &ci_cache->m[cj_cache_idx], &rhoSum,
-            &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-            &curlvySum, &curlvzSum, v_doi_mask,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doi_mask2)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v_r2_2, &v_dx2, &v_dy2, &v_dz2, v_hi_inv, v_vix, v_viy, v_viz,
-            &ci_cache->vx[cj_cache_idx + VEC_SIZE],
-            &ci_cache->vy[cj_cache_idx + VEC_SIZE],
-            &ci_cache->vz[cj_cache_idx + VEC_SIZE],
-            &ci_cache->m[cj_cache_idx + VEC_SIZE], &rhoSum, &rho_dhSum,
-            &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
-            &curlvzSum, v_doi_mask2,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doi2_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hi_inv_2, v_vix2, v_viy2, v_viz2,
-            &ci_cache->vx[cj_cache_idx], &ci_cache->vy[cj_cache_idx],
-            &ci_cache->vz[cj_cache_idx], &ci_cache->m[cj_cache_idx], &rhoSum2,
-            &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2,
-            &curlvySum2, &curlvzSum2, v2_doi_mask,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doi2_mask2)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hi_inv_2, v_vix2, v_viy2,
-            v_viz2, &ci_cache->vx[cj_cache_idx + VEC_SIZE],
-            &ci_cache->vy[cj_cache_idx + VEC_SIZE],
-            &ci_cache->vz[cj_cache_idx + VEC_SIZE],
-            &ci_cache->m[cj_cache_idx + VEC_SIZE], &rhoSum2, &rho_dhSum2,
-            &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, &curlvySum2,
-            &curlvzSum2, v2_doi_mask2,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-
-    } /* loop over the parts in cj. */
-
-    /* Perform horizontal adds on vector sums and store result in particle pi.
-     */
-    VEC_HADD(rhoSum, pi->rho);
-    VEC_HADD(rho_dhSum, pi->density.rho_dh);
-    VEC_HADD(wcountSum, pi->density.wcount);
-    VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
-    VEC_HADD(div_vSum, pi->density.div_v);
-    VEC_HADD(curlvxSum, pi->density.rot_v[0]);
-    VEC_HADD(curlvySum, pi->density.rot_v[1]);
-    VEC_HADD(curlvzSum, pi->density.rot_v[2]);
-
-    VEC_HADD(rhoSum2, pi2->rho);
-    VEC_HADD(rho_dhSum2, pi2->density.rho_dh);
-    VEC_HADD(wcountSum2, pi2->density.wcount);
-    VEC_HADD(wcount_dhSum2, pi2->density.wcount_dh);
-    VEC_HADD(div_vSum2, pi2->density.div_v);
-    VEC_HADD(curlvxSum2, pi2->density.rot_v[0]);
-    VEC_HADD(curlvySum2, pi2->density.rot_v[1]);
-    VEC_HADD(curlvzSum2, pi2->density.rot_v[2]);
-
-  } /* loop over the parts in ci. */
-
-  cache_read_cell_sorted(ci, ci_cache, sort_i, loc, shift);
-
-  h = parts_j[sort_j[0].i].h;
-  max_dj[0] = sort_j[0].d - h * kernel_gamma - dx_max - rshift;
-
-  for (int k = 1; k < cj->count; k++) {
-    h = parts_j[sort_j[k].i].h;
-    d = sort_j[k].d - h * kernel_gamma - dx_max - rshift;
-
-    max_dj[k] = fmaxf(max_dj[k - 1], d);
-  }
-
-  int max_ind_i = 0;
-  /* Loop over the parts in cj. */
-  for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd += 2) {
-
-    /* Get a hold of the jth part in cj. */
-    struct part *restrict pj = &parts_j[sort_j[pjd].i];
-    struct part *restrict pj2 = &parts_j[sort_j[pjd + 1].i];
-    if (!part_is_active(pj, e)) continue;
-
-    di = sort_i[max_ind_i].d;
-    while (max_ind_i < count_i - 1 && max_dj[pjd] > di) {
-      max_ind_i++;
-
-      di = sort_i[max_ind_i].d;
-    }
-    int exit_iteration = max_ind_i;
-
-    const float hj = pj->h;
-    const float hj_2 = pj2->h;
-    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
-    if (dj > di_max) continue;
-
-    const float hjg2 = hj * hj * kernel_gamma2;
-    const float hjg2_2 = hj_2 * hj_2 * kernel_gamma2;
-
-    vector pjx, pjy, pjz;
-    vector pjx2, pjy2, pjz2;
-    vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2;
-    vector v_hj_2, v_vjx2, v_vjy2, v_vjz2, v_hjg2_2;
-
-    /* Fill particle pi vectors. */
-    pjx.v = vec_set1(pj->x[0] - loc[0]);
-    pjy.v = vec_set1(pj->x[1] - loc[1]);
-    pjz.v = vec_set1(pj->x[2] - loc[2]);
-    v_hj.v = vec_set1(hj);
-    v_vjx.v = vec_set1(pj->v[0]);
-    v_vjy.v = vec_set1(pj->v[1]);
-    v_vjz.v = vec_set1(pj->v[2]);
-
-    v_hjg2.v = vec_set1(hjg2);
-
-    pjx2.v = vec_set1(pj2->x[0] - loc[0]);
-    pjy2.v = vec_set1(pj2->x[1] - loc[1]);
-    pjz2.v = vec_set1(pj2->x[2] - loc[2]);
-    v_hj_2.v = vec_set1(hj_2);
-    v_vjx2.v = vec_set1(pj2->v[0]);
-    v_vjy2.v = vec_set1(pj2->v[1]);
-    v_vjz2.v = vec_set1(pj2->v[2]);
-
-    v_hjg2_2.v = vec_set1(hjg2_2);
-
-    /* Reset cumulative sums of update vectors. */
-    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-        curlvySum, curlvzSum;
-
-    vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2,
-        curlvxSum2, curlvySum2, curlvzSum2;
-
-    /* Get the inverse of hj. */
-    vector v_hj_inv;
-    vector v_hj_inv_2;
-
-    v_hj_inv = vec_reciprocal(v_hj);
-    v_hj_inv_2 = vec_reciprocal(v_hj_2);
-
-    rhoSum.v = vec_setzero();
-    rho_dhSum.v = vec_setzero();
-    wcountSum.v = vec_setzero();
-    wcount_dhSum.v = vec_setzero();
-    div_vSum.v = vec_setzero();
-    curlvxSum.v = vec_setzero();
-    curlvySum.v = vec_setzero();
-    curlvzSum.v = vec_setzero();
-
-    rhoSum2.v = vec_setzero();
-    rho_dhSum2.v = vec_setzero();
-    wcountSum2.v = vec_setzero();
-    wcount_dhSum2.v = vec_setzero();
-    div_vSum2.v = vec_setzero();
-    curlvxSum2.v = vec_setzero();
-    curlvySum2.v = vec_setzero();
-    curlvzSum2.v = vec_setzero();
-
-    // exit_iteration = 0;
-    /* Pad cache if there is a serial remainder. */
-    int exit_iteration_align = exit_iteration;
-    int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
-    // if (rem != 0) {
-    //  int pad = (num_vec_proc * VEC_SIZE) - rem;
-
-    //  exit_iteration_align -= pad;
-    //}
-
-    vector pix, piy, piz;
-    vector pix2, piy2, piz2;
-    // vector pivx, pivy, pivz, mi;
-
-    /* Loop over the parts in ci. */
-    // for (int pid = count_i - 1; pid >= 0; pid -= (num_vec_proc * VEC_SIZE)) {
-    // for (int pid = count_i - 1; pid >= exit_iteration_align; pid -=
-    // (num_vec_proc * VEC_SIZE)) {
-    for (int pid = exit_iteration_align;
-         pid < (count_i + (num_vec_proc * VEC_SIZE) - rem);
-         pid += (num_vec_proc * VEC_SIZE)) {
-
-      /* Get the cache index to the ith particle. */
-      int ci_cache_idx = pid;  // sort_i[pid].i;
-      int ci2_cache_idx = pid + VEC_SIZE;
-
-      vector v_dx, v_dy, v_dz, v_r2;
-      vector v_dx2, v_dy2, v_dz2, v_r2_2;
-      vector v2_dx, v2_dy, v2_dz, v2_r2;
-      vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2;
-
-      /* Load 2 sets of vectors from the particle cache. */
-      pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
-      pix2.v = vec_load(&ci_cache->x[ci2_cache_idx]);
-      piy.v = vec_load(&ci_cache->y[ci_cache_idx]);
-      piy2.v = vec_load(&ci_cache->y[ci2_cache_idx]);
-      piz.v = vec_load(&ci_cache->z[ci_cache_idx]);
-      piz2.v = vec_load(&ci_cache->z[ci2_cache_idx]);
-      // pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]);
-      // pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]);
-      // pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]);
-      // mi.v = vec_load(&ci_cache->m[ci_cache_idx]);
-
-      /* Compute the pairwise distance. */
-      v_dx.v = vec_sub(pjx.v, pix.v);
-      v_dx2.v = vec_sub(pjx.v, pix2.v);
-      v2_dx.v = vec_sub(pjx2.v, pix.v);
-      v2_dx2.v = vec_sub(pjx2.v, pix2.v);
-
-      v_dy.v = vec_sub(pjy.v, piy.v);
-      v_dy2.v = vec_sub(pjy.v, piy2.v);
-      v2_dy.v = vec_sub(pjy2.v, piy.v);
-      v2_dy2.v = vec_sub(pjy2.v, piy2.v);
-
-      v_dz.v = vec_sub(pjz.v, piz.v);
-      v_dz2.v = vec_sub(pjz.v, piz2.v);
-      v2_dz.v = vec_sub(pjz2.v, piz.v);
-      v2_dz2.v = vec_sub(pjz2.v, piz2.v);
-
-      v_r2.v = vec_mul(v_dx.v, v_dx.v);
-      v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v);
-      v2_r2.v = vec_mul(v2_dx.v, v2_dx.v);
-      v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v);
-
-      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v);
-      v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v);
-      v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v);
-
-      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v);
-      v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v);
-      v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v);
-
-      vector v_doj_mask, v_doj_mask2;
-      int doj_mask, doj_mask2;
-
-      vector v2_doj_mask, v2_doj_mask2;
-      int doj2_mask, doj2_mask2;
-
-      /* Form r2 < hig2 mask. */
-      v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v);
-      v_doj_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2.v);
-      v2_doj_mask.v = vec_cmp_lt(v2_r2.v, v_hjg2_2.v);
-      v2_doj_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hjg2_2.v);
-
-      /* Form integer mask. */
-      doj_mask = vec_cmp_result(v_doj_mask.v);
-      doj_mask2 = vec_cmp_result(v_doj_mask2.v);
-      doj2_mask = vec_cmp_result(v2_doj_mask.v);
-      doj2_mask2 = vec_cmp_result(v2_doj_mask2.v);
-
-      /* Perform interaction with 2 vectors. */
-      if (doj_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz,
-            &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx],
-            &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum,
-            &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-            &curlvySum, &curlvzSum, v_doj_mask,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doj_mask2)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v_r2_2, &v_dx2, &v_dy2, &v_dz2, v_hj_inv, v_vjx, v_vjy, v_vjz,
-            &ci_cache->vx[ci2_cache_idx], &ci_cache->vy[ci2_cache_idx],
-            &ci_cache->vz[ci2_cache_idx], &ci_cache->m[ci2_cache_idx], &rhoSum,
-            &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-            &curlvySum, &curlvzSum, v_doj_mask2,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doj2_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hj_inv_2, v_vjx2, v_vjy2, v_vjz2,
-            &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx],
-            &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum2,
-            &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2,
-            &curlvySum2, &curlvzSum2, v2_doj_mask,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-      if (doj2_mask2)
-        runner_iact_nonsym_intrinsic_vec_density(
-            &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hj_inv_2, v_vjx2, v_vjy2,
-            v_vjz2, &ci_cache->vx[ci2_cache_idx], &ci_cache->vy[ci2_cache_idx],
-            &ci_cache->vz[ci2_cache_idx], &ci_cache->m[ci2_cache_idx], &rhoSum2,
-            &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2,
-            &curlvySum2, &curlvzSum2, v2_doj_mask2,
-#ifdef HAVE_AVX512_F
-            knl_mask);
-#else
-            0);
-#endif
-    } /* loop over the parts in cj. */
-
-    /* Perform horizontal adds on vector sums and store result in particle pi.
-     */
-    VEC_HADD(rhoSum, pj->rho);
-    VEC_HADD(rho_dhSum, pj->density.rho_dh);
-    VEC_HADD(wcountSum, pj->density.wcount);
-    VEC_HADD(wcount_dhSum, pj->density.wcount_dh);
-    VEC_HADD(div_vSum, pj->density.div_v);
-    VEC_HADD(curlvxSum, pj->density.rot_v[0]);
-    VEC_HADD(curlvySum, pj->density.rot_v[1]);
-    VEC_HADD(curlvzSum, pj->density.rot_v[2]);
-
-    VEC_HADD(rhoSum2, pj2->rho);
-    VEC_HADD(rho_dhSum2, pj2->density.rho_dh);
-    VEC_HADD(wcountSum2, pj2->density.wcount);
-    VEC_HADD(wcount_dhSum2, pj2->density.wcount_dh);
-    VEC_HADD(div_vSum2, pj2->density.div_v);
-    VEC_HADD(curlvxSum2, pj2->density.rot_v[0]);
-    VEC_HADD(curlvySum2, pj2->density.rot_v[1]);
-    VEC_HADD(curlvzSum2, pj2->density.rot_v[2]);
-
-  } /* loop over the parts in ci. */
-
-  TIMER_TOC(timer_dopair_density);
-
-#endif /* WITH_VECTORIZATION */
-}
-/**
- * @brief Compute the interactions between a cell pair (non-symmetric).
- *
- * @param r The #runner.
- * @param ci The first #cell.
- * @param cj The second #cell.
- */
-void runner_dopair1_density_auto_vec(struct runner *r, struct cell *ci,
-                                     struct cell *cj) {
-
-#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_AUTO_VEC)
-  const struct engine *restrict e = r->e;
-
-  TIMER_TIC;
-
-  /* Anything to do here? */
-  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
-
-#ifdef SWIFT_DEBUG_CHECKS
-  cell_is_drifted(ci, e);
-  cell_is_drifted(cj, e);
-#endif
-
-  /* Get the sort ID. */
-  double shift[3] = {0.0, 0.0, 0.0};
-  const int sid = space_getsid(e->s, &ci, &cj, shift);
-
-  /* Have the cells been sorted? */
-  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
-    error("Trying to interact unsorted cells.");
-
-  /* Get the cutoff shift. */
-  double rshift = 0.0;
-  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
-
-  /* Pick-out the sorted lists. */
-  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
-  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
-
-  /* Get some other useful values. */
-  const int count_i = ci->count;
-  const int count_j = cj->count;
-  struct part *restrict parts_i = ci->parts;
-  struct part *restrict parts_j = cj->parts;
-  const double di_max = sort_i[count_i - 1].d - rshift;
-  const double dj_min = sort_j[0].d;
-  const float dx_max = (ci->dx_max + cj->dx_max);
-
-  /* Get the particle cache from the runner and re-allocate
-   * the cache if it is not big enough for the cell. */
-  // struct cache *restrict ci_cache = &r->par_cache;
-
-  // if (ci_cache->count < count_i) {
-  //  cache_init(ci_cache, count_i);
-  //}
-  // if (cj_cache.count < count_j) {
-  //  cache_init(&cj_cache, count_j);
-  //}
-
-  // cache_read_two_cells(ci, cj, ci_cache, &cj_cache, shift);
-  cache_read_two_cells_sorted(ci, cj, &ci_cache, &cj_cache, sort_i, sort_j,
-                              shift);
-
-  /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
-  /* For particles in ci */
-  populate_max_d(ci, cj, sort_i, sort_j, &ci_cache, &cj_cache, dx_max, rshift,
-                 max_di, max_dj);
-
-  float di, dj;
-
-  int max_ind_j = count_j - 1;
-
-  /* Loop over the parts in ci. */
-  for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) {
-
-    /* Get a hold of the ith part in ci. */
-    struct part *restrict pi = &parts_i[sort_i[pid].i];
-    if (!part_is_active(pi, e)) continue;
-
-    dj = sort_j[max_ind_j].d;
-    while (max_ind_j > 0 && max_di[pid] < dj) {
-      max_ind_j--;
-
-      dj = sort_j[max_ind_j].d;
-    }
-    int exit_iteration = max_ind_j;
-
-    int ci_cache_idx = pid;  // sort_i[pid].i;
-
-    const float hi = ci_cache.h[ci_cache_idx];
-    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
-    if (di < dj_min) continue;
-
-    const float hig2 = hi * hi * kernel_gamma2;
-
-    float pix, piy, piz;
-    float vix, viy, viz;
-    float hi_inv;
-
-    /* Fill particle pi vectors. */
-    pix = ci_cache.x[ci_cache_idx];
-    piy = ci_cache.y[ci_cache_idx];
-    piz = ci_cache.z[ci_cache_idx];
-    vix = ci_cache.vx[ci_cache_idx];
-    viy = ci_cache.vy[ci_cache_idx];
-    viz = ci_cache.vz[ci_cache_idx];
-
-    /* Get the inverse of hi. */
-    hi_inv = 1.0f / hi;
-
-    /* Loop over the parts in cj. */
-    for (int pjd = 0; pjd <= exit_iteration; pjd++) {
-
-      /* Get the cache index to the jth particle. */
-      // int cj_cache_idx = pjd; //sort_j[pjd].i;
-
-      float dx, dy, dz, r2;
-
-      /* Compute the pairwise distance. */
-      dx = pix - cj_cache->x[pjd];
-      dy = piy - cj_cache->y[pjd];
-      dz = piz - cj_cache->z[pjd];
-
-      r2 = dx * dx + dy * dy + dz * dz;
-
-      runner_iact_nonsym_density_jsw(
-          r2, hig2, dx, dy, dz, hi_inv, cj_cache->h[pjd], vix, viy, viz,
-          cj_cache->vx[pjd], cj_cache->vy[pjd], cj_cache->vz[pjd],
-          cj_cache->m[pjd], &ci_cache.rho[pid], &ci_cache.rho_dh[pid],
-          &ci_cache.wcount[pid], &ci_cache.wcount_dh[pid], &ci_cache.div_v[pid],
-          &ci_cache.curl_vx[pid], &ci_cache.curl_vy[pid],
-          &ci_cache.curl_vz[pid]);
-
-    } /* loop over the parts in cj. */
-
-  } /* loop over the parts in ci. */
-
-  int max_ind_i = 0;
-  /* Loop over the parts in cj. */
-  for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) {
-
-    /* Get a hold of the jth part in cj. */
-    struct part *restrict pj = &parts_j[sort_j[pjd].i];
-    if (!part_is_active(pj, e)) continue;
-
-    di = sort_i[max_ind_i].d;
-    while (max_ind_i < count_i - 1 && max_dj[pjd] > di) {
-      max_ind_i++;
-
-      di = sort_i[max_ind_i].d;
-    }
-    int exit_iteration = max_ind_i;
-
-    int cj_cache_idx = pjd;
-
-    const float hj = cj_cache->h[cj_cache_idx];
-    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
-    if (dj > di_max) continue;
-
-    const float hjg2 = hj * hj * kernel_gamma2;
-
-    float pjx, pjy, pjz;
-    float vjx, vjy, vjz;
-    float hj_inv;
-
-    /* Fill particle pi vectors. */
-    pjx = cj_cache->x[cj_cache_idx];
-    pjy = cj_cache->y[cj_cache_idx];
-    pjz = cj_cache->z[cj_cache_idx];
-    vjx = cj_cache->vx[cj_cache_idx];
-    vjy = cj_cache->vy[cj_cache_idx];
-    vjz = cj_cache->vz[cj_cache_idx];
-
-    /* Get the inverse of hj. */
-    hj_inv = 1.0f / hj;
-
-    /* Loop over the parts in ci. */
-    for (int pid = exit_iteration; pid < count_i; pid++) {
-
-      /* Get the cache index to the ith particle. */
-      int ci_cache_idx = pid;  // sort_i[pid].i;
-
-      float dx, dy, dz, r2;
-
-      /* Compute the pairwise distance. */
-      dx = pjx - ci_cache.x[ci_cache_idx];
-      dy = pjy - ci_cache.y[ci_cache_idx];
-      dz = pjz - ci_cache.z[ci_cache_idx];
-
-      r2 = dx * dx + dy * dy + dz * dz;
-
-      runner_iact_nonsym_density_jsw(
-          r2, hjg2, dx, dy, dz, hj_inv, ci_cache.h[ci_cache_idx], vjx, vjy, vjz,
-          ci_cache.vx[ci_cache_idx], ci_cache.vy[ci_cache_idx],
-          ci_cache.vz[ci_cache_idx], ci_cache.m[ci_cache_idx],
-          &cj_cache.rho[cj_cache_idx], &cj_cache.rho_dh[cj_cache_idx],
-          &cj_cache.wcount[cj_cache_idx], &cj_cache.wcount_dh[cj_cache_idx],
-          &cj_cache.div_v[cj_cache_idx], &cj_cache.curl_vx[cj_cache_idx],
-          &cj_cache.curl_vy[cj_cache_idx], &cj_cache.curl_vz[cj_cache_idx]);
-
-    } /* loop over the parts in ci. */
-
-  } /* loop over the parts in cj. */
-
-  cache_write_sorted_particles(&ci_cache, &cj_cache, ci, cj, sort_i, sort_j);
-
-  TIMER_TOC(timer_dopair_density);
-
-#endif /* WITH_VECTORIZATION */
-}
diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h
index 0a583aec27cc3d34cec08c1e1da9b5b9a01f40a7..b538f2ab2ebceac69c8a49c29de37f2343abeb7b 100644
--- a/src/runner_doiact_vec.h
+++ b/src/runner_doiact_vec.h
@@ -39,7 +39,5 @@ void runner_doself1_density_vec(struct runner *r, struct cell *restrict c);
 void runner_doself1_density_vec_2(struct runner *r, struct cell *restrict c);
 void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci,
                                 struct cell *restrict cj);
-void runner_dopair1_density_vec_2(struct runner *r, struct cell *restrict ci,
-                                  struct cell *restrict cj);
 
 #endif /* SWIFT_RUNNER_VEC_H */
diff --git a/src/vector.h b/src/vector.h
index 6c18b3590f34b55388dbc4a5151e633eb4a5673d..5582eecf93b6e7a9f82b885cda1f6c70d6bf059a 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -23,8 +23,6 @@
 /* Have I already read this file? */
 #ifndef VEC_MACRO
 
-/*TODO: Tidy this file up with comments.*/
-
 #include "../config.h"
 
 #ifdef WITH_VECTORIZATION
diff --git a/tests/test27cells.c b/tests/test27cells.c
index feb13f1952461a73f32ae59d29e917766796ff11..9a8eb1c561a9c68dc6e53b219dc442c0aa1259e1 100644
--- a/tests/test27cells.c
+++ b/tests/test27cells.c
@@ -34,7 +34,9 @@
 
 #if defined(WITH_VECTORIZATION)
 #define DOSELF1 runner_doself1_density_vec
+#define DOPAIR1 runner_dopair1_density_vec
 #define DOSELF1_NAME "runner_doself1_density_vec"
+#define DOPAIR1_NAME "runner_dopair1_density_vec"
 #endif
 
 #ifndef DOSELF1
@@ -42,41 +44,6 @@
 #define DOSELF1_NAME "runner_doself1_density"
 #endif
 
-#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC)
-#define DOPAIR1 runner_dopair1_density_vec
-#define DOPAIR1_NAME "runner_dopair1_density_vec"
-#endif
-
-#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC_1)
-#define DOPAIR1 runner_dopair1_density_vec_1
-#define DOPAIR1_NAME "runner_dopair1_density_vec_1"
-#endif
-
-#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC_2)
-#define DOPAIR1 runner_dopair1_density_vec_2
-#define DOPAIR1_NAME "runner_dopair1_density_vec_2"
-#endif
-
-#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC_3)
-#define DOPAIR1 runner_dopair1_density_vec_3
-#define DOPAIR1_NAME "runner_dopair1_density_vec_3"
-#endif
-
-#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC_4)
-#define DOPAIR1 runner_dopair1_density_vec_4
-#define DOPAIR1_NAME "runner_dopair1_density_vec_4"
-#endif
-
-#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_AUTO_VEC)
-#define DOPAIR1 runner_dopair1_density_auto_vec
-#define DOPAIR1_NAME "runner_dopair1_density_auto_vec"
-#endif
-
-#if defined(DOPAIR1_NOSORT_JSW)
-#define DOPAIR1 runner_dopair1_nosort_density
-#define DOPAIR1_NAME "runner_dopair1_nosort_density"
-#endif
-
 #ifndef DOPAIR1
 #define DOPAIR1 runner_dopair1_density
 #define DOPAIR1_NAME "runner_dopair1_density"
@@ -334,23 +301,11 @@ int check_results(struct part *serial_parts, struct part *vec_parts, int count,
 }
 
 /* Just a forward declaration... */
+void runner_doself1_density(struct runner *r, struct cell *ci);
+void runner_doself1_density_vec(struct runner *r, struct cell *ci);
 void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
-void runner_dopair1_nosort_density(struct runner *r, struct cell *ci,
-                                   struct cell *cj);
 void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
                                 struct cell *cj);
-void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci,
-                                  struct cell *cj);
-void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci,
-                                  struct cell *cj);
-void runner_dopair1_density_vec_3(struct runner *r, struct cell *ci,
-                                  struct cell *cj);
-void runner_dopair1_density_vec_4(struct runner *r, struct cell *ci,
-                                  struct cell *cj);
-void runner_dopair1_density_auto_vec(struct runner *r, struct cell *ci,
-                                     struct cell *cj);
-void runner_doself1_density(struct runner *r, struct cell *ci);
-void runner_doself1_density_vec(struct runner *r, struct cell *ci);
 
 /* And go... */
 int main(int argc, char *argv[]) {
@@ -492,8 +447,6 @@ int main(int argc, char *argv[]) {
     cache_init(&runner.ci_cache, 512);
     runner.cj_cache.count = 0;
     cache_init(&runner.cj_cache, 512);
-// cj_cache.count = 0;
-// cache_init(&cj_cache, 512);
 #endif
 
     /* Run all the pairs */
@@ -582,9 +535,9 @@ int main(int argc, char *argv[]) {
   dump_particle_fields(outputFileName, main_cell, cells);
 
   /* Check serial results against the vectorised results. */
-  // if (check_results(main_cell->parts, vec_parts, main_cell->count,
-  // threshold))
-  //  message("Differences found...");
+   if (check_results(main_cell->parts, vec_parts, main_cell->count,
+   threshold))
+    message("Differences found...");
 
   /* Output timing */
   message("Brute force calculation took : %15lli ticks.", toc - tic);