diff --git a/src/cache.h b/src/cache.h
index 19d61b657b3aa1fe8675ee413fcde146071381e9..52534ecdc81af9707bb98cec913972967a8a1ce2 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -26,9 +26,11 @@
 #include "cell.h"
 #include "error.h"
 #include "part.h"
+#include "sort_part.h"
 #include "vector.h"
 
 #define NUM_VEC_PROC 2
+#define CACHE_ALIGN 64
 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)
 #define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE
 
@@ -37,28 +39,31 @@
 struct cache {
 
   /* Particle x position. */
-  float *restrict x __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  float *restrict x __attribute__((aligned(CACHE_ALIGN)));
 
   /* Particle y position. */
-  float *restrict y __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  float *restrict y __attribute__((aligned(CACHE_ALIGN)));
 
   /* Particle z position. */
-  float *restrict z __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  float *restrict z __attribute__((aligned(CACHE_ALIGN)));
 
   /* Particle smoothing length. */
-  float *restrict h __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  float *restrict h __attribute__((aligned(CACHE_ALIGN)));
 
   /* Particle mass. */
-  float *restrict m __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  float *restrict m __attribute__((aligned(CACHE_ALIGN)));
 
   /* Particle x velocity. */
-  float *restrict vx __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  float *restrict vx __attribute__((aligned(CACHE_ALIGN)));
 
   /* Particle y velocity. */
-  float *restrict vy __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  float *restrict vy __attribute__((aligned(CACHE_ALIGN)));
 
   /* Particle z velocity. */
-  float *restrict vz __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
+
+  /* Maximum distance of particles into neighbouring cell. */
+  float *restrict max_d __attribute__((aligned(CACHE_ALIGN)));
 
   /* Cache size. */
   int count;
@@ -102,10 +107,12 @@ struct c2_cache {
 __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
                                                       size_t count) {
 
-  /* Align cache on correct byte boundary and pad cache size to include 2 vector
-   * lengths for remainder operations. */
-  unsigned long alignment = sizeof(float) * VEC_SIZE;
-  unsigned int sizeBytes = (count + (2 * VEC_SIZE)) * sizeof(float);
+  /* Align cache on correct byte boundary and pad cache size to be a multiple of
+   * the vector size
+   * and include 2 vector lengths for remainder operations. */
+  unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
+  if (rem > 0) pad += VEC_SIZE - rem;
+  unsigned int sizeBytes = (count + pad) * sizeof(float);
   int error = 0;
 
   /* Free memory if cache has already been allocated. */
@@ -118,16 +125,18 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
     free(c->vy);
     free(c->vz);
     free(c->h);
+    free(c->max_d);
   }
 
-  error += posix_memalign((void **)&c->x, alignment, sizeBytes);
-  error += posix_memalign((void **)&c->y, alignment, sizeBytes);
-  error += posix_memalign((void **)&c->z, alignment, sizeBytes);
-  error += posix_memalign((void **)&c->m, alignment, sizeBytes);
-  error += posix_memalign((void **)&c->vx, alignment, sizeBytes);
-  error += posix_memalign((void **)&c->vy, alignment, sizeBytes);
-  error += posix_memalign((void **)&c->vz, alignment, sizeBytes);
-  error += posix_memalign((void **)&c->h, alignment, sizeBytes);
+  error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes);
+  error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes);
+  error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes);
+  error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes);
+  error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes);
+  error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes);
+  error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
+  error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
+  error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes);
 
   if (error != 0)
     error("Couldn't allocate cache, no. of particles: %d", (int)count);
@@ -145,8 +154,11 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
 
 #if defined(GADGET2_SPH)
 
-  /* Shift the particles positions to a local frame so single precision can be
-   * used instead of double precision. */
+/* Shift the particles positions to a local frame so single precision can be
+ * used instead of double precision. */
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
+#pragma vector aligned
+#endif
   for (int i = 0; i < ci->count; i++) {
     ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
     ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
@@ -163,7 +175,247 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
 }
 
 /**
- * @brief Clean the memory allocated by a #cache object.
+ * @brief Populate cache by reading in the particles from two cells in unsorted
+ * order.
+ *
+ * @param ci The i #cell.
+ * @param cj The j #cell.
+ * @param ci_cache The cache for cell ci.
+ * @param cj_cache The cache for cell cj.
+ * @param shift The amount to shift the particle positions to account for BCs
+ */
+__attribute__((always_inline)) INLINE void cache_read_two_cells(
+    const struct cell *const ci, const struct cell *const cj,
+    struct cache *const ci_cache, struct cache *const cj_cache,
+    const double *const shift) {
+
+  /* Shift the particles positions to a local frame (ci frame) so single
+   * precision can be
+   * used instead of double precision. Also shift the cell ci, particles
+   * positions due to BCs but leave cell cj. */
+  for (int i = 0; i < ci->count; i++) {
+    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0] - shift[0];
+    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1] - shift[1];
+    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2] - shift[2];
+    ci_cache->h[i] = ci->parts[i].h;
+
+    ci_cache->m[i] = ci->parts[i].mass;
+    ci_cache->vx[i] = ci->parts[i].v[0];
+    ci_cache->vy[i] = ci->parts[i].v[1];
+    ci_cache->vz[i] = ci->parts[i].v[2];
+  }
+
+  for (int i = 0; i < cj->count; i++) {
+    cj_cache->x[i] = cj->parts[i].x[0] - ci->loc[0];
+    cj_cache->y[i] = cj->parts[i].x[1] - ci->loc[1];
+    cj_cache->z[i] = cj->parts[i].x[2] - ci->loc[2];
+    cj_cache->h[i] = cj->parts[i].h;
+
+    cj_cache->m[i] = cj->parts[i].mass;
+    cj_cache->vx[i] = cj->parts[i].v[0];
+    cj_cache->vy[i] = cj->parts[i].v[1];
+    cj_cache->vz[i] = cj->parts[i].v[2];
+  }
+}
+
+__attribute__((always_inline)) INLINE void cache_read_cell_sorted(
+    const struct cell *const ci, struct cache *const ci_cache,
+    const struct entry *restrict sort_i, double *const loc,
+    double *const shift) {
+
+  int idx;
+/* Shift the particles positions to a local frame (ci frame) so single precision
+ * can be
+ * used instead of double precision. Also shift the cell ci, particles positions
+ * due to BCs but leave cell cj. */
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
+#pragma simd
+#endif
+  for (int i = 0; i < ci->count; i++) {
+    idx = sort_i[i].i;
+
+    ci_cache->x[i] = ci->parts[idx].x[0] - loc[0] - shift[0];
+    ci_cache->y[i] = ci->parts[idx].x[1] - loc[1] - shift[1];
+    ci_cache->z[i] = ci->parts[idx].x[2] - loc[2] - shift[2];
+    ci_cache->h[i] = ci->parts[idx].h;
+
+    ci_cache->m[i] = ci->parts[idx].mass;
+    ci_cache->vx[i] = ci->parts[idx].v[0];
+    ci_cache->vy[i] = ci->parts[idx].v[1];
+    ci_cache->vz[i] = ci->parts[idx].v[2];
+  }
+}
+
+/**
+ * @brief Populate cache by reading in the particles from two cells in sorted
+ * order.
+ *
+ * @param ci The i #cell.
+ * @param cj The j #cell.
+ * @param ci_cache The #cache for cell ci.
+ * @param cj_cache The #cache for cell cj.
+ * @param sort_i The array of sorted particle indices for cell ci.
+ * @param sort_j The array of sorted particle indices for cell ci.
+ * @param shift The amount to shift the particle positions to account for BCs
+ */
+__attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
+    const struct cell *const ci, const struct cell *const cj,
+    struct cache *const ci_cache, struct cache *const cj_cache,
+    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
+    const double *const shift) {
+
+  int idx;
+/* Shift the particles positions to a local frame (ci frame) so single precision
+ * can be
+ * used instead of double precision. Also shift the cell ci, particles positions
+ * due to BCs but leave cell cj. */
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
+#pragma simd
+#endif
+  for (int i = 0; i < ci->count; i++) {
+    idx = sort_i[i].i;
+    ci_cache->x[i] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
+    ci_cache->y[i] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
+    ci_cache->z[i] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
+    ci_cache->h[i] = ci->parts[idx].h;
+
+    ci_cache->m[i] = ci->parts[idx].mass;
+    ci_cache->vx[i] = ci->parts[idx].v[0];
+    ci_cache->vy[i] = ci->parts[idx].v[1];
+    ci_cache->vz[i] = ci->parts[idx].v[2];
+  }
+
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
+#pragma simd
+#endif
+  for (int i = 0; i < cj->count; i++) {
+    idx = sort_j[i].i;
+    cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
+    cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
+    cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
+    cj_cache->h[i] = cj->parts[idx].h;
+
+    cj_cache->m[i] = cj->parts[idx].mass;
+    cj_cache->vx[i] = cj->parts[idx].v[0];
+    cj_cache->vy[i] = cj->parts[idx].v[1];
+    cj_cache->vz[i] = cj->parts[idx].v[2];
+  }
+}
+
+/**
+ * @brief Populate caches by only reading particles that are within range of
+ * each other within the adjoining cell.Also read the particles into the cache
+ * in sorted order.
+ *
+ * @param ci The i #cell.
+ * @param cj The j #cell.
+ * @param ci_cache The #cache for cell ci.
+ * @param cj_cache The #cache for cell cj.
+ * @param sort_i The array of sorted particle indices for cell ci.
+ * @param sort_j The array of sorted particle indices for cell ci.
+ * @param shift The amount to shift the particle positions to account for BCs
+ * @param first_pi The first particle in cell ci that is in range.
+ * @param last_pj The last particle in cell cj that is in range.
+ * @param num_vec_proc Number of vectors that will be used to process the
+ * interaction.
+ */
+__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
+    const struct cell *const ci, const struct cell *const cj,
+    struct cache *const ci_cache, struct cache *const cj_cache,
+    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
+    const double *const shift, int *first_pi, int *last_pj,
+    const int num_vec_proc) {
+
+  int idx, ci_cache_idx;
+  /* Pad number of particles read to the vector size. */
+  int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
+  if (rem != 0) {
+    int pad = (num_vec_proc * VEC_SIZE) - rem;
+
+    if (*first_pi - pad >= 0) *first_pi -= pad;
+  }
+
+  rem = *last_pj % (num_vec_proc * VEC_SIZE);
+  if (rem != 0) {
+    int pad = (num_vec_proc * VEC_SIZE) - rem;
+
+    if (*last_pj + pad < cj->count) *last_pj += pad;
+  }
+
+  int first_pi_align = *first_pi;
+  int last_pj_align = *last_pj;
+
+/* Shift the particles positions to a local frame (ci frame) so single precision
+ * can be
+ * used instead of double precision. Also shift the cell ci, particles positions
+ * due to BCs but leave cell cj. */
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
+#pragma vector aligned
+#endif
+  for (int i = first_pi_align; i < ci->count; i++) {
+    /* Make sure ci_cache is filled from the first element. */
+    ci_cache_idx = i - first_pi_align;
+    idx = sort_i[i].i;
+    ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
+    ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
+    ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
+    ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
+
+    ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
+    ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
+    ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
+    ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
+  }
+
+  /* Pad cache with fake particles that exist outside the cell so will not
+   * interact.*/
+  float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0];
+  for (int i = ci->count - first_pi_align;
+       i < ci->count - first_pi_align + VEC_SIZE; i++) {
+    ci_cache->x[i] = fake_pix;
+    ci_cache->y[i] = 1.f;
+    ci_cache->z[i] = 1.f;
+    ci_cache->h[i] = 1.f;
+
+    ci_cache->m[i] = 1.f;
+    ci_cache->vx[i] = 1.f;
+    ci_cache->vy[i] = 1.f;
+    ci_cache->vz[i] = 1.f;
+  }
+
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
+#pragma vector aligned
+#endif
+  for (int i = 0; i <= last_pj_align; i++) {
+    idx = sort_j[i].i;
+    cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
+    cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
+    cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
+    cj_cache->h[i] = cj->parts[idx].h;
+
+    cj_cache->m[i] = cj->parts[idx].mass;
+    cj_cache->vx[i] = cj->parts[idx].v[0];
+    cj_cache->vy[i] = cj->parts[idx].v[1];
+    cj_cache->vz[i] = cj->parts[idx].v[2];
+  }
+
+  /* Pad cache with fake particles that exist outside the cell so will not
+   * interact.*/
+  float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
+  for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
+    cj_cache->x[i] = fake_pjx;
+    cj_cache->y[i] = 1.f;
+    cj_cache->z[i] = 1.f;
+    cj_cache->h[i] = 1.f;
+
+    cj_cache->m[i] = 1.f;
+    cj_cache->vx[i] = 1.f;
+    cj_cache->vy[i] = 1.f;
+    cj_cache->vz[i] = 1.f;
+  }
+}
+
+/* @brief Clean the memory allocated by a #cache object.
  *
  * @param c The #cache to clean.
  */
@@ -177,6 +429,7 @@ static INLINE void cache_clean(struct cache *c) {
     free(c->vy);
     free(c->vz);
     free(c->h);
+    free(c->max_d);
   }
 }
 
diff --git a/src/engine.c b/src/engine.c
index 3c8ec68160c0aca9fd8c009cc0a8fdd5df88f28e..ab14af5c2e3fee19fcf2ec5963788aa8338d0568 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -4264,9 +4264,11 @@ void engine_init(struct engine *e, struct space *s,
       e->runners[k].qid = k * nr_queues / e->nr_threads;
     }
 
-    /* Allocate particle cache. */
-    e->runners[k].par_cache.count = 0;
-    cache_init(&e->runners[k].par_cache, CACHE_SIZE);
+    /* Allocate particle caches. */
+    e->runners[k].ci_cache.count = 0;
+    e->runners[k].cj_cache.count = 0;
+    cache_init(&e->runners[k].ci_cache, CACHE_SIZE);
+    cache_init(&e->runners[k].cj_cache, CACHE_SIZE);
 
     if (verbose) {
       if (with_aff)
@@ -4353,7 +4355,8 @@ void engine_compute_next_snapshot_time(struct engine *e) {
  */
 void engine_clean(struct engine *e) {
 
-  for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].par_cache);
+  for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].ci_cache);
+  for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].cj_cache);
   free(e->runners);
   free(e->snapshotUnits);
   free(e->links);
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index 3fef18b4f487f1734a5f93c4bad46cf4e6968240..ae4ee27f5ed83350aaf1a0bfd0069d7b85701854 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -32,6 +32,7 @@
  * Gadget-2 tree-code neighbours search.
  */
 
+#include "cache.h"
 #include "minmax.h"
 
 /**
@@ -150,7 +151,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density(
   for (k = 0; k < 3; k++)
     dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
 #else
-  error("Unknown vector size.")
+  error("Unknown vector size.");
 #endif
 
   /* Get the radius and inverse radius. */
@@ -317,7 +318,7 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
   for (k = 0; k < 3; k++)
     dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
 #else
-  error("Unknown vector size.")
+  error("Unknown vector size.");
 #endif
 
   /* Get the radius and inverse radius. */
@@ -374,6 +375,106 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
 }
 
 #ifdef WITH_VECTORIZATION
+
+/**
+ * @brief Density interaction computed using 1 vector
+ * (non-symmetric vectorized version).
+ */
+__attribute__((always_inline)) INLINE static void
+runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
+                                 vector hi_inv, vector vix, vector viy,
+                                 vector viz, float *Vjx, float *Vjy, float *Vjz,
+                                 float *Mj, vector *rhoSum, vector *rho_dhSum,
+                                 vector *wcountSum, vector *wcount_dhSum,
+                                 vector *div_vSum, vector *curlvxSum,
+                                 vector *curlvySum, vector *curlvzSum,
+                                 vector mask, int knlMask) {
+
+  vector r, ri, xi, wi, wi_dx;
+  vector mj;
+  vector dvx, dvy, dvz;
+  vector vjx, vjy, vjz;
+  vector dvdr;
+  vector curlvrx, curlvry, curlvrz;
+
+  /* Fill the vectors. */
+  mj.v = vec_load(Mj);
+  vjx.v = vec_load(Vjx);
+  vjy.v = vec_load(Vjy);
+  vjz.v = vec_load(Vjz);
+
+  /* Get the radius and inverse radius. */
+  ri = vec_reciprocal_sqrt(*r2);
+  r.v = vec_mul(r2->v, ri.v);
+
+  xi.v = vec_mul(r.v, hi_inv.v);
+
+  /* Calculate the kernel for two particles. */
+  kernel_deval_1_vec(&xi, &wi, &wi_dx);
+
+  /* Compute dv. */
+  dvx.v = vec_sub(vix.v, vjx.v);
+  dvy.v = vec_sub(viy.v, vjy.v);
+  dvz.v = vec_sub(viz.v, vjz.v);
+
+  /* Compute dv dot r */
+  dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v)));
+  dvdr.v = vec_mul(dvdr.v, ri.v);
+
+  /* Compute dv cross r */
+  curlvrx.v =
+      vec_fma(dvy.v, dz->v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy->v)));
+  curlvry.v =
+      vec_fma(dvz.v, dx->v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz->v)));
+  curlvrz.v =
+      vec_fma(dvx.v, dy->v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx->v)));
+  curlvrx.v = vec_mul(curlvrx.v, ri.v);
+  curlvry.v = vec_mul(curlvry.v, ri.v);
+  curlvrz.v = vec_mul(curlvrz.v, ri.v);
+
+/* Mask updates to intermediate vector sums for particle pi. */
+#ifdef HAVE_AVX512_F
+  rhoSum->v =
+      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
+
+  rho_dhSum->v =
+      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
+                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
+                                               vec_mul(xi.v, wi_dx.v))));
+
+  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
+
+  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
+                                       wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));
+
+  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
+                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
+
+  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
+                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
+                                    curlvxSum->v);
+
+  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
+                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
+                                    curlvySum->v);
+
+  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
+                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
+                                    curlvzSum->v);
+#else
+  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
+  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
+                                                vec_mul(xi.v, wi_dx.v))),
+                          mask.v);
+  wcountSum->v += vec_and(wi.v, mask.v);
+  wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
+  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
+  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
+  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
+  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
+#endif
+}
+
 /**
  * @brief Density interaction computed using 2 interleaved vectors
  * (non-symmetric vectorized version).
@@ -744,7 +845,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
               vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
                       pj[2]->force.balsara, pj[3]->force.balsara);
 #else
-  error("Unknown vector size.")
+  error("Unknown vector size.");
 #endif
 
   /* Get the radius and inverse radius. */
@@ -1023,7 +1124,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
               vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
                       pj[2]->force.balsara, pj[3]->force.balsara);
 #else
-  error("Unknown vector size.")
+  error("Unknown vector size.");
 #endif
 
   /* Get the radius and inverse radius. */
diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h
index ceaa3b7b46279e3dfe063bbda34d97233f33acbb..f634a59d7ee769951e6560d46a92053c144cc766 100644
--- a/src/kernel_hydro.h
+++ b/src/kernel_hydro.h
@@ -304,6 +304,41 @@ __attribute__((always_inline)) INLINE static void kernel_eval(
   *W = w * kernel_constant * kernel_gamma_inv_dim;
 }
 
+/**
+ * @brief Computes the kernel function derivative.
+ *
+ * The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient
+ * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem.
+ *
+ * Returns 0 if \f$u > \gamma = H/h\f$.
+ *
+ * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
+ * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$.
+ */
+__attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
+    float u, float *restrict dW_dx) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  const float x = u * kernel_gamma_inv;
+
+  /* Pick the correct branch of the kernel */
+  const int temp = (int)(x * kernel_ivals_f);
+  const int ind = temp > kernel_ivals ? kernel_ivals : temp;
+  const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+
+  /* First two terms of the polynomial ... */
+  float dw_dx = ((float)kernel_degree * coeffs[0] * x) +
+                (float)(kernel_degree - 1) * coeffs[1];
+
+  /* ... and the rest of them */
+  for (int k = 2; k < kernel_degree; k++) {
+    dw_dx = dw_dx * x + (float)(kernel_degree - k) * coeffs[k];
+  }
+
+  /* Return everything */
+  *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one;
+}
+
 /* ------------------------------------------------------------------------- */
 
 #ifdef WITH_VECTORIZATION
@@ -362,7 +397,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
       dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
 }
 
-/* Define constant vectors for the Wendland C2 kernel coefficients. */
+/* Define constant vectors for the Wendland C2 and Cubic Spline kernel
+ * coefficients. */
 #ifdef WENDLAND_C2_KERNEL
 static const vector wendland_const_c0 = FILL_VEC(4.f);
 static const vector wendland_const_c1 = FILL_VEC(-15.f);
@@ -370,8 +406,115 @@ static const vector wendland_const_c2 = FILL_VEC(20.f);
 static const vector wendland_const_c3 = FILL_VEC(-10.f);
 static const vector wendland_const_c4 = FILL_VEC(0.f);
 static const vector wendland_const_c5 = FILL_VEC(1.f);
+
+static const vector wendland_dwdx_const_c0 = FILL_VEC(20.f);
+static const vector wendland_dwdx_const_c1 = FILL_VEC(-60.f);
+static const vector wendland_dwdx_const_c2 = FILL_VEC(60.f);
+static const vector wendland_dwdx_const_c3 = FILL_VEC(-20.f);
+#elif defined(CUBIC_SPLINE_KERNEL)
+/* First region 0 < u < 0.5 */
+static const vector cubic_1_const_c0 = FILL_VEC(3.f);
+static const vector cubic_1_const_c1 = FILL_VEC(-3.f);
+static const vector cubic_1_const_c2 = FILL_VEC(0.f);
+static const vector cubic_1_const_c3 = FILL_VEC(0.5f);
+static const vector cubic_1_dwdx_const_c0 = FILL_VEC(9.f);
+static const vector cubic_1_dwdx_const_c1 = FILL_VEC(-6.f);
+static const vector cubic_1_dwdx_const_c2 = FILL_VEC(0.f);
+
+/* Second region 0.5 <= u < 1 */
+static const vector cubic_2_const_c0 = FILL_VEC(-1.f);
+static const vector cubic_2_const_c1 = FILL_VEC(3.f);
+static const vector cubic_2_const_c2 = FILL_VEC(-3.f);
+static const vector cubic_2_const_c3 = FILL_VEC(1.f);
+static const vector cubic_2_dwdx_const_c0 = FILL_VEC(-3.f);
+static const vector cubic_2_dwdx_const_c1 = FILL_VEC(6.f);
+static const vector cubic_2_dwdx_const_c2 = FILL_VEC(-3.f);
+static const vector cond = FILL_VEC(0.5f);
+#endif
+
+/**
+ * @brief Computes the kernel function and its derivative for two particles
+ * using vectors.
+ *
+ * Return 0 if $u > \\gamma = H/h$
+ *
+ * @param u The ratio of the distance to the smoothing length $u = x/h$.
+ * @param w (return) The value of the kernel function $W(x,h)$.
+ * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
+ */
+__attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
+    vector *u, vector *w, vector *dw_dx) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  vector x;
+  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
+
+#ifdef WENDLAND_C2_KERNEL
+  /* Init the iteration for Horner's scheme. */
+  w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v);
+  dw_dx->v = wendland_const_c0.v;
+
+  /* Calculate the polynomial interleaving vector operations */
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  w->v = vec_fma(x.v, w->v, wendland_const_c2.v);
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
+#elif defined(CUBIC_SPLINE_KERNEL)
+  vector w2, dw_dx2;
+  vector mask_reg1, mask_reg2;
+
+  /* Form a mask for each part of the kernel. */
+  mask_reg1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
+  mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
+  ;
+
+  /* Work out w for both regions of the kernel and combine the results together
+   * using masks. */
+
+  /* Init the iteration for Horner's scheme. */
+  w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
+  w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
+  dw_dx->v = cubic_1_const_c0.v;
+  dw_dx2.v = cubic_2_const_c0.v;
+
+  /* Calculate the polynomial interleaving vector operations. */
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
+  w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
+  w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
+  w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
+  w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
+
+  /* Mask out unneeded values. */
+  w->v = vec_and(w->v, mask_reg1.v);
+  w2.v = vec_and(w2.v, mask_reg2.v);
+  dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
+  dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v);
+
+  /* Added both w and w2 together to form complete result. */
+  w->v = vec_add(w->v, w2.v);
+  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
+#else
+#error
 #endif
 
+  /* Return everything */
+  w->v =
+      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
+}
+
 /**
  * @brief Computes the kernel function and its derivative for two particles
  * using interleaved vectors.
@@ -417,8 +560,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
 
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
   dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
-  w->v = vec_fma(x.v, w->v, wendland_const_c4.v);
-  w2->v = vec_fma(x2.v, w2->v, wendland_const_c4.v);
+  w->v = vec_mul(x.v, w->v);    /* wendland_const_c4 is zero. */
+  w2->v = vec_mul(x2.v, w2->v); /* wendland_const_c4 is zero. */
 
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
   dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
@@ -434,34 +577,67 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
                                        kernel_gamma_inv_dim_plus_one_vec.v));
   dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v,
                                          kernel_gamma_inv_dim_plus_one_vec.v));
-#else
-
-  /* Load x and get the interval id. */
-  vector ind, ind2;
-  ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v));
-  ind2.m = vec_ftoi(vec_fmin(x2.v * kernel_ivals_vec.v, kernel_ivals_vec.v));
-
-  /* load the coefficients. */
-  vector c[kernel_degree + 1], c2[kernel_degree + 1];
-  for (int k = 0; k < VEC_SIZE; k++)
-    for (int j = 0; j < kernel_degree + 1; j++) {
-      c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
-      c2[j].f[k] = kernel_coeffs[ind2.i[k] * (kernel_degree + 1) + j];
-    }
+#elif defined(CUBIC_SPLINE_KERNEL)
+  vector w_2, dw_dx_2;
+  vector w2_2, dw_dx2_2;
+  vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
+
+  /* Form a mask for each part of the kernel. */
+  mask_reg1.v = vec_cmp_lt(x.v, cond.v);     /* 0 < x < 0.5 */
+  mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
+  mask_reg2.v = vec_cmp_gte(x.v, cond.v);    /* 0.5 < x < 1 */
+  ;
+  mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */
+  ;
+
+  /* Work out w for both regions of the kernel and combine the results together
+   * using masks. */
 
   /* Init the iteration for Horner's scheme. */
-  w->v = (c[0].v * x.v) + c[1].v;
-  w2->v = (c2[0].v * x2.v) + c2[1].v;
-  dw_dx->v = c[0].v;
-  dw_dx2->v = c2[0].v;
+  w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
+  w2->v = vec_fma(cubic_1_const_c0.v, x2.v, cubic_1_const_c1.v);
+  w_2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
+  w2_2.v = vec_fma(cubic_2_const_c0.v, x2.v, cubic_2_const_c1.v);
+  dw_dx->v = cubic_1_const_c0.v;
+  dw_dx2->v = cubic_1_const_c0.v;
+  dw_dx_2.v = cubic_2_const_c0.v;
+  dw_dx2_2.v = cubic_2_const_c0.v;
+
+  /* Calculate the polynomial interleaving vector operations. */
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
+  dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v);
+  dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v);
+  w->v = vec_mul(x.v, w->v);    /* cubic_1_const_c2 is zero. */
+  w2->v = vec_mul(x2.v, w2->v); /* cubic_1_const_c2 is zero. */
+  w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c2.v);
+  w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c2.v);
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
+  dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v);
+  dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v);
+  w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
+  w2->v = vec_fma(x2.v, w2->v, cubic_1_const_c3.v);
+  w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c3.v);
+  w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v);
+
+  /* Mask out unneeded values. */
+  w->v = vec_and(w->v, mask_reg1.v);
+  w2->v = vec_and(w2->v, mask_reg1_v2.v);
+  w_2.v = vec_and(w_2.v, mask_reg2.v);
+  w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v);
+  dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
+  dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v);
+  dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v);
+  dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v);
+
+  /* Added both w and w2 together to form complete result. */
+  w->v = vec_add(w->v, w_2.v);
+  w2->v = vec_add(w2->v, w2_2.v);
+  dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v);
+  dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v);
 
-  /* And we're off! */
-  for (int k = 2; k <= kernel_degree; k++) {
-    dw_dx->v = (dw_dx->v * x.v) + w->v;
-    dw_dx2->v = (dw_dx2->v * x2.v) + w2->v;
-    w->v = (x.v * w->v) + c[k].v;
-    w2->v = (x2.v * w2->v) + c2[k].v;
-  }
   /* Return everything */
   w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
   w2->v = w2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
@@ -473,8 +649,133 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
 #endif
 }
 
+/**
+ * @brief Computes the kernel function for two particles
+ * using vectors.
+ *
+ * Return 0 if $u > \\gamma = H/h$
+ *
+ * @param u The ratio of the distance to the smoothing length $u = x/h$.
+ * @param w (return) The value of the kernel function $W(x,h)$.
+ */
+__attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
+                                                                    vector *w) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  vector x;
+  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
+
+#ifdef WENDLAND_C2_KERNEL
+  /* Init the iteration for Horner's scheme. */
+  w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations */
+  w->v = vec_fma(x.v, w->v, wendland_const_c2.v);
+  w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
+  w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero.*/
+  w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
+#elif defined(CUBIC_SPLINE_KERNEL)
+  vector w2;
+  vector mask1, mask2;
+
+  /* Form a mask for each part of the kernel. */
+  mask1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
+  mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
+  ;
+
+  /* Work out w for both regions of the kernel and combine the results together
+   * using masks. */
+
+  /* Init the iteration for Horner's scheme. */
+  w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
+  w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations. */
+  w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero */
+  w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
+
+  w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
+  w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
+
+  /* Mask out unneeded values. */
+  w->v = vec_and(w->v, mask1.v);
+  w2.v = vec_and(w2.v, mask2.v);
+
+  /* Added both w and w2 together to form complete result. */
+  w->v = vec_add(w->v, w2.v);
+#else
+#error
 #endif
 
+  /* Return everything */
+  w->v =
+      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+}
+
+/**
+ * @brief Computes the kernel function derivative for two particles
+ * using vectors.
+ *
+ * Return 0 if $u > \\gamma = H/h$
+ *
+ * @param u The ratio of the distance to the smoothing length $u = x/h$.
+ * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
+ */
+__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
+    vector *u, vector *dw_dx) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  vector x;
+  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
+
+#ifdef WENDLAND_C2_KERNEL
+  /* Init the iteration for Horner's scheme. */
+  dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations */
+  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
+
+  dw_dx->v = vec_mul(dw_dx->v, x.v);
+
+#elif defined(CUBIC_SPLINE_KERNEL)
+  vector dw_dx2;
+  vector mask1, mask2;
+
+  /* Form a mask for each part of the kernel. */
+  mask1.v = vec_cmp_lt(x.v, cond.v);  /* 0 < x < 0.5 */
+  mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
+  ;
+
+  /* Work out w for both regions of the kernel and combine the results together
+   * using masks. */
+
+  /* Init the iteration for Horner's scheme. */
+  dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
+  dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations. */
+  dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
+  dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
+
+  /* Mask out unneeded values. */
+  dw_dx->v = vec_and(dw_dx->v, mask1.v);
+  dw_dx2.v = vec_and(dw_dx2.v, mask2.v);
+
+  /* Added both dwdx and dwdx2 together to form complete result. */
+  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
+#else
+#error
+#endif
+
+  /* Return everything */
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
+}
+
+#endif /* WITH_VECTORIZATION */
+
 /* Some cross-check functions */
 void hydro_kernel_dump(int N);
 
diff --git a/src/runner.c b/src/runner.c
index 778eb0c7c6a11770dbe60a858ec9bcbc2c78c2d0..dba98e69373761b8ee52261a0b692d7109e183db 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1790,8 +1790,13 @@ void *runner_main(void *data) {
           break;
 
         case task_type_pair:
-          if (t->subtype == task_subtype_density)
+          if (t->subtype == task_subtype_density) {
+#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
+            runner_dopair1_density_vec(r, ci, cj);
+#else
             runner_dopair1_density(r, ci, cj);
+#endif
+          }
 #ifdef EXTRA_HYDRO_LOOP
           else if (t->subtype == task_subtype_gradient)
             runner_dopair1_gradient(r, ci, cj);
diff --git a/src/runner.h b/src/runner.h
index 51de9827406b014debeefba03ad09f636e3fa45f..49f7cd88a0b345cac1b29b7fd38cf59b21268012 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -49,8 +49,11 @@ struct runner {
   /*! The engine owing this runner. */
   struct engine *e;
 
-  /*! The particle cache of this runner. */
-  struct cache par_cache;
+  /*! The particle cache of cell ci. */
+  struct cache ci_cache;
+
+  /*! The particle cache of cell cj. */
+  struct cache cj_cache;
 };
 
 /* Function prototypes. */
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 36d287c32798f749eeab90db73289deb8b21f906..839e492956140bd2b633d1e7c99c2b2a43f89629 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -26,6 +26,9 @@
 
 #define PASTE(x, y) x##_##y
 
+#define _DOPAIR1_BRANCH(f) PASTE(runner_dopair1_branch, f)
+#define DOPAIR1_BRANCH _DOPAIR1_BRANCH(FUNCTION)
+
 #define _DOPAIR1(f) PASTE(runner_dopair1, f)
 #define DOPAIR1 _DOPAIR1(FUNCTION)
 
@@ -2287,8 +2290,13 @@ void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
         cj->dx_max_sort > cj->dmin * space_maxreldx)
       runner_do_sort(r, cj, (1 << sid), 1);
 
-    /* Compute the interactions. */
+/* Compute the interactions. */
+#if (DOPAIR1 == runner_dopair1_density) && defined(WITH_VECTORIZATION) && \
+    defined(GADGET2_SPH)
+    runner_dopair1_density_vec(r, ci, cj);
+#else
     DOPAIR1(r, ci, cj);
+#endif
   }
 
   if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR);
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index d78de2cd3c508fd17a816316dc931010f2febf80..90b612684d7a8296f0d3802ec539b3b769a0e877 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -20,6 +20,8 @@
 /* Config parameters. */
 #include "../config.h"
 
+#include "swift.h"
+
 #include "active.h"
 
 /* This object's header. */
@@ -259,6 +261,98 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
     *icount = 0;
   }
 }
+
+/* @brief Populates the arrays max_di and max_dj with the maximum distances of
+ * particles into their neighbouring cells. Also finds the first pi that
+ * interacts with any particle in cj and the last pj that interacts with any
+ * particle in ci.
+ * @param ci #cell pointer to ci
+ * @param cj #cell pointer to cj
+ * @param sort_i #entry array for particle distance in ci
+ * @param sort_j #entry array for particle distance in cj
+ * @param ci_cache #cache for cell ci
+ * @param cj_cache #cache for cell cj
+ * @param dx_max maximum particle movement allowed in cell
+ * @param rshift cutoff shift
+ * @param max_di array to hold the maximum distances of pi particles into cell
+ * cj
+ * @param max_dj array to hold the maximum distances of pj particles into cell
+ * cj
+ * @param init_pi first pi to interact with a pj particle
+ * @param init_pj last pj to interact with a pi particle
+ */
+__attribute__((always_inline)) INLINE static void populate_max_d_no_cache(
+    const struct cell *ci, const struct cell *cj,
+    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
+    const float dx_max, const float rshift, float *max_di, float *max_dj,
+    int *init_pi, int *init_pj, const struct engine *e) {
+
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+  struct part *p = &parts_i[sort_i[0].i];
+
+  float h, d;
+
+  /* Get the distance of the last pi and the first pj on the sorted axis.*/
+  const float di_max = sort_i[ci->count - 1].d - rshift;
+  const float dj_min = sort_j[0].d;
+
+  int first_pi = 0, last_pj = cj->count - 1;
+
+  /* Find the first active particle in ci to interact with any particle in cj.
+   */
+  /* Populate max_di with distances. */
+  int active_id = ci->count - 1;
+  for (int k = ci->count - 1; k >= 0; k--) {
+    p = &parts_i[sort_i[k].i];
+    h = p->h;
+    d = sort_i[k].d + h * kernel_gamma + dx_max - rshift;
+
+    max_di[k] = d;
+
+    /* If the particle is out of range set the index to
+     * the last active particle within range. */
+    if (d < dj_min) {
+      first_pi = active_id;
+      break;
+    } else {
+      if (part_is_active(p, e)) active_id = k;
+    }
+  }
+
+  /* Find the maximum distance of pi particles into cj.*/
+  for (int k = first_pi + 1; k < ci->count; k++) {
+    max_di[k] = fmaxf(max_di[k - 1], max_di[k]);
+  }
+
+  /* Find the last particle in cj to interact with any particle in ci. */
+  /* Populate max_dj with distances. */
+  active_id = 0;
+  for (int k = 0; k < cj->count; k++) {
+    p = &parts_j[sort_j[k].i];
+    h = p->h;
+    d = sort_j[k].d - h * kernel_gamma - dx_max - rshift;
+
+    max_dj[k] = d;
+
+    /* If the particle is out of range set the index to
+     * the last active particle within range. */
+    if (d > di_max) {
+      last_pj = active_id;
+      break;
+    } else {
+      if (part_is_active(p, e)) active_id = k;
+    }
+  }
+
+  /* Find the maximum distance of pj particles into ci.*/
+  for (int k = 1; k <= last_pj; k++) {
+    max_dj[k] = fmaxf(max_dj[k - 1], max_dj[k]);
+  }
+
+  *init_pi = first_pi;
+  *init_pj = last_pj;
+}
 #endif /* WITH_VECTORIZATION */
 
 /**
@@ -287,11 +381,11 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
 
   if (!cell_is_active(c, e)) return;
 
-  if (!cell_is_drifted(c, e)) cell_drift_particles(c, e);
+  if (!cell_is_drifted(c, e)) error("Interacting undrifted cell.");
 
   /* Get the particle cache from the runner and re-allocate
    * the cache if it is not big enough for the cell. */
-  struct cache *restrict cell_cache = &r->par_cache;
+  struct cache *restrict cell_cache = &r->ci_cache;
 
   if (cell_cache->count < count) {
     cache_init(cell_cache, count);
@@ -354,6 +448,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       int pad = (num_vec_proc * VEC_SIZE) - rem;
 
       count_align += pad;
+
       /* Set positions to the same as particle pi so when the r2 > 0 mask is
        * applied these extra contributions are masked out.*/
       for (int i = count; i < count_align; i++) {
@@ -394,17 +489,17 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2;
 
       v_dx_tmp.v = vec_sub(pix.v, pjx.v);
-      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
-      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
       v_dx_tmp2.v = vec_sub(pix.v, pjx2.v);
+      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
       v_dy_tmp2.v = vec_sub(piy.v, pjy2.v);
+      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
       v_dz_tmp2.v = vec_sub(piz.v, pjz2.v);
 
       v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
-      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
-      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
       v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v);
+      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
       v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v);
+      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
       v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
 
 /* Form a mask from r2 < hig2 and r2 > 0.*/
@@ -545,14 +640,14 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec_2(
 
   /* Get the particle cache from the runner and re-allocate
    * the cache if it is not big enough for the cell. */
-  struct cache *restrict cell_cache = &r->par_cache;
+  struct cache *restrict cell_cache = &r->ci_cache;
 
   if (cell_cache->count < count) {
     cache_init(cell_cache, count);
   }
 
   /* Read the particles from the cell and store them locally in the cache. */
-  cache_read_particles(c, &r->par_cache);
+  cache_read_particles(c, &r->ci_cache);
 
   /* Create two secondary caches. */
   int icount = 0, icount_align = 0;
@@ -872,3 +967,438 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec_2(
   TIMER_TOC(timer_doself_density);
 #endif /* WITH_VECTORIZATION */
 }
+
+/**
+ * @brief Compute the density interactions between a cell pair (non-symmetric)
+ * using vector intrinsics.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
+                                struct cell *cj) {
+
+#ifdef WITH_VECTORIZATION
+  const struct engine *restrict e = r->e;
+
+  vector v_hi, v_vix, v_viy, v_viz, v_hig2;
+
+  TIMER_TIC;
+
+  /* Anything to do here? */
+  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
+
+  if (!cell_is_drifted(ci, e) || !cell_is_drifted(cj, e))
+    error("Interacting undrifted cells.");
+
+  /* Get the sort ID. */
+  double shift[3] = {0.0, 0.0, 0.0};
+  const int sid = space_getsid(e->s, &ci, &cj, shift);
+
+  /* Have the cells been sorted? */
+  if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin)
+    runner_do_sort(r, ci, (1 << sid), 1);
+  if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin)
+    runner_do_sort(r, cj, (1 << sid), 1);
+
+  /* Get the cutoff shift. */
+  double rshift = 0.0;
+  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
+
+  /* Pick-out the sorted lists. */
+  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
+  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
+
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the dx_max_sort values in the cell are indeed an upper
+     bound on particle movement. */
+  for (int pid = 0; pid < ci->count; pid++) {
+    const struct part *p = &ci->parts[sort_i[pid].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
+        1.0e-6 * max(fabsf(d), ci->dx_max_sort))
+      error("particle shift diff exceeds dx_max_sort.");
+  }
+  for (int pjd = 0; pjd < cj->count; pjd++) {
+    const struct part *p = &cj->parts[sort_j[pjd].i];
+    const float d = p->x[0] * runner_shift[sid][0] +
+                    p->x[1] * runner_shift[sid][1] +
+                    p->x[2] * runner_shift[sid][2];
+    if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
+        1.0e-6 * max(fabsf(d), cj->dx_max_sort))
+      error("particle shift diff exceeds dx_max_sort.");
+  }
+#endif /* SWIFT_DEBUG_CHECKS */
+
+  /* Get some other useful values. */
+  const int count_i = ci->count;
+  const int count_j = cj->count;
+  const double hi_max = ci->h_max * kernel_gamma - rshift;
+  const double hj_max = cj->h_max * kernel_gamma;
+  struct part *restrict parts_i = ci->parts;
+  struct part *restrict parts_j = cj->parts;
+  const double di_max = sort_i[count_i - 1].d - rshift;
+  const double dj_min = sort_j[0].d;
+  const float dx_max = (ci->dx_max_sort + cj->dx_max_sort);
+
+  /* Check if any particles are active and return if there are not. */
+  int numActive = 0;
+  for (int pid = count_i - 1;
+       pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
+    struct part *restrict pi = &parts_i[sort_i[pid].i];
+    if (part_is_active(pi, e)) {
+      numActive++;
+      break;
+    }
+  }
+
+  if (!numActive) {
+    for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
+         pjd++) {
+      struct part *restrict pj = &parts_j[sort_j[pjd].i];
+      if (part_is_active(pj, e)) {
+        numActive++;
+        break;
+      }
+    }
+  }
+
+  if (numActive == 0) return;
+
+  /* Get both particle caches from the runner and re-allocate
+   * them if they are not big enough for the cells. */
+  struct cache *restrict ci_cache = &r->ci_cache;
+  struct cache *restrict cj_cache = &r->cj_cache;
+
+  if (ci_cache->count < count_i) {
+    cache_init(ci_cache, count_i);
+  }
+  if (cj_cache->count < count_j) {
+    cache_init(cj_cache, count_j);
+  }
+
+  int first_pi, last_pj;
+  float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+  float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+
+  max_di = r->ci_cache.max_d;
+  max_dj = r->cj_cache.max_d;
+
+  /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
+  /* Also find the first pi that interacts with any particle in cj and the last
+   * pj that interacts with any particle in ci. */
+  populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di,
+                          max_dj, &first_pi, &last_pj, e);
+
+  /* Find the maximum index into cj that is required by a particle in ci. */
+  /* Find the maximum index into ci that is required by a particle in cj. */
+  float di, dj;
+  int max_ind_j = count_j - 1;
+  int max_ind_i = 0;
+
+  dj = sort_j[max_ind_j].d;
+  while (max_ind_j > 0 && max_di[count_i - 1] < dj) {
+    max_ind_j--;
+
+    dj = sort_j[max_ind_j].d;
+  }
+
+  di = sort_i[max_ind_i].d;
+  while (max_ind_i < count_i - 1 && max_dj[0] > di) {
+    max_ind_i++;
+
+    di = sort_i[max_ind_i].d;
+  }
+
+  /* Limits of the outer loops. */
+  int first_pi_loop = first_pi;
+  int last_pj_loop = last_pj;
+
+  /* Take the max/min of both values calculated to work out how many particles
+   * to read into the cache. */
+  last_pj = max(last_pj, max_ind_j);
+  first_pi = min(first_pi, max_ind_i);
+
+  /* Read the needed particles into the two caches. */
+  int first_pi_align = first_pi;
+  int last_pj_align = last_pj;
+  cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i,
+                                      sort_j, shift, &first_pi_align,
+                                      &last_pj_align, 1);
+
+  /* Get the number of particles read into the ci cache. */
+  int ci_cache_count = count_i - first_pi_align;
+
+  if (cell_is_active(ci, e)) {
+
+    /* Loop over the parts in ci. */
+    for (int pid = count_i - 1; pid >= first_pi_loop && max_ind_j >= 0; pid--) {
+
+      /* Get a hold of the ith part in ci. */
+      struct part *restrict pi = &parts_i[sort_i[pid].i];
+      if (!part_is_active(pi, e)) continue;
+
+      /* Determine the exit iteration of the interaction loop. */
+      dj = sort_j[max_ind_j].d;
+      while (max_ind_j > 0 && max_di[pid] < dj) {
+        max_ind_j--;
+
+        dj = sort_j[max_ind_j].d;
+      }
+      int exit_iteration = max_ind_j + 1;
+
+      /* Set the cache index. */
+      int ci_cache_idx = pid - first_pi_align;
+
+      const float hi = ci_cache->h[ci_cache_idx];
+      const float hig2 = hi * hi * kernel_gamma2;
+
+      vector pix, piy, piz;
+
+      /* Fill particle pi vectors. */
+      pix.v = vec_set1(ci_cache->x[ci_cache_idx]);
+      piy.v = vec_set1(ci_cache->y[ci_cache_idx]);
+      piz.v = vec_set1(ci_cache->z[ci_cache_idx]);
+      v_hi.v = vec_set1(hi);
+      v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]);
+      v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]);
+      v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]);
+
+      v_hig2.v = vec_set1(hig2);
+
+      /* Reset cumulative sums of update vectors. */
+      vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
+          curlvySum, curlvzSum;
+
+      /* Get the inverse of hi. */
+      vector v_hi_inv;
+
+      v_hi_inv = vec_reciprocal(v_hi);
+
+      rhoSum.v = vec_setzero();
+      rho_dhSum.v = vec_setzero();
+      wcountSum.v = vec_setzero();
+      wcount_dhSum.v = vec_setzero();
+      div_vSum.v = vec_setzero();
+      curlvxSum.v = vec_setzero();
+      curlvySum.v = vec_setzero();
+      curlvzSum.v = vec_setzero();
+
+      /* Pad the exit iteration if there is a serial remainder. */
+      int exit_iteration_align = exit_iteration;
+      int rem = exit_iteration % VEC_SIZE;
+      if (rem != 0) {
+        int pad = VEC_SIZE - rem;
+
+        if (exit_iteration_align + pad <= last_pj_align + 1)
+          exit_iteration_align += pad;
+      }
+
+      vector pjx, pjy, pjz;
+
+      /* Loop over the parts in cj. */
+      for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) {
+
+        /* Get the cache index to the jth particle. */
+        int cj_cache_idx = pjd;
+
+        vector v_dx, v_dy, v_dz, v_r2;
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0) {
+          error("Unaligned read!!! cj_cache_idx=%d", cj_cache_idx);
+        }
+#endif
+
+        /* Load 2 sets of vectors from the particle cache. */
+        pjx.v = vec_load(&cj_cache->x[cj_cache_idx]);
+        pjy.v = vec_load(&cj_cache->y[cj_cache_idx]);
+        pjz.v = vec_load(&cj_cache->z[cj_cache_idx]);
+
+        /* Compute the pairwise distance. */
+        v_dx.v = vec_sub(pix.v, pjx.v);
+        v_dy.v = vec_sub(piy.v, pjy.v);
+        v_dz.v = vec_sub(piz.v, pjz.v);
+
+        v_r2.v = vec_mul(v_dx.v, v_dx.v);
+        v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
+        v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
+
+        vector v_doi_mask;
+        int doi_mask;
+
+        /* Form r2 < hig2 mask. */
+        v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+
+        /* Form integer mask. */
+        doi_mask = vec_cmp_result(v_doi_mask.v);
+
+        /* If there are any interactions perform them. */
+        if (doi_mask)
+          runner_iact_nonsym_1_vec_density(
+              &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz,
+              &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
+              &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum,
+              &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
+              &curlvySum, &curlvzSum, v_doi_mask,
+#ifdef HAVE_AVX512_F
+              knl_mask);
+#else
+              0);
+#endif
+
+      } /* loop over the parts in cj. */
+
+      /* Perform horizontal adds on vector sums and store result in particle pi.
+      */
+      VEC_HADD(rhoSum, pi->rho);
+      VEC_HADD(rho_dhSum, pi->density.rho_dh);
+      VEC_HADD(wcountSum, pi->density.wcount);
+      VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
+      VEC_HADD(div_vSum, pi->density.div_v);
+      VEC_HADD(curlvxSum, pi->density.rot_v[0]);
+      VEC_HADD(curlvySum, pi->density.rot_v[1]);
+      VEC_HADD(curlvzSum, pi->density.rot_v[2]);
+
+    } /* loop over the parts in ci. */
+  }
+
+  if (cell_is_active(cj, e)) {
+    /* Loop over the parts in cj. */
+    for (int pjd = 0; pjd <= last_pj_loop && max_ind_i < count_i; pjd++) {
+
+      /* Get a hold of the jth part in cj. */
+      struct part *restrict pj = &parts_j[sort_j[pjd].i];
+      if (!part_is_active(pj, e)) continue;
+
+      /* Determine the exit iteration of the interaction loop. */
+      di = sort_i[max_ind_i].d;
+      while (max_ind_i < count_i - 1 && max_dj[pjd] > di) {
+        max_ind_i++;
+
+        di = sort_i[max_ind_i].d;
+      }
+      int exit_iteration = max_ind_i;
+
+      /* Set the cache index. */
+      int cj_cache_idx = pjd;
+
+      const float hj = cj_cache->h[cj_cache_idx];
+      const float hjg2 = hj * hj * kernel_gamma2;
+
+      vector pjx, pjy, pjz;
+      vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2;
+
+      /* Fill particle pi vectors. */
+      pjx.v = vec_set1(cj_cache->x[cj_cache_idx]);
+      pjy.v = vec_set1(cj_cache->y[cj_cache_idx]);
+      pjz.v = vec_set1(cj_cache->z[cj_cache_idx]);
+      v_hj.v = vec_set1(hj);
+      v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]);
+      v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]);
+      v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]);
+
+      v_hjg2.v = vec_set1(hjg2);
+
+      /* Reset cumulative sums of update vectors. */
+      vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
+          curlvySum, curlvzSum;
+
+      /* Get the inverse of hj. */
+      vector v_hj_inv;
+
+      v_hj_inv = vec_reciprocal(v_hj);
+
+      rhoSum.v = vec_setzero();
+      rho_dhSum.v = vec_setzero();
+      wcountSum.v = vec_setzero();
+      wcount_dhSum.v = vec_setzero();
+      div_vSum.v = vec_setzero();
+      curlvxSum.v = vec_setzero();
+      curlvySum.v = vec_setzero();
+      curlvzSum.v = vec_setzero();
+
+      vector pix, piy, piz;
+
+      /* Convert exit iteration to cache indices. */
+      int exit_iteration_align = exit_iteration - first_pi_align;
+
+      /* Pad the exit iteration align so cache reads are aligned. */
+      int rem = exit_iteration_align % VEC_SIZE;
+      if (exit_iteration_align < VEC_SIZE) {
+        exit_iteration_align = 0;
+      } else
+        exit_iteration_align -= rem;
+
+      /* Loop over the parts in ci. */
+      for (int ci_cache_idx = exit_iteration_align;
+           ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) {
+
+#ifdef SWIFT_DEBUG_CHECKS
+        if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0) {
+          error("Unaligned read!!! ci_cache_idx=%d", ci_cache_idx);
+        }
+#endif
+
+        vector v_dx, v_dy, v_dz, v_r2;
+
+        /* Load 2 sets of vectors from the particle cache. */
+        pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
+        piy.v = vec_load(&ci_cache->y[ci_cache_idx]);
+        piz.v = vec_load(&ci_cache->z[ci_cache_idx]);
+
+        /* Compute the pairwise distance. */
+        v_dx.v = vec_sub(pjx.v, pix.v);
+        v_dy.v = vec_sub(pjy.v, piy.v);
+        v_dz.v = vec_sub(pjz.v, piz.v);
+
+        v_r2.v = vec_mul(v_dx.v, v_dx.v);
+        v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
+        v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
+
+        vector v_doj_mask;
+        int doj_mask;
+
+        /* Form r2 < hig2 mask. */
+        v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v);
+
+        /* Form integer mask. */
+        doj_mask = vec_cmp_result(v_doj_mask.v);
+
+        /* If there are any interactions perform them. */
+        if (doj_mask)
+          runner_iact_nonsym_1_vec_density(
+              &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz,
+              &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx],
+              &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum,
+              &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
+              &curlvySum, &curlvzSum, v_doj_mask,
+#ifdef HAVE_AVX512_F
+              knl_mask);
+#else
+              0);
+#endif
+
+      } /* loop over the parts in ci. */
+
+      /* Perform horizontal adds on vector sums and store result in particle pj.
+       */
+      VEC_HADD(rhoSum, pj->rho);
+      VEC_HADD(rho_dhSum, pj->density.rho_dh);
+      VEC_HADD(wcountSum, pj->density.wcount);
+      VEC_HADD(wcount_dhSum, pj->density.wcount_dh);
+      VEC_HADD(div_vSum, pj->density.div_v);
+      VEC_HADD(curlvxSum, pj->density.rot_v[0]);
+      VEC_HADD(curlvySum, pj->density.rot_v[1]);
+      VEC_HADD(curlvzSum, pj->density.rot_v[2]);
+
+    } /* loop over the parts in cj. */
+
+    TIMER_TOC(timer_dopair_density);
+  }
+
+#endif /* WITH_VECTORIZATION */
+}
diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h
index 9bb24f12cedf03ec49a5a03f92d308f92d49aa54..e252083ae743248a5f23d1772c2e770c5e1c6c14 100644
--- a/src/runner_doiact_vec.h
+++ b/src/runner_doiact_vec.h
@@ -24,6 +24,7 @@
 #include "../config.h"
 
 /* Local headers */
+#include "active.h"
 #include "cell.h"
 #include "engine.h"
 #include "hydro.h"
@@ -35,5 +36,7 @@
 /* Function prototypes. */
 void runner_doself1_density_vec(struct runner *r, struct cell *restrict c);
 void runner_doself1_density_vec_2(struct runner *r, struct cell *restrict c);
+void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci,
+                                struct cell *restrict cj);
 
 #endif /* SWIFT_RUNNER_VEC_H */
diff --git a/src/vector.h b/src/vector.h
index 5582eecf93b6e7a9f82b885cda1f6c70d6bf059a..7d82b9f5c9e2bc6e1fea9426ab3870eeec180408 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -74,6 +74,7 @@
 #define vec_cmp_gt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ)
 #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
 #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
+#define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
 #define vec_and(a, b) _mm512_and_ps(a, b)
 #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
@@ -106,8 +107,15 @@
   }
 
 /* Performs a horizontal add on the vector and adds the result to a float. */
+#ifdef __ICC
 #define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v)
-
+#else /* _mm512_reduce_add_ps not present in GCC compiler. \
+       TODO: Implement intrinsic version.*/
+#define VEC_HADD(a, b)                              \
+  {                                                 \
+    for (int i = 0; i < VEC_SIZE; i++) b += a.f[i]; \
+  }
+#endif
 /* Calculates the number of set bits in the mask and adds the result to an int.
  */
 #define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
@@ -147,6 +155,7 @@
 #define vec_cmp_lt(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
 #define vec_cmp_gt(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
 #define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
+#define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
 #define vec_cmp_result(a) _mm256_movemask_ps(a)
 #define vec_and(a, b) _mm256_and_ps(a, b)
 #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
diff --git a/tests/test27cells.c b/tests/test27cells.c
index 2e7e9b63c9e8f09239fc1ba145fabcf30d09e977..bd827b68e90ea5f4e9d5577612e6cecda2edf83a 100644
--- a/tests/test27cells.c
+++ b/tests/test27cells.c
@@ -34,7 +34,9 @@
 
 #if defined(WITH_VECTORIZATION)
 #define DOSELF1 runner_doself1_density_vec
+#define DOPAIR1 runner_dopair1_density_vec
 #define DOSELF1_NAME "runner_doself1_density_vec"
+#define DOPAIR1_NAME "runner_dopair1_density_vec"
 #endif
 
 #ifndef DOSELF1
@@ -42,6 +44,11 @@
 #define DOSELF1_NAME "runner_doself1_density"
 #endif
 
+#ifndef DOPAIR1
+#define DOPAIR1 runner_dopair1_density
+#define DOPAIR1_NAME "runner_dopair1_density"
+#endif
+
 enum velocity_types {
   velocity_zero,
   velocity_random,
@@ -303,9 +310,11 @@ int check_results(struct part *serial_parts, struct part *vec_parts, int count,
 }
 
 /* Just a forward declaration... */
-void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
 void runner_doself1_density(struct runner *r, struct cell *ci);
 void runner_doself1_density_vec(struct runner *r, struct cell *ci);
+void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
+void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
+                                struct cell *cj);
 
 /* And go... */
 int main(int argc, char *argv[]) {
@@ -384,7 +393,8 @@ int main(int argc, char *argv[]) {
   }
 
   /* Help users... */
-  message("Function called: %s", DOSELF1_NAME);
+  message("DOSELF1 function called: %s", DOSELF1_NAME);
+  message("DOPAIR1 function called: %s", DOPAIR1_NAME);
   message("Vector size: %d", VEC_SIZE);
   message("Adiabatic index: ga = %f", hydro_gamma);
   message("Hydro implementation: %s", SPH_IMPLEMENTATION);
@@ -450,24 +460,26 @@ int main(int argc, char *argv[]) {
 
 #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION))
 
+#ifdef WITH_VECTORIZATION
+    runner.ci_cache.count = 0;
+    cache_init(&runner.ci_cache, 512);
+    runner.cj_cache.count = 0;
+    cache_init(&runner.cj_cache, 512);
+#endif
+
     /* Run all the pairs */
     for (int j = 0; j < 27; ++j) {
       if (cells[j] != main_cell) {
         const ticks sub_tic = getticks();
 
-        runner_dopair1_density(&runner, main_cell, cells[j]);
+        DOPAIR1(&runner, main_cell, cells[j]);
 
         const ticks sub_toc = getticks();
         timings[j] += sub_toc - sub_tic;
       }
     }
 
-/* And now the self-interaction */
-#ifdef WITH_VECTORIZATION
-    runner.par_cache.count = 0;
-    cache_init(&runner.par_cache, 512);
-#endif
-
+    /* And now the self-interaction */
     const ticks self_tic = getticks();
 
     DOSELF1(&runner, main_cell);
diff --git a/tests/testKernel.c b/tests/testKernel.c
index 344c038be0020019cc989cca6969b4e65bfd973f..13f4e36534eb11a4c8f7ba9c19a48de6599e31f5 100644
--- a/tests/testKernel.c
+++ b/tests/testKernel.c
@@ -53,7 +53,7 @@ int main() {
   printf("\nVector Output for VEC_SIZE=%d\n", VEC_SIZE);
   printf("-------------\n");
 
-#ifdef WITH_VECORIZATION
+#ifdef WITH_VECTORIZATION
 
   for (int i = 0; i < numPoints; i += VEC_SIZE) {