diff --git a/src/Makefile.am b/src/Makefile.am
index 3aab888c705fffc6e63179928c81c005b2ed9820..f53797f6a4e506a5aa0dc2945fc143a7a4a18d3d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -44,7 +44,8 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
     common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h \
     physical_constants.h physical_constants_cgs.h potential.h version.h \
     hydro_properties.h riemann.h threadpool.h cooling.h cooling_struct.h sourceterms.h \
-    sourceterms_struct.h statistics.h memswap.h profiler.h dump.h logger.h
+    sourceterms_struct.h statistics.h memswap.h cache.h runner_doiact_vec.h profiler.h \
+    dump.h logger.h
 
 # Common source files
 AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
@@ -53,11 +54,11 @@ AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
     kernel_hydro.c tools.c part.c partition.c clocks.c parser.c \
     physical_constants.c potential.c hydro_properties.c \
     runner_doiact_fft.c threadpool.c cooling.c sourceterms.c \
-    statistics.c profiler.c dump.c logger.c
+    statistics.c runner_doiact_vec.c profiler.c dump.c logger.c
 
 # Include files for distribution, not installation.
 nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \
-		 kernel_long_gravity.h vector.h runner_doiact.h runner_doiact_grav.h runner_doiact_fft.h \
+		 kernel_long_gravity.h vector.h cache.h runner_doiact.h runner_doiact_vec.h runner_doiact_grav.h runner_doiact_fft.h \
                  units.h intrinsics.h minmax.h kick.h timestep.h drift.h adiabatic_index.h io_properties.h \
 		 dimension.h equation_of_state.h active.h \
 		 gravity.h gravity_io.h \
@@ -82,7 +83,7 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h
 		 cooling/none/cooling.h cooling/none/cooling_struct.h \
 	         cooling/const_du/cooling.h cooling/const_du/cooling_struct.h \
                  cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h \
-     memswap.h dump.h logger.h
+                 memswap.h dump.h logger.h
 
 
 # Sources and flags for regular library
diff --git a/src/cache.h b/src/cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6a7090806f27c038eef562fa654b19a24103560
--- /dev/null
+++ b/src/cache.h
@@ -0,0 +1,161 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 James Willis (jame.s.willis@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_CACHE_H
+#define SWIFT_CACHE_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Local headers */
+#include "cell.h"
+#include "error.h"
+#include "part.h"
+#include "vector.h"
+
+#define NUM_VEC_PROC 2
+#define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)
+#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE
+
+/* Cache struct to hold a local copy of a cells' particle
+ * properties required for density/force calculations.*/
+struct cache {
+
+  /* Particle x position. */
+  float *restrict x __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+
+  /* Particle y position. */
+  float *restrict y __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+
+  /* Particle z position. */
+  float *restrict z __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+
+  /* Particle smoothing length. */
+  float *restrict h __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+
+  /* Particle mass. */
+  float *restrict m __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+
+  /* Particle x velocity. */
+  float *restrict vx __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+
+  /* Particle y velocity. */
+  float *restrict vy __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+
+  /* Particle z velocity. */
+  float *restrict vz __attribute__((aligned(sizeof(float) * VEC_SIZE)));
+
+  /* Cache size. */
+  int count;
+};
+
+/* Secondary cache struct to hold a list of interactions between two
+ * particles.*/
+struct c2_cache {
+
+  /* Separation between two particles squared. */
+  float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+
+  /* x separation between two particles. */
+  float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+
+  /* y separation between two particles. */
+  float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+
+  /* z separation between two particles. */
+  float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+
+  /* Mass of particle pj. */
+  float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+
+  /* x velocity of particle pj. */
+  float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+
+  /* y velocity of particle pj. */
+  float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+
+  /* z velocity of particle pj. */
+  float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+};
+
+/**
+ * @brief Allocate memory and initialise cache.
+ *
+ * @param c The cache.
+ * @param count Number of particles to allocate space for.
+ */
+__attribute__((always_inline)) INLINE void cache_init(struct cache *c,
+                                                      size_t count) {
+
+  /* Align cache on correct byte boundary and pad cache size to include 2 vector
+   * lengths for remainder operations. */
+  unsigned long alignment = sizeof(float) * VEC_SIZE;
+  unsigned int sizeBytes = (count + (2 * VEC_SIZE)) * sizeof(float);
+  int error = 0;
+
+  /* Free memory if cache has already been allocated. */
+  if (c->count > 0) {
+    free(c->x);
+    free(c->y);
+    free(c->z);
+    free(c->m);
+    free(c->vx);
+    free(c->vy);
+    free(c->vz);
+    free(c->h);
+  }
+
+  error += posix_memalign((void **)&c->x, alignment, sizeBytes);
+  error += posix_memalign((void **)&c->y, alignment, sizeBytes);
+  error += posix_memalign((void **)&c->z, alignment, sizeBytes);
+  error += posix_memalign((void **)&c->m, alignment, sizeBytes);
+  error += posix_memalign((void **)&c->vx, alignment, sizeBytes);
+  error += posix_memalign((void **)&c->vy, alignment, sizeBytes);
+  error += posix_memalign((void **)&c->vz, alignment, sizeBytes);
+  error += posix_memalign((void **)&c->h, alignment, sizeBytes);
+
+  if (error != 0)
+    error("Couldn't allocate cache, no. of particles: %d", (int)count);
+  c->count = count;
+}
+
+/**
+ * @brief Populate cache by reading in the particles in unsorted order.
+ *
+ * @param ci The #cell.
+ * @param ci_cache The cache.
+ */
+__attribute__((always_inline)) INLINE void cache_read_particles(
+    const struct cell *const ci, struct cache *const ci_cache) {
+
+  /* Shift the particles positions to a local frame so single precision can be
+   * used instead of double precision. */
+  for (int i = 0; i < ci->count; i++) {
+    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
+    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
+    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2];
+    ci_cache->h[i] = ci->parts[i].h;
+
+    ci_cache->m[i] = ci->parts[i].mass;
+    ci_cache->vx[i] = ci->parts[i].v[0];
+    ci_cache->vy[i] = ci->parts[i].v[1];
+    ci_cache->vz[i] = ci->parts[i].v[2];
+  }
+}
+
+#endif /* SWIFT_CACHE_H */
diff --git a/src/engine.c b/src/engine.c
index 6b3f5650f070a075aed2696444538b3bc4e10b4d..6f571f9ece9014e447b8afc38a79da7a614a81ee 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -70,6 +70,9 @@
 #include "units.h"
 #include "version.h"
 
+/* Particle cache size. */
+#define CACHE_SIZE 512
+
 const char *engine_policy_names[16] = {"none",
                                        "rand",
                                        "steal",
@@ -3412,6 +3415,11 @@ void engine_init(struct engine *e, struct space *s,
       e->runners[k].cpuid = k;
       e->runners[k].qid = k * nr_queues / e->nr_threads;
     }
+
+    /* Allocate particle cache. */
+    e->runners[k].par_cache.count = 0;
+    cache_init(&e->runners[k].par_cache, CACHE_SIZE);
+
     if (verbose) {
       if (with_aff)
         message("runner %i on cpuid=%i with qid=%i.", e->runners[k].id,
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index 08fb2b37db566e191bd74d82488b5d68e764573b..3fef18b4f487f1734a5f93c4bad46cf4e6968240 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -155,20 +155,15 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density(
 
   /* Get the radius and inverse radius. */
   r2.v = vec_load(R2);
-  ri.v = vec_rsqrt(r2.v);
-  /*vec_rsqrt does not have the level of accuracy we need, so an extra term is
-   * added below.*/
-  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  ri = vec_reciprocal_sqrt(r2);
   r.v = r2.v * ri.v;
 
   hi.v = vec_load(Hi);
-  hi_inv.v = vec_rcp(hi.v);
-  hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f));
+  hi_inv = vec_reciprocal(hi);
   xi.v = r.v * hi_inv.v;
 
   hj.v = vec_load(Hj);
-  hj_inv.v = vec_rcp(hj.v);
-  hj_inv.v = hj_inv.v - hj_inv.v * (hj_inv.v * hj.v - vec_set1(1.0f));
+  hj_inv = vec_reciprocal(hj);
   xj.v = r.v * hj_inv.v;
 
   /* Compute the kernel function. */
@@ -327,15 +322,11 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
 
   /* Get the radius and inverse radius. */
   r2.v = vec_load(R2);
-  ri.v = vec_rsqrt(r2.v);
-  /*vec_rsqrt does not have the level of accuracy we need, so an extra term is
-   * added below.*/
-  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  ri = vec_reciprocal_sqrt(r2);
   r.v = r2.v * ri.v;
 
   hi.v = vec_load(Hi);
-  hi_inv.v = vec_rcp(hi.v);
-  hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f));
+  hi_inv = vec_reciprocal(hi);
   xi.v = r.v * hi_inv.v;
 
   kernel_deval_vec(&xi, &wi, &wi_dx);
@@ -382,6 +373,176 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
 #endif
 }
 
+#ifdef WITH_VECTORIZATION
+/**
+ * @brief Density interaction computed using 2 interleaved vectors
+ * (non-symmetric vectorized version).
+ */
+__attribute__((always_inline)) INLINE static void
+runner_iact_nonsym_2_vec_density(
+    float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
+    vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
+    vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
+    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
+    vector mask, vector mask2, int knlMask, int knlMask2) {
+
+  vector r, ri, r2, xi, wi, wi_dx;
+  vector mj;
+  vector dx, dy, dz, dvx, dvy, dvz;
+  vector vjx, vjy, vjz;
+  vector dvdr;
+  vector curlvrx, curlvry, curlvrz;
+  vector r_2, ri2, r2_2, xi2, wi2, wi_dx2;
+  vector mj2;
+  vector dx2, dy2, dz2, dvx2, dvy2, dvz2;
+  vector vjx2, vjy2, vjz2;
+  vector dvdr2;
+  vector curlvrx2, curlvry2, curlvrz2;
+
+  /* Fill the vectors. */
+  mj.v = vec_load(Mj);
+  mj2.v = vec_load(&Mj[VEC_SIZE]);
+  vjx.v = vec_load(Vjx);
+  vjx2.v = vec_load(&Vjx[VEC_SIZE]);
+  vjy.v = vec_load(Vjy);
+  vjy2.v = vec_load(&Vjy[VEC_SIZE]);
+  vjz.v = vec_load(Vjz);
+  vjz2.v = vec_load(&Vjz[VEC_SIZE]);
+  dx.v = vec_load(Dx);
+  dx2.v = vec_load(&Dx[VEC_SIZE]);
+  dy.v = vec_load(Dy);
+  dy2.v = vec_load(&Dy[VEC_SIZE]);
+  dz.v = vec_load(Dz);
+  dz2.v = vec_load(&Dz[VEC_SIZE]);
+
+  /* Get the radius and inverse radius. */
+  r2.v = vec_load(R2);
+  r2_2.v = vec_load(&R2[VEC_SIZE]);
+  ri = vec_reciprocal_sqrt(r2);
+  ri2 = vec_reciprocal_sqrt(r2_2);
+  r.v = vec_mul(r2.v, ri.v);
+  r_2.v = vec_mul(r2_2.v, ri2.v);
+
+  xi.v = vec_mul(r.v, hi_inv.v);
+  xi2.v = vec_mul(r_2.v, hi_inv.v);
+
+  /* Calculate the kernel for two particles. */
+  kernel_deval_2_vec(&xi, &wi, &wi_dx, &xi2, &wi2, &wi_dx2);
+
+  /* Compute dv. */
+  dvx.v = vec_sub(vix.v, vjx.v);
+  dvx2.v = vec_sub(vix.v, vjx2.v);
+  dvy.v = vec_sub(viy.v, vjy.v);
+  dvy2.v = vec_sub(viy.v, vjy2.v);
+  dvz.v = vec_sub(viz.v, vjz.v);
+  dvz2.v = vec_sub(viz.v, vjz2.v);
+
+  /* Compute dv dot r */
+  dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v)));
+  dvdr2.v =
+      vec_fma(dvx2.v, dx2.v, vec_fma(dvy2.v, dy2.v, vec_mul(dvz2.v, dz2.v)));
+  dvdr.v = vec_mul(dvdr.v, ri.v);
+  dvdr2.v = vec_mul(dvdr2.v, ri2.v);
+
+  /* Compute dv cross r */
+  curlvrx.v =
+      vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy.v)));
+  curlvrx2.v =
+      vec_fma(dvy2.v, dz2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz2.v, dy2.v)));
+  curlvry.v =
+      vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v)));
+  curlvry2.v =
+      vec_fma(dvz2.v, dx2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx2.v, dz2.v)));
+  curlvrz.v =
+      vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v)));
+  curlvrz2.v =
+      vec_fma(dvx2.v, dy2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy2.v, dx2.v)));
+  curlvrx.v = vec_mul(curlvrx.v, ri.v);
+  curlvrx2.v = vec_mul(curlvrx2.v, ri2.v);
+  curlvry.v = vec_mul(curlvry.v, ri.v);
+  curlvry2.v = vec_mul(curlvry2.v, ri2.v);
+  curlvrz.v = vec_mul(curlvrz.v, ri.v);
+  curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);
+
+/* Mask updates to intermediate vector sums for particle pi. */
+#ifdef HAVE_AVX512_F
+  rhoSum->v =
+      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
+  rhoSum->v =
+      _mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v);
+
+  rho_dhSum->v =
+      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
+                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
+                                               vec_mul(xi.v, wi_dx.v))));
+  rho_dhSum->v = _mm512_mask_sub_ps(
+      rho_dhSum->v, knlMask2, rho_dhSum->v,
+      vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
+                             vec_mul(xi2.v, wi_dx2.v))));
+
+  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
+  wcountSum->v =
+      _mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v);
+
+  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
+                                       wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));
+  wcount_dhSum->v = _mm512_mask_sub_ps(
+      wcount_dhSum->v, knlMask2, wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v));
+
+  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
+                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
+  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v,
+                                   vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
+
+  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
+                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
+                                    curlvxSum->v);
+  curlvxSum->v = _mm512_mask_add_ps(
+      curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)),
+      curlvxSum->v);
+
+  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
+                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
+                                    curlvySum->v);
+  curlvySum->v = _mm512_mask_add_ps(
+      curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)),
+      curlvySum->v);
+
+  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
+                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
+                                    curlvzSum->v);
+  curlvzSum->v = _mm512_mask_add_ps(
+      curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)),
+      curlvzSum->v);
+#else
+  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
+  rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v);
+  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
+                                                vec_mul(xi.v, wi_dx.v))),
+                          mask.v);
+  rho_dhSum->v -=
+      vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
+                                     vec_mul(xi2.v, wi_dx2.v))),
+              mask2.v);
+  wcountSum->v += vec_and(wi.v, mask.v);
+  wcountSum->v += vec_and(wi2.v, mask2.v);
+  wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
+  wcount_dhSum->v -= vec_and(vec_mul(xi2.v, wi_dx2.v), mask2.v);
+  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
+  div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v);
+  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
+  curlvxSum->v +=
+      vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v);
+  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
+  curlvySum->v +=
+      vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v);
+  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
+  curlvzSum->v +=
+      vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v);
+#endif
+}
+#endif
+
 /**
  * @brief Force loop
  */
@@ -492,9 +653,10 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
   vector hi, hj, hi_inv, hj_inv;
   vector hid_inv, hjd_inv;
   vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
-  vector piPOrho, pjPOrho, pirho, pjrho;
+  vector piPOrho2, pjPOrho2, pirho, pjrho;
   vector mi, mj;
   vector f;
+  vector grad_hi, grad_hj;
   vector dx[3];
   vector vi[3], vj[3];
   vector pia[3], pja[3];
@@ -512,14 +674,20 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
                  pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
   mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                  pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
-  piPOrho.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
-                      pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
-                      pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
-                      pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
-  pjPOrho.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
-                      pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
-                      pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
-                      pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
+  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
+                       pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
+                       pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
+                       pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
+  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
+                       pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
+                       pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
+                       pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
+  grad_hi.v =
+      vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f,
+              pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f);
+  grad_hj.v =
+      vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f,
+              pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f);
   pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
                     pi[5]->rho, pi[6]->rho, pi[7]->rho);
   pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
@@ -551,10 +719,14 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
 #elif VEC_SIZE == 4
   mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
   mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
-  piPOrho.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
-                      pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
-  pjPOrho.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
-                      pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
+  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
+                       pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
+  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
+                       pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
+  grad_hi.v =
+      vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f);
+  grad_hj.v =
+      vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f);
   pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
   pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
   ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
@@ -577,14 +749,12 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
 
   /* Get the radius and inverse radius. */
   r2.v = vec_load(R2);
-  ri.v = vec_rsqrt(r2.v);
-  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  ri = vec_reciprocal_sqrt(r2);
   r.v = r2.v * ri.v;
 
   /* Get the kernel for hi. */
   hi.v = vec_load(Hi);
-  hi_inv.v = vec_rcp(hi.v);
-  hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f));
+  hi_inv = vec_reciprocal(hi);
   hid_inv = pow_dimension_plus_one_vec(hi_inv); /* 1/h^(d+1) */
   xi.v = r.v * hi_inv.v;
   kernel_deval_vec(&xi, &wi, &wi_dx);
@@ -592,8 +762,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
 
   /* Get the kernel for hj. */
   hj.v = vec_load(Hj);
-  hj_inv.v = vec_rcp(hj.v);
-  hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f));
+  hj_inv = vec_reciprocal(hj);
   hjd_inv = pow_dimension_plus_one_vec(hj_inv); /* 1/h^(d+1) */
   xj.v = r.v * hj_inv.v;
   kernel_deval_vec(&xj, &wj, &wj_dx);
@@ -619,7 +788,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
 
   /* Now, convolve with the kernel */
   visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
-  sph_term.v = (piPOrho.v * wi_dr.v + pjPOrho.v * wj_dr.v) * ri.v;
+  sph_term.v =
+      (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) *
+      ri.v;
 
   /* Eventually get the acceleration */
   acc.v = visc_term.v + sph_term.v;
@@ -764,9 +935,10 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
   vector hi, hj, hi_inv, hj_inv;
   vector hid_inv, hjd_inv;
   vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
-  vector piPOrho, pjPOrho, pirho, pjrho;
+  vector piPOrho2, pjPOrho2, pirho, pjrho;
   vector mj;
   vector f;
+  vector grad_hi, grad_hj;
   vector dx[3];
   vector vi[3], vj[3];
   vector pia[3];
@@ -782,14 +954,20 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
 #if VEC_SIZE == 8
   mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                  pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
-  piPOrho.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
-                      pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
-                      pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
-                      pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
-  pjPOrho.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
-                      pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
-                      pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
-                      pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
+  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
+                       pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
+                       pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
+                       pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
+  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
+                       pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
+                       pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
+                       pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
+  grad_hi.v =
+      vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f,
+              pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f);
+  grad_hj.v =
+      vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f,
+              pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f);
   pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
                     pi[5]->rho, pi[6]->rho, pi[7]->rho);
   pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
@@ -820,10 +998,14 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
               pj[6]->force.balsara, pj[7]->force.balsara);
 #elif VEC_SIZE == 4
   mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
-  piPOrho.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
-                      pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
-  pjPOrho.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
-                      pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
+  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
+                       pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
+  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
+                       pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
+  grad_hi.v =
+      vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f);
+  grad_hj.v =
+      vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f);
   pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
   pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
   ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
@@ -846,14 +1028,12 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
 
   /* Get the radius and inverse radius. */
   r2.v = vec_load(R2);
-  ri.v = vec_rsqrt(r2.v);
-  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
+  ri = vec_reciprocal_sqrt(r2);
   r.v = r2.v * ri.v;
 
   /* Get the kernel for hi. */
   hi.v = vec_load(Hi);
-  hi_inv.v = vec_rcp(hi.v);
-  hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f));
+  hi_inv = vec_reciprocal(hi);
   hid_inv = pow_dimension_plus_one_vec(hi_inv);
   xi.v = r.v * hi_inv.v;
   kernel_deval_vec(&xi, &wi, &wi_dx);
@@ -861,8 +1041,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
 
   /* Get the kernel for hj. */
   hj.v = vec_load(Hj);
-  hj_inv.v = vec_rcp(hj.v);
-  hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f));
+  hj_inv = vec_reciprocal(hj);
   hjd_inv = pow_dimension_plus_one_vec(hj_inv);
   xj.v = r.v * hj_inv.v;
   kernel_deval_vec(&xj, &wj, &wj_dx);
@@ -888,7 +1067,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
 
   /* Now, convolve with the kernel */
   visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
-  sph_term.v = (piPOrho.v * wi_dr.v + pjPOrho.v * wj_dr.v) * ri.v;
+  sph_term.v =
+      (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) *
+      ri.v;
 
   /* Eventually get the acceleration */
   acc.v = visc_term.v + sph_term.v;
diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h
index 8f38fc0d2b98988a48fe36edcbd2f9419d237d41..7bf2e01a719a29b731bb437096093b13ca086e37 100644
--- a/src/kernel_hydro.h
+++ b/src/kernel_hydro.h
@@ -362,6 +362,117 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
       dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
 }
 
+/* Define constant vectors for the Wendland C2 kernel coefficients. */
+#ifdef WENDLAND_C2_KERNEL
+static const vector wendland_const_c0 = FILL_VEC(4.f);
+static const vector wendland_const_c1 = FILL_VEC(-15.f);
+static const vector wendland_const_c2 = FILL_VEC(20.f);
+static const vector wendland_const_c3 = FILL_VEC(-10.f);
+static const vector wendland_const_c4 = FILL_VEC(0.f);
+static const vector wendland_const_c5 = FILL_VEC(1.f);
+#endif
+
+/**
+ * @brief Computes the kernel function and its derivative for two particles
+ * using interleaved vectors.
+ *
+ * Return 0 if $u > \\gamma = H/h$
+ *
+ * @param u The ratio of the distance to the smoothing length $u = x/h$.
+ * @param w (return) The value of the kernel function $W(x,h)$.
+ * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
+ * @param u2 The ratio of the distance to the smoothing length $u = x/h$ for
+ * second particle.
+ * @param w2 (return) The value of the kernel function $W(x,h)$ for second
+ * particle.
+ * @param dw_dx2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for
+ * second particle.
+ */
+__attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
+    vector *u, vector *w, vector *dw_dx, vector *u2, vector *w2,
+    vector *dw_dx2) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  vector x, x2;
+  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
+  x2.v = vec_mul(u2->v, kernel_gamma_inv_vec.v);
+
+#ifdef WENDLAND_C2_KERNEL
+  /* Init the iteration for Horner's scheme. */
+  w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v);
+  w2->v = vec_fma(wendland_const_c0.v, x2.v, wendland_const_c1.v);
+  dw_dx->v = wendland_const_c0.v;
+  dw_dx2->v = wendland_const_c0.v;
+
+  /* Calculate the polynomial interleaving vector operations */
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
+  w->v = vec_fma(x.v, w->v, wendland_const_c2.v);
+  w2->v = vec_fma(x2.v, w2->v, wendland_const_c2.v);
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
+  w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
+  w2->v = vec_fma(x2.v, w2->v, wendland_const_c3.v);
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
+  w->v = vec_fma(x.v, w->v, wendland_const_c4.v);
+  w2->v = vec_fma(x2.v, w2->v, wendland_const_c4.v);
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
+  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
+  w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
+  w2->v = vec_fma(x2.v, w2->v, wendland_const_c5.v);
+
+  /* Return everything */
+  w->v =
+      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  w2->v = vec_mul(w2->v,
+                  vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
+  dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v,
+                                         kernel_gamma_inv_dim_plus_one_vec.v));
+#else
+
+  /* Load x and get the interval id. */
+  vector ind, ind2;
+  ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v));
+  ind2.m = vec_ftoi(vec_fmin(x2.v * kernel_ivals_vec.v, kernel_ivals_vec.v));
+
+  /* load the coefficients. */
+  vector c[kernel_degree + 1], c2[kernel_degree + 1];
+  for (int k = 0; k < VEC_SIZE; k++)
+    for (int j = 0; j < kernel_degree + 1; j++) {
+      c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
+      c2[j].f[k] = kernel_coeffs[ind2.i[k] * (kernel_degree + 1) + j];
+    }
+
+  /* Init the iteration for Horner's scheme. */
+  w->v = (c[0].v * x.v) + c[1].v;
+  w2->v = (c2[0].v * x2.v) + c2[1].v;
+  dw_dx->v = c[0].v;
+  dw_dx2->v = c2[0].v;
+
+  /* And we're off! */
+  for (int k = 2; k <= kernel_degree; k++) {
+    dw_dx->v = (dw_dx->v * x.v) + w->v;
+    dw_dx2->v = (dw_dx2->v * x2.v) + w2->v;
+    w->v = (x.v * w->v) + c[k].v;
+    w2->v = (x2.v * w2->v) + c2[k].v;
+  }
+  /* Return everything */
+  w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
+  w2->v = w2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
+  dw_dx->v =
+      dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
+  dw_dx2->v =
+      dw_dx2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
+
+#endif
+}
+
 #endif
 
 /* Some cross-check functions */
diff --git a/src/runner.c b/src/runner.c
index 25ff384ecaabfdad7231b8bcd5f9a5b7e90ac732..23e5c8d2d920023394f466c60deec821bca4d332 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -53,6 +53,7 @@
 #include "hydro_properties.h"
 #include "kick.h"
 #include "minmax.h"
+#include "runner_doiact_vec.h"
 #include "scheduler.h"
 #include "sourceterms.h"
 #include "space.h"
@@ -1093,7 +1094,13 @@ void *runner_main(void *data) {
       /* Different types of tasks... */
       switch (t->type) {
         case task_type_self:
-          if (t->subtype == task_subtype_density) runner_doself1_density(r, ci);
+          if (t->subtype == task_subtype_density) {
+#ifdef WITH_VECTORIZATION
+            runner_doself1_density_vec(r, ci);
+#else
+            runner_doself1_density(r, ci);
+#endif
+          }
 #ifdef EXTRA_HYDRO_LOOP
           else if (t->subtype == task_subtype_gradient)
             runner_doself1_gradient(r, ci);
diff --git a/src/runner.h b/src/runner.h
index 6aab8469a1eb9aba22dac31553424bbcce8f183f..2ee4b3e4c4d5c93a1180af245e18b4b68a2db188 100644
--- a/src/runner.h
+++ b/src/runner.h
@@ -23,6 +23,8 @@
 #ifndef SWIFT_RUNNER_H
 #define SWIFT_RUNNER_H
 
+#include "cache.h"
+
 extern const double runner_shift[13][3];
 extern const char runner_flip[27];
 
@@ -45,6 +47,9 @@ struct runner {
 
   /*! The engine owing this runner. */
   struct engine *e;
+
+  /*! The particle cache of this runner. */
+  struct cache par_cache;
 };
 
 /* Function prototypes. */
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
new file mode 100644
index 0000000000000000000000000000000000000000..5db0ec6f89168f2d8cc9f4da574aeaab9ba54ca1
--- /dev/null
+++ b/src/runner_doiact_vec.c
@@ -0,0 +1,868 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/* Config parameters. */
+#include "../config.h"
+
+/* This object's header. */
+#include "runner_doiact_vec.h"
+
+#ifdef WITH_VECTORIZATION
+/**
+ * @brief Compute the vector remainder interactions from the secondary cache.
+ *
+ * @param int_cache (return) secondary #cache of interactions between two
+ * particles.
+ * @param icount Interaction count.
+ * @param rhoSum (return) #vector holding the cumulative sum of the density
+ * update on pi.
+ * @param rho_dhSum (return) #vector holding the cumulative sum of the density
+ * gradient update on pi.
+ * @param wcountSum (return) #vector holding the cumulative sum of the wcount
+ * update on pi.
+ * @param wcount_dhSum (return) #vector holding the cumulative sum of the wcount
+ * gradient update on pi.
+ * @param div_vSum (return) #vector holding the cumulative sum of the divergence
+ * update on pi.
+ * @param curlvxSum (return) #vector holding the cumulative sum of the curl of
+ * vx update on pi.
+ * @param curlvySum (return) #vector holding the cumulative sum of the curl of
+ * vy update on pi.
+ * @param curlvzSum (return) #vector holding the cumulative sum of the curl of
+ * vz update on pi.
+ * @param v_hi_inv #vector of 1/h for pi.
+ * @param v_vix #vector of x velocity of pi.
+ * @param v_viy #vector of y velocity of pi.
+ * @param v_viz #vector of z velocity of pi.
+ * @param icount_align (return) Interaction count after the remainder
+ * interactions have been performed, should be a multiple of the vector length.
+ */
+__attribute__((always_inline)) INLINE static void calcRemInteractions(
+    struct c2_cache *const int_cache, const int icount, vector *rhoSum,
+    vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
+    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
+    vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz,
+    int *icount_align) {
+
+#ifdef HAVE_AVX512_F
+  KNL_MASK_16 knl_mask, knl_mask2;
+#endif
+  vector int_mask, int_mask2;
+
+  /* Work out the number of remainder interactions and pad secondary cache. */
+  *icount_align = icount;
+  int rem = icount % (NUM_VEC_PROC * VEC_SIZE);
+  if (rem != 0) {
+    int pad = (NUM_VEC_PROC * VEC_SIZE) - rem;
+    *icount_align += pad;
+
+/* Initialise masks to true. */
+#ifdef HAVE_AVX512_F
+    knl_mask = 0xFFFF;
+    knl_mask2 = 0xFFFF;
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
+#else
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
+#endif
+    /* Pad secondary cache so that there are no contributions in the interaction
+     * function. */
+    for (int i = icount; i < *icount_align; i++) {
+      int_cache->mq[i] = 0.f;
+      int_cache->r2q[i] = 1.f;
+      int_cache->dxq[i] = 0.f;
+      int_cache->dyq[i] = 0.f;
+      int_cache->dzq[i] = 0.f;
+      int_cache->vxq[i] = 0.f;
+      int_cache->vyq[i] = 0.f;
+      int_cache->vzq[i] = 0.f;
+    }
+
+    /* Zero parts of mask that represent the padded values.*/
+    if (pad < VEC_SIZE) {
+#ifdef HAVE_AVX512_F
+      knl_mask2 = knl_mask2 >> pad;
+#else
+      for (int i = VEC_SIZE - pad; i < VEC_SIZE; i++) int_mask2.i[i] = 0;
+#endif
+    } else {
+#ifdef HAVE_AVX512_F
+      knl_mask = knl_mask >> (VEC_SIZE - rem);
+      knl_mask2 = 0;
+#else
+      for (int i = rem; i < VEC_SIZE; i++) int_mask.i[i] = 0;
+      int_mask2.v = vec_setzero();
+#endif
+    }
+
+    /* Perform remainder interaction and remove remainder from aligned
+     * interaction count. */
+    *icount_align = icount - rem;
+    runner_iact_nonsym_2_vec_density(
+        &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align],
+        &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align],
+        v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align],
+        &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align],
+        &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum,
+        wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask,
+        int_mask2,
+#ifdef HAVE_AVX512_F
+        knl_mask, knl_mask2);
+#else
+        0, 0);
+#endif
+  }
+}
+
+/**
+ * @brief Left-packs the values needed by an interaction into the secondary
+ * cache (Supports AVX, AVX2 and AVX512 instruction sets).
+ *
+ * @param mask Contains which particles need to interact.
+ * @param pjd Index of the particle to store into.
+ * @param v_r2 #vector of the separation between two particles squared.
+ * @param v_dx #vector of the x separation between two particles.
+ * @param v_dy #vector of the y separation between two particles.
+ * @param v_dz #vector of the z separation between two particles.
+ * @param v_mj #vector of the mass of particle pj.
+ * @param v_vjx #vector of x velocity of pj.
+ * @param v_vjy #vector of y velocity of pj.
+ * @param v_vjz #vector of z velocity of pj.
+ * @param cell_cache #cache of all particles in the cell.
+ * @param int_cache (return) secondary #cache of interactions between two
+ * particles.
+ * @param icount Interaction count.
+ * @param rhoSum #vector holding the cumulative sum of the density update on pi.
+ * @param rho_dhSum #vector holding the cumulative sum of the density gradient
+ * update on pi.
+ * @param wcountSum #vector holding the cumulative sum of the wcount update on
+ * pi.
+ * @param wcount_dhSum #vector holding the cumulative sum of the wcount gradient
+ * update on pi.
+ * @param div_vSum #vector holding the cumulative sum of the divergence update
+ * on pi.
+ * @param curlvxSum #vector holding the cumulative sum of the curl of vx update
+ * on pi.
+ * @param curlvySum #vector holding the cumulative sum of the curl of vy update
+ * on pi.
+ * @param curlvzSum #vector holding the cumulative sum of the curl of vz update
+ * on pi.
+ * @param v_hi_inv #vector of 1/h for pi.
+ * @param v_vix #vector of x velocity of pi.
+ * @param v_viy #vector of y velocity of pi.
+ * @param v_viz #vector of z velocity of pi.
+ */
+__attribute__((always_inline)) INLINE static void storeInteractions(
+    const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
+    vector *v_dz, vector *v_mj, vector *v_vjx, vector *v_vjy, vector *v_vjz,
+    const struct cache *const cell_cache, struct c2_cache *const int_cache,
+    int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum,
+    vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,
+    vector *curlvySum, vector *curlvzSum, vector v_hi_inv, vector v_vix,
+    vector v_viy, vector v_viz) {
+
+/* Left-pack values needed into the secondary cache using the interaction mask.
+ */
+#if defined(HAVE_AVX2) || defined(HAVE_AVX512_F)
+  int pack = 0;
+
+#ifdef HAVE_AVX512_F
+  pack += __builtin_popcount(mask);
+  VEC_LEFT_PACK(v_r2->v, mask, &int_cache->r2q[*icount]);
+  VEC_LEFT_PACK(v_dx->v, mask, &int_cache->dxq[*icount]);
+  VEC_LEFT_PACK(v_dy->v, mask, &int_cache->dyq[*icount]);
+  VEC_LEFT_PACK(v_dz->v, mask, &int_cache->dzq[*icount]);
+  VEC_LEFT_PACK(v_mj->v, mask, &int_cache->mq[*icount]);
+  VEC_LEFT_PACK(v_vjx->v, mask, &int_cache->vxq[*icount]);
+  VEC_LEFT_PACK(v_vjy->v, mask, &int_cache->vyq[*icount]);
+  VEC_LEFT_PACK(v_vjz->v, mask, &int_cache->vzq[*icount]);
+#else
+  vector v_mask;
+  VEC_FORM_PACKED_MASK(mask, v_mask.m, pack);
+
+  VEC_LEFT_PACK(v_r2->v, v_mask.m, &int_cache->r2q[*icount]);
+  VEC_LEFT_PACK(v_dx->v, v_mask.m, &int_cache->dxq[*icount]);
+  VEC_LEFT_PACK(v_dy->v, v_mask.m, &int_cache->dyq[*icount]);
+  VEC_LEFT_PACK(v_dz->v, v_mask.m, &int_cache->dzq[*icount]);
+  VEC_LEFT_PACK(v_mj->v, v_mask.m, &int_cache->mq[*icount]);
+  VEC_LEFT_PACK(v_vjx->v, v_mask.m, &int_cache->vxq[*icount]);
+  VEC_LEFT_PACK(v_vjy->v, v_mask.m, &int_cache->vyq[*icount]);
+  VEC_LEFT_PACK(v_vjz->v, v_mask.m, &int_cache->vzq[*icount]);
+
+#endif /* HAVE_AVX512_F */
+
+  (*icount) += pack;
+#else
+  /* Quicker to do it serially in AVX rather than use intrinsics. */
+  for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
+    if (mask & (1 << bit_index)) {
+      /* Add this interaction to the queue. */
+      int_cache->r2q[*icount] = v_r2->f[bit_index];
+      int_cache->dxq[*icount] = v_dx->f[bit_index];
+      int_cache->dyq[*icount] = v_dy->f[bit_index];
+      int_cache->dzq[*icount] = v_dz->f[bit_index];
+      int_cache->mq[*icount] = cell_cache->m[pjd + bit_index];
+      int_cache->vxq[*icount] = cell_cache->vx[pjd + bit_index];
+      int_cache->vyq[*icount] = cell_cache->vy[pjd + bit_index];
+      int_cache->vzq[*icount] = cell_cache->vz[pjd + bit_index];
+
+      (*icount)++;
+    }
+  }
+
+#endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */
+
+  /* Flush the c2 cache if it has reached capacity. */
+  if (*icount >= (C2_CACHE_SIZE - (NUM_VEC_PROC * VEC_SIZE))) {
+
+    int icount_align = *icount;
+
+    /* Peform remainder interactions. */
+    calcRemInteractions(int_cache, *icount, rhoSum, rho_dhSum, wcountSum,
+                        wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum,
+                        v_hi_inv, v_vix, v_viy, v_viz, &icount_align);
+
+    vector int_mask, int_mask2;
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
+
+    /* Perform interactions. */
+    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
+      runner_iact_nonsym_2_vec_density(
+          &int_cache->r2q[pjd], &int_cache->dxq[pjd], &int_cache->dyq[pjd],
+          &int_cache->dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
+          &int_cache->vxq[pjd], &int_cache->vyq[pjd], &int_cache->vzq[pjd],
+          &int_cache->mq[pjd], rhoSum, rho_dhSum, wcountSum, wcount_dhSum,
+          div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0, 0);
+    }
+
+    /* Reset interaction count. */
+    *icount = 0;
+  }
+}
+#endif /* WITH_VECTORIZATION */
+
+/**
+ * @brief Compute the cell self-interaction (non-symmetric) using vector
+ * intrinsics with one particle pi at a time.
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+__attribute__((always_inline)) INLINE void runner_doself1_density_vec(
+    struct runner *r, struct cell *restrict c) {
+
+#ifdef WITH_VECTORIZATION
+  const int ti_current = r->e->ti_current;
+  int doi_mask;
+  struct part *restrict pi;
+  int count_align;
+  int num_vec_proc = NUM_VEC_PROC;
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;
+
+  TIMER_TIC
+
+  if (c->ti_end_min > ti_current) return;
+  if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone");
+
+  /* Get the particle cache from the runner and re-allocate
+   * the cache if it is not big enough for the cell. */
+  struct cache *restrict cell_cache = &r->par_cache;
+
+  if (cell_cache->count < count) {
+    cache_init(cell_cache, count);
+  }
+
+  /* Read the particles from the cell and store them locally in the cache. */
+  cache_read_particles(c, cell_cache);
+
+  /* Create secondary cache to store particle interactions. */
+  struct c2_cache int_cache;
+  int icount = 0, icount_align = 0;
+
+  /* Loop over the particles in the cell. */
+  for (int pid = 0; pid < count; pid++) {
+
+    /* Get a pointer to the ith particle. */
+    pi = &parts[pid];
+
+    /* Is the ith particle active? */
+    if (pi->ti_end > ti_current) continue;
+
+    vector pix, piy, piz;
+
+    const float hi = cell_cache->h[pid];
+
+    /* Fill particle pi vectors. */
+    pix.v = vec_set1(cell_cache->x[pid]);
+    piy.v = vec_set1(cell_cache->y[pid]);
+    piz.v = vec_set1(cell_cache->z[pid]);
+    v_hi.v = vec_set1(hi);
+    v_vix.v = vec_set1(cell_cache->vx[pid]);
+    v_viy.v = vec_set1(cell_cache->vy[pid]);
+    v_viz.v = vec_set1(cell_cache->vz[pid]);
+
+    const float hig2 = hi * hi * kernel_gamma2;
+    v_hig2.v = vec_set1(hig2);
+
+    /* Reset cumulative sums of update vectors. */
+    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
+        curlvySum, curlvzSum;
+
+    /* Get the inverse of hi. */
+    vector v_hi_inv;
+
+    v_hi_inv = vec_reciprocal(v_hi);
+
+    rhoSum.v = vec_setzero();
+    rho_dhSum.v = vec_setzero();
+    wcountSum.v = vec_setzero();
+    wcount_dhSum.v = vec_setzero();
+    div_vSum.v = vec_setzero();
+    curlvxSum.v = vec_setzero();
+    curlvySum.v = vec_setzero();
+    curlvzSum.v = vec_setzero();
+
+    /* Pad cache if there is a serial remainder. */
+    count_align = count;
+    int rem = count % (num_vec_proc * VEC_SIZE);
+    if (rem != 0) {
+      int pad = (num_vec_proc * VEC_SIZE) - rem;
+
+      count_align += pad;
+      /* Set positions to the same as particle pi so when the r2 > 0 mask is
+       * applied these extra contributions are masked out.*/
+      for (int i = count; i < count_align; i++) {
+        cell_cache->x[i] = pix.f[0];
+        cell_cache->y[i] = piy.f[0];
+        cell_cache->z[i] = piz.f[0];
+      }
+    }
+
+    vector pjx, pjy, pjz;
+    vector pjvx, pjvy, pjvz, mj;
+    vector pjx2, pjy2, pjz2;
+    vector pjvx2, pjvy2, pjvz2, mj2;
+
+    /* Find all of particle pi's interacions and store needed values in the
+     * secondary cache.*/
+    for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
+
+      /* Load 2 sets of vectors from the particle cache. */
+      pjx.v = vec_load(&cell_cache->x[pjd]);
+      pjy.v = vec_load(&cell_cache->y[pjd]);
+      pjz.v = vec_load(&cell_cache->z[pjd]);
+      pjvx.v = vec_load(&cell_cache->vx[pjd]);
+      pjvy.v = vec_load(&cell_cache->vy[pjd]);
+      pjvz.v = vec_load(&cell_cache->vz[pjd]);
+      mj.v = vec_load(&cell_cache->m[pjd]);
+
+      pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
+      pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
+      pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
+      pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]);
+      pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]);
+      pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]);
+      mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]);
+
+      /* Compute the pairwise distance. */
+      vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
+      vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2;
+
+      v_dx_tmp.v = vec_sub(pix.v, pjx.v);
+      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
+      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
+      v_dx_tmp2.v = vec_sub(pix.v, pjx2.v);
+      v_dy_tmp2.v = vec_sub(piy.v, pjy2.v);
+      v_dz_tmp2.v = vec_sub(piz.v, pjz2.v);
+
+      v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
+      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
+      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
+      v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v);
+      v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v);
+      v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
+
+/* Form a mask from r2 < hig2 and r2 > 0.*/
+#ifdef HAVE_AVX512_F
+      // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check;
+      KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check;
+
+      doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
+      doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
+
+      doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+
+      doi_mask = doi_mask & doi_mask_check;
+      doi_mask2 = doi_mask2 & doi_mask2_check;
+
+#else
+      vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check;
+      int doi_mask2;
+
+      /* Form r2 > 0 mask and r2 < hig2 mask. */
+      v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero());
+      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+
+      /* Form r2 > 0 mask and r2 < hig2 mask. */
+      v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+
+      /* Combine two masks and form integer mask. */
+      doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v));
+      doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v));
+#endif /* HAVE_AVX512_F */
+
+      /* If there are any interactions left pack interaction values into c2
+       * cache. */
+      if (doi_mask) {
+        storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp,
+                          &mj, &pjvx, &pjvy, &pjvz, cell_cache, &int_cache,
+                          &icount, &rhoSum, &rho_dhSum, &wcountSum,
+                          &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
+                          &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
+      }
+      if (doi_mask2) {
+        storeInteractions(
+            doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, &v_dy_tmp2,
+            &v_dz_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2, cell_cache, &int_cache,
+            &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
+            &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
+      }
+    }
+
+    /* Perform padded vector remainder interactions if any are present. */
+    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum,
+                        &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
+                        &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
+                        &icount_align);
+
+    /* Initialise masks to true in case remainder interactions have been
+     * performed. */
+    vector int_mask, int_mask2;
+#ifdef HAVE_AVX512_F
+    KNL_MASK_16 knl_mask = 0xFFFF;
+    KNL_MASK_16 knl_mask2 = 0xFFFF;
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
+#else
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
+#endif
+
+    /* Perform interaction with 2 vectors. */
+    for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) {
+      runner_iact_nonsym_2_vec_density(
+          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
+          &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
+          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
+          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
+          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
+#ifdef HAVE_AVX512_F
+          knl_mask, knl_mask2);
+#else
+          0, 0);
+#endif
+    }
+
+    /* Perform horizontal adds on vector sums and store result in particle pi.
+     */
+    VEC_HADD(rhoSum, pi->rho);
+    VEC_HADD(rho_dhSum, pi->density.rho_dh);
+    VEC_HADD(wcountSum, pi->density.wcount);
+    VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
+    VEC_HADD(div_vSum, pi->density.div_v);
+    VEC_HADD(curlvxSum, pi->density.rot_v[0]);
+    VEC_HADD(curlvySum, pi->density.rot_v[1]);
+    VEC_HADD(curlvzSum, pi->density.rot_v[2]);
+
+    /* Reset interaction count. */
+    icount = 0;
+  } /* loop over all particles. */
+
+  TIMER_TOC(timer_doself_density);
+#endif /* WITH_VECTORIZATION */
+}
+
+/**
+ * @brief Compute the cell self-interaction (non-symmetric) using vector
+ * intrinsics with two particle pis at a time.
+ * CURRENTLY BROKEN DO NOT USE.
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+__attribute__((always_inline)) INLINE void runner_doself1_density_vec_2(
+    struct runner *r, struct cell *restrict c) {
+
+#ifdef WITH_VECTORIZATION
+  const int ti_current = r->e->ti_current;
+  int doi_mask;
+  int doi2_mask;
+  struct part *restrict pi;
+  struct part *restrict pi2;
+  int count_align;
+
+  vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;
+  vector v_hi2, v_vix2, v_viy2, v_viz2, v_hig2_2, v2_r2;
+
+  TIMER_TIC
+
+  /* TODO: Need to find two active particles, not just one. */
+  if (c->ti_end_min > ti_current) return;
+  if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone");
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  /* Get the particle cache from the runner and re-allocate
+   * the cache if it is not big enough for the cell. */
+  struct cache *restrict cell_cache = &r->par_cache;
+
+  if (cell_cache->count < count) {
+    cache_init(cell_cache, count);
+  }
+
+  /* Read the particles from the cell and store them locally in the cache. */
+  cache_read_particles(c, &r->par_cache);
+
+  /* Create two secondary caches. */
+  int icount = 0, icount_align = 0;
+  struct c2_cache int_cache;
+
+  int icount2 = 0, icount_align2 = 0;
+  struct c2_cache int_cache2;
+
+  /* Loop over the particles in the cell. */
+  for (int pid = 0; pid < count; pid += 2) {
+
+    /* Get a pointer to the ith particle and next i particle. */
+    pi = &parts[pid];
+    pi2 = &parts[pid + 1];
+
+    /* Is the ith particle active? */
+    if (pi->ti_end > ti_current) continue;
+
+    vector pix, piy, piz;
+    vector pix2, piy2, piz2;
+
+    const float hi = cell_cache->h[pid];
+    const float hi2 = cell_cache->h[pid + 1];
+
+    /* Fill pi position vector. */
+    pix.v = vec_set1(cell_cache->x[pid]);
+    piy.v = vec_set1(cell_cache->y[pid]);
+    piz.v = vec_set1(cell_cache->z[pid]);
+    v_hi.v = vec_set1(hi);
+    v_vix.v = vec_set1(cell_cache->vx[pid]);
+    v_viy.v = vec_set1(cell_cache->vy[pid]);
+    v_viz.v = vec_set1(cell_cache->vz[pid]);
+
+    pix2.v = vec_set1(cell_cache->x[pid + 1]);
+    piy2.v = vec_set1(cell_cache->y[pid + 1]);
+    piz2.v = vec_set1(cell_cache->z[pid + 1]);
+    v_hi2.v = vec_set1(hi2);
+    v_vix2.v = vec_set1(cell_cache->vx[pid + 1]);
+    v_viy2.v = vec_set1(cell_cache->vy[pid + 1]);
+    v_viz2.v = vec_set1(cell_cache->vz[pid + 1]);
+
+    const float hig2 = hi * hi * kernel_gamma2;
+    const float hig2_2 = hi2 * hi2 * kernel_gamma2;
+    v_hig2.v = vec_set1(hig2);
+    v_hig2_2.v = vec_set1(hig2_2);
+
+    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
+        curlvySum, curlvzSum;
+    vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2,
+        curlvxSum2, curlvySum2, curlvzSum2;
+
+    vector v_hi_inv, v_hi_inv2;
+
+    v_hi_inv = vec_reciprocal(v_hi);
+    v_hi_inv2 = vec_reciprocal(v_hi2);
+
+    rhoSum.v = vec_setzero();
+    rho_dhSum.v = vec_setzero();
+    wcountSum.v = vec_setzero();
+    wcount_dhSum.v = vec_setzero();
+    div_vSum.v = vec_setzero();
+    curlvxSum.v = vec_setzero();
+    curlvySum.v = vec_setzero();
+    curlvzSum.v = vec_setzero();
+
+    rhoSum2.v = vec_setzero();
+    rho_dhSum2.v = vec_setzero();
+    wcountSum2.v = vec_setzero();
+    wcount_dhSum2.v = vec_setzero();
+    div_vSum2.v = vec_setzero();
+    curlvxSum2.v = vec_setzero();
+    curlvySum2.v = vec_setzero();
+    curlvzSum2.v = vec_setzero();
+
+    /* Pad cache if there is a serial remainder. */
+    count_align = count;
+    int rem = count % (NUM_VEC_PROC * VEC_SIZE);
+    if (rem != 0) {
+      int pad = (NUM_VEC_PROC * VEC_SIZE) - rem;
+
+      count_align += pad;
+      /* Set positions to the same as particle pi so when the r2 > 0 mask is
+       * applied these extra contributions are masked out.*/
+      for (int i = count; i < count_align; i++) {
+        cell_cache->x[i] = pix.f[0];
+        cell_cache->y[i] = piy.f[0];
+        cell_cache->z[i] = piz.f[0];
+      }
+    }
+
+    vector pjx, pjy, pjz;
+    vector pjvx, pjvy, pjvz, mj;
+    vector pjx2, pjy2, pjz2;
+    vector pjvx2, pjvy2, pjvz2, mj2;
+
+    /* Find all of particle pi's interacions and store needed values in
+     * secondary cache.*/
+    for (int pjd = 0; pjd < count_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
+
+      /* Load 2 sets of vectors from the particle cache. */
+      pjx.v = vec_load(&cell_cache->x[pjd]);
+      pjy.v = vec_load(&cell_cache->y[pjd]);
+      pjz.v = vec_load(&cell_cache->z[pjd]);
+      pjvx.v = vec_load(&cell_cache->vx[pjd]);
+      pjvy.v = vec_load(&cell_cache->vy[pjd]);
+      pjvz.v = vec_load(&cell_cache->vz[pjd]);
+      mj.v = vec_load(&cell_cache->m[pjd]);
+
+      pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
+      pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
+      pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
+      pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]);
+      pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]);
+      pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]);
+      mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]);
+
+      /* Compute the pairwise distance. */
+      vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
+      vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2;
+      vector v_dx2_tmp, v_dy2_tmp, v_dz2_tmp;
+      vector v_dx2_tmp2, v_dy2_tmp2, v_dz2_tmp2, v2_r2_2;
+
+      v_dx_tmp.v = vec_sub(pix.v, pjx.v);
+      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
+      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
+      v_dx_tmp2.v = vec_sub(pix.v, pjx2.v);
+      v_dy_tmp2.v = vec_sub(piy.v, pjy2.v);
+      v_dz_tmp2.v = vec_sub(piz.v, pjz2.v);
+
+      v_dx2_tmp.v = vec_sub(pix2.v, pjx.v);
+      v_dy2_tmp.v = vec_sub(piy2.v, pjy.v);
+      v_dz2_tmp.v = vec_sub(piz2.v, pjz.v);
+      v_dx2_tmp2.v = vec_sub(pix2.v, pjx2.v);
+      v_dy2_tmp2.v = vec_sub(piy2.v, pjy2.v);
+      v_dz2_tmp2.v = vec_sub(piz2.v, pjz2.v);
+
+      v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
+      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
+      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
+      v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v);
+      v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v);
+      v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
+
+      v2_r2.v = vec_mul(v_dx2_tmp.v, v_dx2_tmp.v);
+      v2_r2.v = vec_fma(v_dy2_tmp.v, v_dy2_tmp.v, v2_r2.v);
+      v2_r2.v = vec_fma(v_dz2_tmp.v, v_dz2_tmp.v, v2_r2.v);
+      v2_r2_2.v = vec_mul(v_dx2_tmp2.v, v_dx2_tmp2.v);
+      v2_r2_2.v = vec_fma(v_dy2_tmp2.v, v_dy2_tmp2.v, v2_r2_2.v);
+      v2_r2_2.v = vec_fma(v_dz2_tmp2.v, v_dz2_tmp2.v, v2_r2_2.v);
+
+/* Form a mask from r2 < hig2 and r2 > 0.*/
+#ifdef HAVE_AVX512_F
+      // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check;
+      KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check;
+      KNL_MASK_16 doi2_mask_check, doi2_mask2, doi2_mask2_check;
+
+      doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
+      doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
+
+      doi2_mask_check = vec_cmp_gt(v2_r2.v, vec_setzero());
+      doi2_mask = vec_cmp_lt(v2_r2.v, v_hig2_2.v);
+
+      doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+
+      doi2_mask2_check = vec_cmp_gt(v2_r2_2.v, vec_setzero());
+      doi2_mask2 = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v);
+
+      doi_mask = doi_mask & doi_mask_check;
+      doi_mask2 = doi_mask2 & doi_mask2_check;
+
+      doi2_mask = doi2_mask & doi2_mask_check;
+      doi2_mask2 = doi2_mask2 & doi2_mask2_check;
+#else
+      vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check;
+      int doi_mask2;
+
+      vector v_doi2_mask, v_doi2_mask_check, v_doi2_mask2, v_doi2_mask2_check;
+      int doi2_mask2;
+
+      v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero());
+      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+
+      v_doi2_mask_check.v = vec_cmp_gt(v2_r2.v, vec_setzero());
+      v_doi2_mask.v = vec_cmp_lt(v2_r2.v, v_hig2_2.v);
+
+      v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+
+      v_doi2_mask2_check.v = vec_cmp_gt(v2_r2_2.v, vec_setzero());
+      v_doi2_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v);
+
+      doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v));
+      doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v));
+      doi2_mask = vec_cmp_result(vec_and(v_doi2_mask.v, v_doi2_mask_check.v));
+      doi2_mask2 =
+          vec_cmp_result(vec_and(v_doi2_mask2.v, v_doi2_mask2_check.v));
+#endif /* HAVE_AVX512_F */
+
+      /* Hit or miss? */
+      // if (doi_mask) {
+      storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp,
+                        &mj, &pjvx, &pjvy, &pjvz, cell_cache, &int_cache,
+                        &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
+                        &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv,
+                        v_vix, v_viy, v_viz);
+      //}
+      // if (doi2_mask) {
+      storeInteractions(
+          doi2_mask, pjd, &v2_r2, &v_dx2_tmp, &v_dy2_tmp, &v_dz2_tmp, &mj,
+          &pjvx, &pjvy, &pjvz, cell_cache, &int_cache2, &icount2, &rhoSum2,
+          &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2,
+          &curlvySum2, &curlvzSum2, v_hi_inv2, v_vix2, v_viy2, v_viz2);
+      //}
+      /* Hit or miss? */
+      // if (doi_mask2) {
+      storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2,
+                        &v_dy_tmp2, &v_dz_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2,
+                        cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum,
+                        &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
+                        &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
+      //}
+      // if (doi2_mask2) {
+      storeInteractions(doi2_mask2, pjd + VEC_SIZE, &v2_r2_2, &v_dx2_tmp2,
+                        &v_dy2_tmp2, &v_dz2_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2,
+                        cell_cache, &int_cache2, &icount2, &rhoSum2,
+                        &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2,
+                        &curlvxSum2, &curlvySum2, &curlvzSum2, v_hi_inv2,
+                        v_vix2, v_viy2, v_viz2);
+      //}
+    }
+
+    /* Perform padded vector remainder interactions if any are present. */
+    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum,
+                        &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
+                        &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
+                        &icount_align);
+
+    calcRemInteractions(&int_cache2, icount2, &rhoSum2, &rho_dhSum2,
+                        &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2,
+                        &curlvySum2, &curlvzSum2, v_hi_inv2, v_vix2, v_viy2,
+                        v_viz2, &icount_align2);
+
+    /* Initialise masks to true incase remainder interactions have been
+     * performed. */
+    vector int_mask, int_mask2;
+    vector int2_mask, int2_mask2;
+#ifdef HAVE_AVX512_F
+    KNL_MASK_16 knl_mask = 0xFFFF;
+    KNL_MASK_16 knl_mask2 = 0xFFFF;
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
+    int2_mask.m = vec_setint1(0xFFFFFFFF);
+    int2_mask2.m = vec_setint1(0xFFFFFFFF);
+#else
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
+
+    int2_mask.m = vec_setint1(0xFFFFFFFF);
+    int2_mask2.m = vec_setint1(0xFFFFFFFF);
+#endif
+
+    /* Perform interaction with 2 vectors. */
+    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
+      runner_iact_nonsym_2_vec_density(
+          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
+          &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
+          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
+          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
+          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
+#ifdef HAVE_AVX512_F
+          knl_mask, knl_mask2);
+#else
+          0, 0);
+#endif
+    }
+
+    for (int pjd = 0; pjd < icount_align2; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
+      runner_iact_nonsym_2_vec_density(
+          &int_cache2.r2q[pjd], &int_cache2.dxq[pjd], &int_cache2.dyq[pjd],
+          &int_cache2.dzq[pjd], v_hi_inv2, v_vix2, v_viy2, v_viz2,
+          &int_cache2.vxq[pjd], &int_cache2.vyq[pjd], &int_cache2.vzq[pjd],
+          &int_cache2.mq[pjd], &rhoSum2, &rho_dhSum2, &wcountSum2,
+          &wcount_dhSum2, &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2,
+          int2_mask, int2_mask2,
+#ifdef HAVE_AVX512_F
+          knl_mask, knl_mask2);
+#else
+          0, 0);
+#endif
+    }
+    /* Perform horizontal adds on vector sums and store result in particle pi.
+     */
+    VEC_HADD(rhoSum, pi->rho);
+    VEC_HADD(rho_dhSum, pi->density.rho_dh);
+    VEC_HADD(wcountSum, pi->density.wcount);
+    VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
+    VEC_HADD(div_vSum, pi->density.div_v);
+    VEC_HADD(curlvxSum, pi->density.rot_v[0]);
+    VEC_HADD(curlvySum, pi->density.rot_v[1]);
+    VEC_HADD(curlvzSum, pi->density.rot_v[2]);
+
+    VEC_HADD(rhoSum2, pi2->rho);
+    VEC_HADD(rho_dhSum2, pi2->density.rho_dh);
+    VEC_HADD(wcountSum2, pi2->density.wcount);
+    VEC_HADD(wcount_dhSum2, pi2->density.wcount_dh);
+    VEC_HADD(div_vSum2, pi2->density.div_v);
+    VEC_HADD(curlvxSum2, pi2->density.rot_v[0]);
+    VEC_HADD(curlvySum2, pi2->density.rot_v[1]);
+    VEC_HADD(curlvzSum2, pi2->density.rot_v[2]);
+
+    /* Reset interaction count. */
+    icount = 0;
+    icount2 = 0;
+  } /* loop over all particles. */
+
+  TIMER_TOC(timer_doself_density);
+#endif /* WITH_VECTORIZATION */
+}
diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bb24f12cedf03ec49a5a03f92d308f92d49aa54
--- /dev/null
+++ b/src/runner_doiact_vec.h
@@ -0,0 +1,39 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+#ifndef SWIFT_RUNNER_VEC_H
+#define SWIFT_RUNNER_VEC_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Local headers */
+#include "cell.h"
+#include "engine.h"
+#include "hydro.h"
+#include "part.h"
+#include "runner.h"
+#include "timers.h"
+#include "vector.h"
+
+/* Function prototypes. */
+void runner_doself1_density_vec(struct runner *r, struct cell *restrict c);
+void runner_doself1_density_vec_2(struct runner *r, struct cell *restrict c);
+
+#endif /* SWIFT_RUNNER_VEC_H */
diff --git a/src/swift.h b/src/swift.h
index 2928c263525f57a7ee999b50547aa374b456f556..6c2bcf1811c336b7d5dd2b838b4a4f518ba34ebb 100644
--- a/src/swift.h
+++ b/src/swift.h
@@ -24,6 +24,7 @@
 
 /* Local headers. */
 #include "atomic.h"
+#include "cache.h"
 #include "cell.h"
 #include "clocks.h"
 #include "const.h"
diff --git a/src/tools.c b/src/tools.c
index e526bb1b838f6d97b72eadb4070f3f2a94938c04..f4cb86744332dd211ac07e366cd048049b2068bd 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -558,7 +558,171 @@ void shuffle_particles(struct part *parts, const int count) {
 }
 
 /**
- * @brief Computes the forces between all g-particles using the N^2 algorithm
+ * @brief Compares two values based on their relative difference: |a - b|/|a +
+ * b|
+ *
+ * @param a Value a
+ * @param b Value b
+ * @param threshold The limit on the relative difference between the two values
+ * @param absDiff Absolute difference: |a - b|
+ * @param absSum Absolute sum: |a + b|
+ * @param relDiff Relative difference: |a - b|/|a + b|
+ *
+ * @return 1 if difference found, 0 otherwise
+ */
+int compare_values(double a, double b, double threshold, double *absDiff,
+                   double *absSum, double *relDiff) {
+
+  int result = 0;
+  *absDiff = 0.0, *absSum = 0.0, *relDiff = 0.0;
+
+  *absDiff = fabs(a - b);
+  *absSum = fabs(a + b);
+  if (*absSum > 0.f) {
+    *relDiff = *absDiff / *absSum;
+  }
+
+  if (*relDiff > threshold) {
+    result = 1;
+  }
+
+  return result;
+}
+
+/**
+ * @brief Compares two particles' properties using the relative difference and a
+ * threshold.
+ *
+ * @param a Particle A
+ * @param b Particle B
+ * @param threshold The limit on the relative difference between the two values
+ *
+ * @return 1 if difference found, 0 otherwise
+ */
+int compare_particles(struct part a, struct part b, double threshold) {
+  int result = 0;
+  double absDiff = 0.0, absSum = 0.0, relDiff = 0.0;
+
+  for (int k = 0; k < 3; k++) {
+    if (compare_values(a.x[k], b.x[k], threshold, &absDiff, &absSum,
+                       &relDiff)) {
+      message(
+          "Relative difference (%e) larger than tolerance (%e) for x[%d] of "
+          "particle %lld.",
+          relDiff, threshold, k, a.id);
+      message("a = %e, b = %e", a.x[k], b.x[k]);
+      result = 1;
+    }
+  }
+  for (int k = 0; k < 3; k++) {
+    if (compare_values(a.v[k], b.v[k], threshold, &absDiff, &absSum,
+                       &relDiff)) {
+      message(
+          "Relative difference (%e) larger than tolerance (%e) for v[%d] of "
+          "particle %lld.",
+          relDiff, threshold, k, a.id);
+      message("a = %e, b = %e", a.v[k], b.v[k]);
+      result = 1;
+    }
+  }
+  for (int k = 0; k < 3; k++) {
+    if (compare_values(a.a_hydro[k], b.a_hydro[k], threshold, &absDiff, &absSum,
+                       &relDiff)) {
+      message(
+          "Relative difference (%e) larger than tolerance (%e) for a_hydro[%d] "
+          "of particle %lld.",
+          relDiff, threshold, k, a.id);
+      message("a = %e, b = %e", a.a_hydro[k], b.a_hydro[k]);
+      result = 1;
+    }
+  }
+  if (compare_values(a.rho, b.rho, threshold, &absDiff, &absSum, &relDiff)) {
+    message(
+        "Relative difference (%e) larger than tolerance (%e) for rho of "
+        "particle %lld.",
+        relDiff, threshold, a.id);
+    message("a = %e, b = %e", a.rho, b.rho);
+    result = 1;
+  }
+  if (compare_values(a.density.rho_dh, b.density.rho_dh, threshold, &absDiff,
+                     &absSum, &relDiff)) {
+    message(
+        "Relative difference (%e) larger than tolerance (%e) for rho_dh of "
+        "particle %lld.",
+        relDiff, threshold, a.id);
+    message("a = %e, b = %e", a.density.rho_dh, b.density.rho_dh);
+    result = 1;
+  }
+  if (compare_values(a.density.wcount, b.density.wcount, threshold, &absDiff,
+                     &absSum, &relDiff)) {
+    message(
+        "Relative difference (%e) larger than tolerance (%e) for wcount of "
+        "particle %lld.",
+        relDiff, threshold, a.id);
+    message("a = %e, b = %e", a.density.wcount, b.density.wcount);
+    result = 1;
+  }
+  if (compare_values(a.density.wcount_dh, b.density.wcount_dh, threshold,
+                     &absDiff, &absSum, &relDiff)) {
+    message(
+        "Relative difference (%e) larger than tolerance (%e) for wcount_dh of "
+        "particle %lld.",
+        relDiff, threshold, a.id);
+    message("a = %e, b = %e", a.density.wcount_dh, b.density.wcount_dh);
+    result = 1;
+  }
+  if (compare_values(a.force.h_dt, b.force.h_dt, threshold, &absDiff, &absSum,
+                     &relDiff)) {
+    message(
+        "Relative difference (%e) larger than tolerance (%e) for h_dt of "
+        "particle %lld.",
+        relDiff, threshold, a.id);
+    message("a = %e, b = %e", a.force.h_dt, b.force.h_dt);
+    result = 1;
+  }
+  if (compare_values(a.force.v_sig, b.force.v_sig, threshold, &absDiff, &absSum,
+                     &relDiff)) {
+    message(
+        "Relative difference (%e) larger than tolerance (%e) for v_sig of "
+        "particle %lld.",
+        relDiff, threshold, a.id);
+    message("a = %e, b = %e", a.force.v_sig, b.force.v_sig);
+    result = 1;
+  }
+  if (compare_values(a.entropy_dt, b.entropy_dt, threshold, &absDiff, &absSum,
+                     &relDiff)) {
+    message(
+        "Relative difference (%e) larger than tolerance (%e) for entropy_dt of "
+        "particle %lld.",
+        relDiff, threshold, a.id);
+    message("a = %e, b = %e", a.entropy_dt, b.entropy_dt);
+    result = 1;
+  }
+  if (compare_values(a.density.div_v, b.density.div_v, threshold, &absDiff,
+                     &absSum, &relDiff)) {
+    message(
+        "Relative difference (%e) larger than tolerance (%e) for div_v of "
+        "particle %lld.",
+        relDiff, threshold, a.id);
+    message("a = %e, b = %e", a.density.div_v, b.density.div_v);
+    result = 1;
+  }
+  for (int k = 0; k < 3; k++) {
+    if (compare_values(a.density.rot_v[k], b.density.rot_v[k], threshold,
+                       &absDiff, &absSum, &relDiff)) {
+      message(
+          "Relative difference (%e) larger than tolerance (%e) for rot_v[%d] "
+          "of particle %lld.",
+          relDiff, threshold, k, a.id);
+      message("a = %e, b = %e", a.density.rot_v[k], b.density.rot_v[k]);
+      result = 1;
+    }
+  }
+
+  return result;
+}
+
+/** @brief Computes the forces between all g-particles using the N^2 algorithm
  *
  * Overwrites the accelerations of the gparts with the values.
  * Do not use for actual runs.
diff --git a/src/tools.h b/src/tools.h
index 43ddd946c3e8cdf53139bb917135dffd8a8acd12..ece3078dce7cc8ab4b15538a1e5d9a990d81b36d 100644
--- a/src/tools.h
+++ b/src/tools.h
@@ -47,4 +47,8 @@ void shuffle_particles(struct part *parts, const int count);
 void gravity_n2(struct gpart *gparts, const int gcount,
                 const struct phys_const *constants, float rlr);
 
+int compare_values(double a, double b, double threshold, double *absDiff,
+                   double *absSum, double *relDiff);
+int compare_particles(struct part a, struct part b, double threshold);
+
 #endif /* SWIFT_TOOL_H */
diff --git a/src/vector.h b/src/vector.h
index 53869fd2594227d3332d7435f47cdff7cded224b..b2b3049f6abdf2a8464cb516147b050a6f4fc3e1 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -46,19 +46,33 @@
 #define VEC_FLOAT __m512
 #define VEC_DBL __m512d
 #define VEC_INT __m512i
+#define KNL_MASK_16 __mmask16
 #define vec_load(a) _mm512_load_ps(a)
+#define vec_store(a, addr) _mm512_store_ps(addr, a)
+#define vec_setzero() _mm512_setzero_ps()
+#define vec_setintzero() _mm512_setzero_epi32()
 #define vec_set1(a) _mm512_set1_ps(a)
+#define vec_setint1(a) _mm512_set1_epi32(a)
 #define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
   _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)
 #define vec_dbl_set(a, b, c, d, e, f, g, h) \
   _mm512_set_pd(h, g, f, e, d, c, b, a)
+#define vec_add(a, b) _mm512_add_ps(a, b)
+#define vec_sub(a, b) _mm512_sub_ps(a, b)
+#define vec_mul(a, b) _mm512_mul_ps(a, b)
+#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
 #define vec_sqrt(a) _mm512_sqrt_ps(a)
-#define vec_rcp(a) _mm512_rcp_ps(a)
-#define vec_rsqrt(a) _mm512_rsqrt_ps(a)
+#define vec_rcp(a) _mm512_rcp14_ps(a)
+#define vec_rsqrt(a) _mm512_rsqrt14_ps(a)
 #define vec_ftoi(a) _mm512_cvttps_epi32(a)
 #define vec_fmin(a, b) _mm512_min_ps(a, b)
 #define vec_fmax(a, b) _mm512_max_ps(a, b)
 #define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
+#define vec_floor(a) _mm512_floor_ps(a)
+#define vec_cmp_gt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ)
+#define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
+#define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
+#define vec_and(a, b) _mm512_and_ps(a, b)
 #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
@@ -86,15 +100,28 @@
     .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
     .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a                      \
   }
+#define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v)
+#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
+  pack += __builtin_popcount(mask);
+#define VEC_LEFT_PACK(a, mask, result) \
+  _mm512_mask_compressstoreu_ps(result, mask, a)
 #elif defined(HAVE_AVX)
 #define VEC_SIZE 8
 #define VEC_FLOAT __m256
 #define VEC_DBL __m256d
 #define VEC_INT __m256i
 #define vec_load(a) _mm256_load_ps(a)
+#define vec_store(a, addr) _mm256_store_ps(addr, a)
+#define vec_unaligned_store(a, addr) _mm256_storeu_ps(addr, a)
+#define vec_setzero() _mm256_setzero_ps()
+#define vec_setintzero() _mm256_setzero_si256()
 #define vec_set1(a) _mm256_set1_ps(a)
+#define vec_setint1(a) _mm256_set1_epi32(a)
 #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
 #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
+#define vec_add(a, b) _mm256_add_ps(a, b)
+#define vec_sub(a, b) _mm256_sub_ps(a, b)
+#define vec_mul(a, b) _mm256_mul_ps(a, b)
 #define vec_sqrt(a) _mm256_sqrt_ps(a)
 #define vec_rcp(a) _mm256_rcp_ps(a)
 #define vec_rsqrt(a) _mm256_rsqrt_ps(a)
@@ -102,6 +129,12 @@
 #define vec_fmin(a, b) _mm256_min_ps(a, b)
 #define vec_fmax(a, b) _mm256_max_ps(a, b)
 #define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
+#define vec_floor(a) _mm256_floor_ps(a)
+#define vec_cmp_lt(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
+#define vec_cmp_gt(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
+#define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
+#define vec_cmp_result(a) _mm256_movemask_ps(a)
+#define vec_and(a, b) _mm256_and_ps(a, b)
 #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
@@ -118,9 +151,63 @@
     .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
     .f[6] = a, .f[7] = a                                              \
   }
+#define VEC_HADD(a, b)            \
+  a.v = _mm256_hadd_ps(a.v, a.v); \
+  a.v = _mm256_hadd_ps(a.v, a.v); \
+  b += a.f[0] + a.f[4];
+#define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
+#define VEC_GET_HIGH(a) _mm256_extractf128_ps(a, 1)
 #ifdef HAVE_AVX2
+#define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c)
+#define identity_indices 0x0706050403020100
 #define VEC_HAVE_GATHER
 #define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
+#define VEC_FORM_PACKED_MASK(mask, v_mask, pack)                               \
+  {                                                                            \
+    unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101);         \
+    expanded_mask *= 0xFF;                                                     \
+    unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask); \
+    __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);                       \
+    v_mask = _mm256_cvtepu8_epi32(bytevec);                                    \
+    pack += __builtin_popcount(mask);                                          \
+  }
+#define VEC_LEFT_PACK(a, mask, result) \
+  vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask), result)
+#endif
+#ifndef vec_fma
+#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
+#endif
+#ifndef VEC_FORM_PACKED_MASK
+#define VEC_FORM_PACKED_MASK(mask, v_mask, pack)   \
+  {                                                \
+    for (int i = 0; i < VEC_SIZE; i++)             \
+      if ((mask & (1 << i))) v_mask.i[pack++] = i; \
+  }
+#define VEC_FORM_PACKED_MASK_2(mask, v_mask, pack, mask2, v_mask2, pack2) \
+  {                                                                       \
+    for (int i = 0; i < VEC_SIZE; i++) {                                  \
+      if ((mask & (1 << i))) v_mask.i[pack++] = i;                        \
+      if ((mask2 & (1 << i))) v_mask2.i[pack2++] = i;                     \
+    }                                                                     \
+  }
+#endif
+#ifndef VEC_LEFT_PACK
+#define VEC_LEFT_PACK(a, mask, result)                                     \
+  {                                                                        \
+    __m256 t1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1));       \
+    __m256 t2 = _mm256_insertf128_ps(t1, _mm256_castps256_ps128(a), 1);    \
+    __m256 r0 = _mm256_permutevar_ps(a, mask);                             \
+    __m256 r1 = _mm256_permutevar_ps(t2, mask);                            \
+    __m128i k1 = _mm_slli_epi32(                                           \
+        (__m128i)(_mm_xor_si128((__m128i)VEC_GET_HIGH((__m256)mask),       \
+                                (__m128i)_mm_set1_epi32(4))),              \
+        29);                                                               \
+    __m128i k0 = _mm_slli_epi32((__m128i)(VEC_GET_LOW((__m256)mask)), 29); \
+    __m256 kk =                                                            \
+        _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(k0)), \
+                             _mm_castsi128_ps(k1), 1);                     \
+    *((__m256 *)(result)) = _mm256_blendv_ps(r0, r1, kk);                  \
+  }
 #endif
 #elif defined(HAVE_SSE2)
 #define VEC_SIZE 4
@@ -128,6 +215,7 @@
 #define VEC_DBL __m128d
 #define VEC_INT __m128i
 #define vec_load(a) _mm_load_ps(a)
+#define vec_store(a, addr) _mm_store_ps(addr, a)
 #define vec_set1(a) _mm_set1_ps(a)
 #define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a)
 #define vec_dbl_set(a, b) _mm_set_pd(b, a)
@@ -138,6 +226,10 @@
 #define vec_fmin(a, b) _mm_min_ps(a, b)
 #define vec_fmax(a, b) _mm_max_ps(a, b)
 #define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
+#define vec_floor(a) _mm_floor_ps(a)
+#define vec_cmp_lt(a, b) _mm_cmplt_ps(a, b)
+#define vec_cmp_lte(a, b) _mm_cmp_ps(a, b, _CMP_LE_OQ)
+#define vec_cmp_result(a) _mm_movemask_ps(a)
 #define vec_todbl_lo(a) _mm_cvtps_pd(a)
 #define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a))
 #define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b))
@@ -165,6 +257,45 @@ typedef union {
   int i[VEC_SIZE];
 } vector;
 
+/**
+ * @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a
+ * Newton iteration to obtain the correct level of accuracy.
+ *
+ * @param x #vector to be inverted.
+ * @return x_inv #vector inverted x.
+ */
+__attribute__((always_inline)) INLINE vector vec_reciprocal(vector x) {
+
+  vector x_inv;
+
+  x_inv.v = vec_rcp(x.v);
+  x_inv.v = vec_sub(x_inv.v,
+                    vec_mul(x_inv.v, (vec_fma(x.v, x_inv.v, vec_set1(-1.0f)))));
+
+  return x_inv;
+}
+
+/**
+ * @brief Calculates the inverse and square root (\f$1/\sqrt{x}\f$) of a vector
+ * using intrinsics and a Newton iteration to obtain the correct level of
+ * accuracy.
+ *
+ * @param x #vector to be inverted.
+ * @return x_inv #vector inverted x.
+ */
+__attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) {
+
+  vector x_inv;
+
+  x_inv.v = vec_rsqrt(x.v);
+  x_inv.v = vec_sub(
+      x_inv.v,
+      vec_mul(vec_mul(vec_set1(0.5f), x_inv.v),
+              (vec_fma(x.v, vec_mul(x_inv.v, x_inv.v), vec_set1(-1.0f)))));
+
+  return x_inv;
+}
+
 #else
 /* Needed for cache alignment. */
 #define VEC_SIZE 16
diff --git a/tests/test27cells.c b/tests/test27cells.c
index ec9325322d9cb7531e3c1916e81323fa86f6bb60..f645d0e2097990f8e3e3de13d5b108d3cf921bc9 100644
--- a/tests/test27cells.c
+++ b/tests/test27cells.c
@@ -30,6 +30,23 @@
 /* Local headers. */
 #include "swift.h"
 
+#define ACC_THRESHOLD 1e-5
+
+#if defined(WITH_VECTORIZATION) && defined(DOSELF1_VEC)
+#define DOSELF1 runner_doself1_density_vec
+#define DOSELF1_NAME "runner_doself1_density_vec"
+#endif
+
+#if defined(WITH_VECTORIZATION) && defined(DOSELF1_VEC_2)
+#define DOSELF1 runner_doself1_density_vec_2
+#define DOSELF1_NAME "runner_doself1_density_vec_2"
+#endif
+
+#ifndef DOSELF1
+#define DOSELF1 runner_doself1_density
+#define DOSELF1_NAME "runner_doself1_density"
+#endif
+
 enum velocity_types {
   velocity_zero,
   velocity_random,
@@ -255,15 +272,41 @@ void dump_particle_fields(char *fileName, struct cell *main_cell,
   fclose(file);
 }
 
+/**
+ * @brief Compares the vectorised result against
+ * the serial result of the interaction.
+ *
+ * @param serial_parts Particle array that has been interacted serially
+ * @param vec_parts Particle array to be interacted using vectors
+ * @param count No. of particles that have been interacted
+ * @param threshold Level of accuracy needed
+ *
+ * @return Non-zero value if difference found, 0 otherwise
+ */
+int check_results(struct part *serial_parts, struct part *vec_parts, int count,
+                  double threshold) {
+  int result = 0;
+
+  for (int i = 0; i < count; i++)
+    result += compare_particles(serial_parts[i], vec_parts[i], threshold);
+
+  return result;
+}
+
 /* Just a forward declaration... */
 void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
 void runner_doself1_density(struct runner *r, struct cell *ci);
+void runner_doself1_density_vec(struct runner *r, struct cell *ci);
+void runner_doself1_density_vec_2(struct runner *r, struct cell *ci);
 
 /* And go... */
 int main(int argc, char *argv[]) {
+
+  engine_pin();
   size_t runs = 0, particles = 0;
   double h = 1.23485, size = 1., rho = 1.;
   double perturbation = 0.;
+  double threshold = ACC_THRESHOLD;
   char outputFileNameExtension[200] = "";
   char outputFileName[200] = "";
   enum velocity_types vel = velocity_zero;
@@ -279,7 +322,7 @@ int main(int argc, char *argv[]) {
   srand(0);
 
   char c;
-  while ((c = getopt(argc, argv, "m:s:h:n:r:t:d:f:v:")) != -1) {
+  while ((c = getopt(argc, argv, "m:s:h:n:r:t:d:f:v:a:")) != -1) {
     switch (c) {
       case 'h':
         sscanf(optarg, "%lf", &h);
@@ -305,6 +348,9 @@ int main(int argc, char *argv[]) {
       case 'v':
         sscanf(optarg, "%d", (int *)&vel);
         break;
+      case 'a':
+        sscanf(optarg, "%lf", &threshold);
+        break;
       case '?':
         error("Unknown option.");
         break;
@@ -330,6 +376,8 @@ int main(int argc, char *argv[]) {
   }
 
   /* Help users... */
+  message("Function called: %s", DOSELF1_NAME);
+  message("Vector size: %d", VEC_SIZE);
   message("Adiabatic index: ga = %f", hydro_gamma);
   message("Hydro implementation: %s", SPH_IMPLEMENTATION);
   message("Smoothing length: h = %f", h * size);
@@ -372,6 +420,9 @@ int main(int argc, char *argv[]) {
   /* Store the main cell for future use */
   main_cell = cells[13];
 
+  ticks timings[27];
+  for (int i = 0; i < 27; i++) timings[i] = 0;
+
   ticks time = 0;
   for (size_t i = 0; i < runs; ++i) {
     /* Zero the fields */
@@ -382,12 +433,30 @@ int main(int argc, char *argv[]) {
 #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION))
 
     /* Run all the pairs */
-    for (int j = 0; j < 27; ++j)
-      if (cells[j] != main_cell)
+    for (int j = 0; j < 27; ++j) {
+      if (cells[j] != main_cell) {
+        const ticks sub_tic = getticks();
+
         runner_dopair1_density(&runner, main_cell, cells[j]);
 
-    /* And now the self-interaction */
-    runner_doself1_density(&runner, main_cell);
+        const ticks sub_toc = getticks();
+        timings[j] += sub_toc - sub_tic;
+      }
+    }
+
+/* And now the self-interaction */
+#ifdef WITH_VECTORIZATION
+    runner.par_cache.count = 0;
+    cache_init(&runner.par_cache, 512);
+#endif
+
+    const ticks self_tic = getticks();
+
+    DOSELF1(&runner, main_cell);
+
+    const ticks self_toc = getticks();
+
+    timings[13] += self_toc - self_tic;
 
 #endif
 
@@ -405,8 +474,26 @@ int main(int argc, char *argv[]) {
     }
   }
 
+  /* Store the vectorised particle results. */
+  struct part vec_parts[main_cell->count];
+  for (int i = 0; i < main_cell->count; i++) vec_parts[i] = main_cell->parts[i];
+
   /* Output timing */
-  message("SWIFT calculation took       : %15lli ticks.", time / runs);
+  ticks corner_time = timings[0] + timings[2] + timings[6] + timings[8] +
+                      timings[18] + timings[20] + timings[24] + timings[26];
+
+  ticks edge_time = timings[1] + timings[3] + timings[5] + timings[7] +
+                    timings[9] + timings[11] + timings[15] + timings[17] +
+                    timings[19] + timings[21] + timings[23] + timings[25];
+
+  ticks face_time = timings[4] + timings[10] + timings[12] + timings[14] +
+                    timings[16] + timings[22];
+
+  message("Corner calculations took       : %15lli ticks.", corner_time / runs);
+  message("Edge calculations took         : %15lli ticks.", edge_time / runs);
+  message("Face calculations took         : %15lli ticks.", face_time / runs);
+  message("Self calculations took         : %15lli ticks.", timings[13] / runs);
+  message("SWIFT calculation took         : %15lli ticks.", time / runs);
 
   /* Now perform a brute-force version for accuracy tests */
 
@@ -435,6 +522,10 @@ int main(int argc, char *argv[]) {
   sprintf(outputFileName, "brute_force_27_%s.dat", outputFileNameExtension);
   dump_particle_fields(outputFileName, main_cell, cells);
 
+  /* Check serial results against the vectorised results. */
+  if (check_results(main_cell->parts, vec_parts, main_cell->count, threshold))
+    message("Differences found...");
+
   /* Output timing */
   message("Brute force calculation took : %15lli ticks.", toc - tic);
 
diff --git a/tests/test27cells.sh.in b/tests/test27cells.sh.in
index bf9cfeaf9a70790a321fa7ec4c63983d8cfd866c..07b6b92a82cee2bbe9c593f8f62e750d4406f84e 100755
--- a/tests/test27cells.sh.in
+++ b/tests/test27cells.sh.in
@@ -6,7 +6,7 @@ do
 
     rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat
 
-    ./test27cells -n 6 -r 1 -d 0 -f standard -v $v
+    ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -a 1e-4
 
     if [ -e brute_force_27_standard.dat ]
     then
diff --git a/tests/test27cellsPerturbed.sh.in b/tests/test27cellsPerturbed.sh.in
index 3cdaf79ab17e705ec69a0b646949cc5a71109796..30498594b659101216b51dfea2346fa9230dbc97 100755
--- a/tests/test27cellsPerturbed.sh.in
+++ b/tests/test27cellsPerturbed.sh.in
@@ -6,7 +6,7 @@ do
 
     rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat
 
-    ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v
+    ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -a 5e-4
 
     if [ -e brute_force_27_perturbed.dat ]
     then
diff --git a/tests/testInteractions.c b/tests/testInteractions.c
index 4faebbbdb35c4bc36cca16cb207517f05629f8c1..4ce7fe40554d24551750629fa47c0bee7acdb6da 100644
--- a/tests/testInteractions.c
+++ b/tests/testInteractions.c
@@ -30,6 +30,9 @@ int main() { return 0; }
 #include <unistd.h>
 #include "swift.h"
 
+#define array_align sizeof(float) * VEC_SIZE
+#define ACC_THRESHOLD 1e-5
+
 /* Typdef function pointers for serial and vectorised versions of the
  * interaction functions. */
 typedef void (*serial_interaction)(float, float *, float, float, struct part *,
@@ -48,8 +51,8 @@ typedef void (*vec_interaction)(float *, float *, float *, float *,
  *separation.
  * @param partId The running counter of IDs.
  */
-struct part *make_particles(int count, double *offset, double spacing, double h,
-                            long long *partId) {
+struct part *make_particles(size_t count, double *offset, double spacing,
+                            double h, long long *partId) {
 
   struct part *particles;
   if (posix_memalign((void **)&particles, part_align,
@@ -60,11 +63,28 @@ struct part *make_particles(int count, double *offset, double spacing, double h,
 
   /* Construct the particles */
   struct part *p;
-  for (size_t i = 0; i < VEC_SIZE + 1; ++i) {
+
+  /* Set test particle at centre of unit sphere. */
+  p = &particles[0];
+
+  /* Place the test particle at the centre of a unit sphere. */
+  p->x[0] = 0.0f;
+  p->x[1] = 0.0f;
+  p->x[2] = 0.0f;
+
+  p->h = h;
+  p->id = ++(*partId);
+  p->mass = 1.0f;
+
+  /* Place rest of particles around the test particle
+   * with random position within a unit sphere. */
+  for (size_t i = 1; i < count; ++i) {
     p = &particles[i];
-    p->x[0] = offset[0] + spacing * i;
-    p->x[1] = offset[1] + spacing * i;
-    p->x[2] = offset[2] + spacing * i;
+
+    /* Randomise positions within a unit sphere. */
+    p->x[0] = random_uniform(-1.0, 1.0);
+    p->x[1] = random_uniform(-1.0, 1.0);
+    p->x[2] = random_uniform(-1.0, 1.0);
 
     /* Randomise velocities. */
     p->v[0] = random_uniform(-0.05, 0.05);
@@ -81,20 +101,17 @@ struct part *make_particles(int count, double *offset, double spacing, double h,
 /**
  * @brief Populates particle properties needed for the force calculation.
  */
-void prepare_force(struct part *parts) {
+void prepare_force(struct part *parts, size_t count) {
 
   struct part *p;
-  for (size_t i = 0; i < VEC_SIZE + 1; ++i) {
+  for (size_t i = 0; i < count; ++i) {
     p = &parts[i];
     p->rho = i + 1;
-#if defined(GADGET2_SPH)
-    p->force.balsara = i + 1;
-    p->force.P_over_rho2 = i + 1;
-#elif defined(DEFAULT_SPH)
-    p->force.balsara = i + 1;
+    p->force.balsara = random_uniform(0.0, 1.0);
     p->force.P_over_rho2 = i + 1;
-#else
-#endif
+    p->force.soundspeed = random_uniform(2.0, 3.0);
+    p->force.v_sig = 0.0f;
+    p->force.h_dt = 0.0f;
   }
 }
 
@@ -106,7 +123,7 @@ void dump_indv_particle_fields(char *fileName, struct part *p) {
   FILE *file = fopen(fileName, "a");
 
   fprintf(file,
-          "%6llu %10f %10f %10f %10f %10f %10f %10f %10f %10f %13e %13e %13e "
+          "%6llu %10f %10f %10f %10f %10f %10f %10e %10e %10e %13e %13e %13e "
           "%13e %13e %13e %13e "
           "%13e %13e %13e %10f\n",
           p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2],
@@ -120,7 +137,8 @@ void dump_indv_particle_fields(char *fileName, struct part *p) {
           p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
           p->density.rot_v[2], 0.
 #else
-          0., 0., 0., 0., 0.
+          p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
+          p->density.rot_v[2]
 #endif
           );
   fclose(file);
@@ -140,27 +158,52 @@ void write_header(char *fileName) {
           "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "a_x", "a_y",
           "a_z", "rho", "rho_dh", "wcount", "wcount_dh", "dh/dt", "v_sig",
           "div_v", "curl_vx", "curl_vy", "curl_vz", "dS/dt");
-  fprintf(file, "\nPARTICLES BEFORE INTERACTION:\n");
+  fprintf(file, "\n# PARTICLES BEFORE INTERACTION:\n");
   fclose(file);
 }
 
 /**
- * @brief Calls the serial and vectorised version of the non-symmetrical density
- * interaction.
+ * @brief Compares the vectorised result against
+ * the serial result of the interaction.
+ *
+ * @param serial_test_part Particle that has been updated serially
+ * @param serial_parts Particle array that has been interacted serially
+ * @param vec_test_part Particle that has been updated using vectors
+ * @param vec_parts Particle array to be interacted using vectors
+ * @param count No. of particles that have been interacted
+ *
+ * @return Non-zero value if difference found, 0 otherwise
+ */
+int check_results(struct part serial_test_part, struct part *serial_parts,
+                  struct part vec_test_part, struct part *vec_parts,
+                  int count) {
+  int result = 0;
+  result += compare_particles(serial_test_part, vec_test_part, ACC_THRESHOLD);
+
+  for (int i = 0; i < count; i++)
+    result += compare_particles(serial_parts[i], vec_parts[i], ACC_THRESHOLD);
+
+  return result;
+}
+
+/*
+ * @brief Calls the serial and vectorised version of an interaction
+ * function given by the function pointers.
  *
+ * @param test_part Particle that will be updated
  * @param parts Particle array to be interacted
  * @param count No. of particles to be interacted
- * @param serial_inter_func Function pointer to serial interaction function
- * @param vec_inter_func Function pointer to vectorised interaction function
- * @param filePrefix Prefix of output files
+ * @param serial_inter_func Serial interaction function to be called
+ * @param vec_inter_func Vectorised interaction function to be called
+ * @param runs No. of times to call interactions
  *
  */
-void test_interactions(struct part *parts, int count,
+void test_interactions(struct part test_part, struct part *parts, size_t count,
                        serial_interaction serial_inter_func,
-                       vec_interaction vec_inter_func, char *filePrefix) {
+                       vec_interaction vec_inter_func, char *filePrefix,
+                       size_t runs) {
 
-  /* Use the first particle in the array as the one that gets updated. */
-  struct part pi = parts[0];
+  ticks serial_time = 0, vec_time = 0;
 
   FILE *file;
   char serial_filename[200] = "";
@@ -174,98 +217,148 @@ void test_interactions(struct part *parts, int count,
   write_header(serial_filename);
   write_header(vec_filename);
 
-  /* Dump state of particles before serial interaction. */
-  dump_indv_particle_fields(serial_filename, &pi);
-  for (int i = 1; i < count; i++)
-    dump_indv_particle_fields(serial_filename, &parts[i]);
-
-  /* Make copy of pi to be used in vectorised version. */
-  struct part pi_vec = pi;
-  struct part pj_vec[VEC_SIZE];
-  for (int i = 0; i < VEC_SIZE; i++) pj_vec[i] = parts[i + 1];
-
-  float r2q[VEC_SIZE] __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-  float hiq[VEC_SIZE] __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-  float hjq[VEC_SIZE] __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-  float dxq[3 * VEC_SIZE] __attribute__((aligned(sizeof(float) * VEC_SIZE)));
-  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
-
-  /* Perform serial interaction */
-  for (int i = 1; i < count; i++) {
-    /* Compute the pairwise distance. */
-    float r2 = 0.0f;
-    float dx[3];
-    for (int k = 0; k < 3; k++) {
-      dx[k] = pi.x[k] - parts[i].x[k];
-      r2 += dx[k] * dx[k];
+  /* Test particle at the center of a unit sphere. */
+  struct part pi_serial, pi_vec;
+
+  /* Remaining particles in the sphere that will interact with test particle. */
+  struct part pj_serial[count], pj_vec[count];
+
+  /* Stores the separation, smoothing length and pointers to particles
+   * needed for the vectorised interaction. */
+  float r2q[count] __attribute__((aligned(array_align)));
+  float hiq[count] __attribute__((aligned(array_align)));
+  float hjq[count] __attribute__((aligned(array_align)));
+  float dxq[3 * count] __attribute__((aligned(array_align)));
+  struct part *piq[count], *pjq[count];
+
+  /* Call serial interaction a set number of times. */
+  for (size_t k = 0; k < runs; k++) {
+    /* Reset particle to initial setup */
+    pi_serial = test_part;
+    for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i];
+
+    /* Only dump data on first run. */
+    if (k == 0) {
+      /* Dump state of particles before serial interaction. */
+      dump_indv_particle_fields(serial_filename, &pi_serial);
+      for (size_t i = 0; i < count; i++)
+        dump_indv_particle_fields(serial_filename, &pj_serial[i]);
     }
 
-    serial_inter_func(r2, dx, pi.h, parts[i].h, &pi, &parts[i]);
+    /* Perform serial interaction */
+    for (size_t i = 0; i < count; i++) {
+      /* Compute the pairwise distance. */
+      float r2 = 0.0f;
+      float dx[3];
+      for (size_t k = 0; k < 3; k++) {
+        dx[k] = pi_serial.x[k] - pj_serial[i].x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      const ticks tic = getticks();
+
+      serial_inter_func(r2, dx, pi_serial.h, pj_serial[i].h, &pi_serial,
+                        &pj_serial[i]);
+
+      serial_time += getticks() - tic;
+    }
   }
 
   file = fopen(serial_filename, "a");
-  fprintf(file, "\nPARTICLES AFTER INTERACTION:\n");
+  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
   fclose(file);
 
   /* Dump result of serial interaction. */
-  dump_indv_particle_fields(serial_filename, &pi);
-  for (int i = 1; i < count; i++)
-    dump_indv_particle_fields(serial_filename, &parts[i]);
-
-  /* Setup arrays for vector interaction. */
-  for (int i = 0; i < VEC_SIZE; i++) {
-    /* Compute the pairwise distance. */
-    float r2 = 0.0f;
-    float dx[3];
-    for (int k = 0; k < 3; k++) {
-      dx[k] = pi_vec.x[k] - pj_vec[i].x[k];
-      r2 += dx[k] * dx[k];
+  dump_indv_particle_fields(serial_filename, &pi_serial);
+  for (size_t i = 0; i < count; i++)
+    dump_indv_particle_fields(serial_filename, &pj_serial[i]);
+
+  /* Call vector interaction a set number of times. */
+  for (size_t k = 0; k < runs; k++) {
+    /* Reset particle to initial setup */
+    pi_vec = test_part;
+    for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i];
+
+    /* Setup arrays for vector interaction. */
+    for (size_t i = 0; i < count; i++) {
+      /* Compute the pairwise distance. */
+      float r2 = 0.0f;
+      float dx[3];
+      for (size_t k = 0; k < 3; k++) {
+        dx[k] = pi_vec.x[k] - pj_vec[i].x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      r2q[i] = r2;
+      dxq[3 * i + 0] = dx[0];
+      dxq[3 * i + 1] = dx[1];
+      dxq[3 * i + 2] = dx[2];
+      hiq[i] = pi_vec.h;
+      hjq[i] = pj_vec[i].h;
+      piq[i] = &pi_vec;
+      pjq[i] = &pj_vec[i];
     }
-    r2q[i] = r2;
-    dxq[3 * i + 0] = dx[0];
-    dxq[3 * i + 1] = dx[1];
-    dxq[3 * i + 2] = dx[2];
-    hiq[i] = pi_vec.h;
-    hjq[i] = pj_vec[i].h;
-    piq[i] = &pi_vec;
-    pjq[i] = &pj_vec[i];
-  }
 
-  /* Dump state of particles before vector interaction. */
-  dump_indv_particle_fields(vec_filename, piq[0]);
-  for (size_t i = 0; i < VEC_SIZE; i++)
-    dump_indv_particle_fields(vec_filename, pjq[i]);
+    /* Only dump data on first run. */
+    if (k == 0) {
+      /* Dump state of particles before vector interaction. */
+      dump_indv_particle_fields(vec_filename, piq[0]);
+      for (size_t i = 0; i < count; i++)
+        dump_indv_particle_fields(vec_filename, pjq[i]);
+    }
 
-  /* Perform vector interaction. */
-  vec_inter_func(r2q, dxq, hiq, hjq, piq, pjq);
+    const ticks vec_tic = getticks();
+
+    /* Perform vector interaction. */
+    for (size_t i = 0; i < count; i += VEC_SIZE) {
+      vec_inter_func(&(r2q[i]), &(dxq[3 * i]), &(hiq[i]), &(hjq[i]), &(piq[i]),
+                     &(pjq[i]));
+    }
+
+    vec_time += getticks() - vec_tic;
+  }
 
   file = fopen(vec_filename, "a");
-  fprintf(file, "\nPARTICLES AFTER INTERACTION:\n");
+  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
   fclose(file);
 
-  /* Dump result of serial interaction. */
+  /* Dump result of vector interaction. */
   dump_indv_particle_fields(vec_filename, piq[0]);
-  for (size_t i = 0; i < VEC_SIZE; i++)
+  for (size_t i = 0; i < count; i++)
     dump_indv_particle_fields(vec_filename, pjq[i]);
+
+  /* Check serial results against the vectorised results. */
+  if (check_results(pi_serial, pj_serial, pi_vec, pj_vec, count))
+    message("Differences found...");
+
+  message("The serial interactions took     : %15lli ticks.",
+          serial_time / runs);
+  message("The vectorised interactions took : %15lli ticks.", vec_time / runs);
 }
 
 /* And go... */
 int main(int argc, char *argv[]) {
-  double h = 1.2348, spacing = 0.5;
+  size_t runs = 10000;
+  double h = 1.0, spacing = 0.5;
   double offset[3] = {0.0, 0.0, 0.0};
-  int count = VEC_SIZE + 1;
+  size_t count = 256;
 
   /* Get some randomness going */
   srand(0);
 
   char c;
-  while ((c = getopt(argc, argv, "s:h:")) != -1) {
+  while ((c = getopt(argc, argv, "h:s:n:r:")) != -1) {
     switch (c) {
       case 'h':
         sscanf(optarg, "%lf", &h);
         break;
       case 's':
         sscanf(optarg, "%lf", &spacing);
+      case 'n':
+        sscanf(optarg, "%zu", &count);
+        break;
+      case 'r':
+        sscanf(optarg, "%zu", &runs);
         break;
       case '?':
         error("Unknown option.");
@@ -281,26 +374,35 @@ int main(int argc, char *argv[]) {
         "runner_iact_vec_density."
         "\n\nOptions:"
         "\n-h DISTANCE=1.2348 - Smoothing length in units of <x>"
-        "\n-s spacing         - Spacing between particles",
+        "\n-s SPACING=0.5     - Spacing between particles"
+        "\n-n NUMBER=9        - No. of particles",
         argv[0]);
     exit(1);
   }
 
+  /* Correct count so that VEC_SIZE of particles interact with the test
+   * particle. */
+  count = count - (count % VEC_SIZE) + 1;
+
   /* Build the infrastructure */
   static long long partId = 0;
+  struct part density_test_particle, force_test_particle;
   struct part *density_particles =
       make_particles(count, offset, spacing, h, &partId);
   struct part *force_particles =
       make_particles(count, offset, spacing, h, &partId);
-  prepare_force(force_particles);
+  prepare_force(force_particles, count);
 
   /* Define which interactions to call */
   serial_interaction serial_inter_func = &runner_iact_nonsym_density;
   vec_interaction vec_inter_func = &runner_iact_nonsym_vec_density;
 
+  density_test_particle = density_particles[0];
   /* Call the non-sym density test. */
-  test_interactions(density_particles, count, serial_inter_func, vec_inter_func,
-                    "test_nonsym_density");
+  message("Testing non-symmetrical density interaction...");
+  test_interactions(density_test_particle, &density_particles[1], count - 1,
+                    serial_inter_func, vec_inter_func, "test_nonsym_density",
+                    runs);
 
   density_particles = make_particles(count, offset, spacing, h, &partId);
 
@@ -308,28 +410,36 @@ int main(int argc, char *argv[]) {
   serial_inter_func = &runner_iact_density;
   vec_inter_func = &runner_iact_vec_density;
 
+  density_test_particle = density_particles[0];
   /* Call the symmetrical density test. */
-  test_interactions(density_particles, count, serial_inter_func, vec_inter_func,
-                    "test_sym_density");
+  message("Testing symmetrical density interaction...");
+  test_interactions(density_test_particle, &density_particles[1], count - 1,
+                    serial_inter_func, vec_inter_func, "test_sym_density",
+                    runs);
 
   /* Re-assign function pointers. */
   serial_inter_func = &runner_iact_nonsym_force;
   vec_inter_func = &runner_iact_nonsym_vec_force;
 
+  force_test_particle = force_particles[0];
   /* Call the test non-sym force test. */
-  test_interactions(force_particles, count, serial_inter_func, vec_inter_func,
-                    "test_nonsym_force");
+  message("Testing non-symmetrical force interaction...");
+  test_interactions(force_test_particle, &force_particles[1], count - 1,
+                    serial_inter_func, vec_inter_func, "test_nonsym_force",
+                    runs);
 
   force_particles = make_particles(count, offset, spacing, h, &partId);
-  prepare_force(force_particles);
+  prepare_force(force_particles, count);
 
   /* Re-assign function pointers. */
   serial_inter_func = &runner_iact_force;
   vec_inter_func = &runner_iact_vec_force;
 
+  force_test_particle = force_particles[0];
   /* Call the test symmetrical force test. */
-  test_interactions(force_particles, count, serial_inter_func, vec_inter_func,
-                    "test_sym_force");
+  message("Testing symmetrical force interaction...");
+  test_interactions(force_test_particle, &force_particles[1], count - 1,
+                    serial_inter_func, vec_inter_func, "test_sym_force", runs);
 
   return 0;
 }
diff --git a/tests/tolerance_27_normal.dat b/tests/tolerance_27_normal.dat
index 71acaa89be231d02fc33e47c96a7bacf623bbf48..e2053261ed6675e4f0ce92866db896819eaff0c2 100644
--- a/tests/tolerance_27_normal.dat
+++ b/tests/tolerance_27_normal.dat
@@ -1,3 +1,3 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      2e-6	    2e-5       2e-3		 2e-6	     2e-6	   2e-6		 2e-6
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      2e-6	    1e-5       1e-4		 2e-5	     2e-5	   2e-5	 	 2e-5
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      2e-5	    2e-4       2e-3		 2e-6	     2e-6	   2e-6		 2e-6
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      6e-5	    1e-4       1e-4		 2e-5	     2e-5	   2e-5	 	 2e-5
diff --git a/tests/tolerance_27_perturbed.dat b/tests/tolerance_27_perturbed.dat
index 45293cbaa223b5887f3b0ce05cd9430d0db7440b..226a3d1423685656b37fa8acfe1741c859099352 100644
--- a/tests/tolerance_27_perturbed.dat
+++ b/tests/tolerance_27_perturbed.dat
@@ -1,3 +1,3 @@
 #   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1.2e-6     1e-5	    2.1e-5     2e-3		 2.1e-6	     2e-6	   2e-6		 2e-6
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      3e-3	    1e-5       1e-4		 2e-5	     4e-4	   4e-4	 	 4e-4
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1.2e-6     1e-4	    4e-5       2e-3		 3.1e-6	     2e-6	   2e-6		 2e-6
+    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      3e-3	    1e-5       1e-4		 2e-5	     2e-3	   2e-3	 	 2e-3