diff --git a/.gitignore b/.gitignore
index e3e17bb82a01d9af0ace6ed72d196cf2dba242f6..d3faf084a3e0a78b06f0bfa57742bfe8d9311a20 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,6 +59,11 @@ tests/brute_force_periodic_BC_perturbed.dat
 tests/swift_dopair_active.dat
 tests/test_nonsym_density_serial.dat
 tests/test_nonsym_density_vec.dat
+tests/test_nonsym_force_serial.dat
+tests/test_nonsym_density_1_vec.dat
+tests/test_nonsym_density_2_vec.dat
+tests/test_nonsym_force_1_vec.dat
+tests/test_nonsym_force_2_vec.dat
 tests/testGreetings
 tests/testReading
 tests/input.hdf5
diff --git a/examples/main.c b/examples/main.c
index ee1253062409ec2e787e064a5fb50da2c830d35d..a86cdc60a7473e84e8fba8c32417e7e19ebaeb60 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -677,6 +677,7 @@ int main(int argc, char *argv[]) {
 
   /* Write the state of the system before starting time integration. */
   engine_dump_snapshot(&e);
+  engine_print_stats(&e);
 
   /* Legend */
   if (myrank == 0)
@@ -815,6 +816,7 @@ int main(int argc, char *argv[]) {
 
   /* Write final output. */
   engine_drift_all(&e);
+  engine_print_stats(&e);
   engine_dump_snapshot(&e);
 
 #ifdef WITH_MPI
diff --git a/src/active.h b/src/active.h
index 58e88835b6f51ae15f9fd7270c0e1f89bbd6d61a..f9ed4a0d40127f1544c744a61b05515a19857b05 100644
--- a/src/active.h
+++ b/src/active.h
@@ -144,6 +144,14 @@ __attribute__((always_inline)) INLINE static int part_is_active(
   return (part_bin <= max_active_bin);
 }
 
+__attribute__((always_inline)) INLINE static int part_is_active_no_debug(
+    const struct part *p, const timebin_t max_active_bin) {
+
+  const timebin_t part_bin = p->time_bin;
+
+  return (part_bin <= max_active_bin);
+}
+
 /**
  * @brief Is this g-particle finishing its time-step now ?
  *
diff --git a/src/cache.h b/src/cache.h
index 170eb8f5cb0939e85647bbb4622d66cc78ef3dbe..bb72149607f496dd378a848a4270fe74d3f75ad9 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -65,6 +65,21 @@ struct cache {
   /* Maximum index into neighbouring cell for particles that are in range. */
   int *restrict max_index SWIFT_CACHE_ALIGN;
 
+  /* Particle density. */
+  float *restrict rho SWIFT_CACHE_ALIGN;
+
+  /* Particle smoothing length gradient. */
+  float *restrict grad_h SWIFT_CACHE_ALIGN;
+
+  /* Pressure over density squared. */
+  float *restrict pOrho2 SWIFT_CACHE_ALIGN;
+
+  /* Balsara switch. */
+  float *restrict balsara SWIFT_CACHE_ALIGN;
+
+  /* Particle sound speed. */
+  float *restrict soundspeed SWIFT_CACHE_ALIGN;
+
   /* Cache size. */
   int count;
 };
@@ -127,6 +142,11 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
     free(c->vz);
     free(c->h);
     free(c->max_index);
+    free(c->rho);
+    free(c->grad_h);
+    free(c->pOrho2);
+    free(c->balsara);
+    free(c->soundspeed);
   }
 
   error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
@@ -139,6 +159,15 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
   error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
   error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT,
                           sizeIntBytes);
+  error += posix_memalign((void **)&c->rho, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error +=
+      posix_memalign((void **)&c->grad_h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error +=
+      posix_memalign((void **)&c->pOrho2, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error +=
+      posix_memalign((void **)&c->balsara, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error +=
+      posix_memalign((void **)&c->soundspeed, SWIFT_CACHE_ALIGNMENT, sizeBytes);
 
   if (error != 0)
     error("Couldn't allocate cache, no. of particles: %d", (int)count);
@@ -191,6 +220,68 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
 #endif
 }
 
+/**
+ * @brief Populate cache for force interactions by reading in the particles in
+ * unsorted order.
+ *
+ * @param ci The #cell.
+ * @param ci_cache The cache.
+ */
+__attribute__((always_inline)) INLINE void cache_read_force_particles(
+    const struct cell *restrict const ci,
+    struct cache *restrict const ci_cache) {
+
+#if defined(GADGET2_SPH)
+
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h,
+                            SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2,
+                            SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, balsara, ci_cache->balsara,
+                            SWIFT_CACHE_ALIGNMENT);
+  swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
+                            SWIFT_CACHE_ALIGNMENT);
+
+  const struct part *restrict parts = ci->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
+  /* Shift the particles positions to a local frame so single precision can be
+   * used instead of double precision. */
+  for (int i = 0; i < ci->count; i++) {
+    x[i] = (float)(parts[i].x[0] - loc[0]);
+    y[i] = (float)(parts[i].x[1] - loc[1]);
+    z[i] = (float)(parts[i].x[2] - loc[2]);
+    h[i] = parts[i].h;
+
+    m[i] = parts[i].mass;
+    vx[i] = parts[i].v[0];
+    vy[i] = parts[i].v[1];
+    vz[i] = parts[i].v[2];
+
+    rho[i] = parts[i].rho;
+    grad_h[i] = parts[i].force.f;
+    pOrho2[i] = parts[i].force.P_over_rho2;
+    balsara[i] = parts[i].force.balsara;
+    soundspeed[i] = parts[i].force.soundspeed;
+  }
+
+#endif
+}
+
 /**
  * @brief Populate caches by only reading particles that are within range of
  * each other within the adjoining cell.Also read the particles into the cache
diff --git a/src/engine.c b/src/engine.c
index 1230abf848b3b63465bd5e7d4065dec36d46e46f..93c430d611cf573c643b4cf94325fb97333381a7 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -3378,6 +3378,9 @@ void engine_print_stats(struct engine *e) {
   struct statistics global_stats = stats;
 #endif
 
+  /* Finalize operations */
+  stats_finalize(&stats);
+
   /* Print info */
   if (e->nodeID == 0)
     stats_print_to_file(e->file_stats, &global_stats, e->time);
@@ -4372,7 +4375,7 @@ void engine_init(struct engine *e, struct space *s,
   e->file_timesteps = NULL;
   e->deltaTimeStatistics =
       parser_get_param_double(params, "Statistics:delta_time");
-  e->timeLastStatistics = e->timeBegin - e->deltaTimeStatistics;
+  e->timeLastStatistics = 0;
   e->verbose = verbose;
   e->count_step = 0;
   e->wallclock_time = 0.f;
@@ -4540,10 +4543,10 @@ void engine_init(struct engine *e, struct space *s,
     e->file_stats = fopen(energyfileName, "w");
     fprintf(e->file_stats,
             "#%14s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s "
-            "%14s %14s %14s\n",
+            "%14s %14s %14s %14s %14s %14s\n",
             "Time", "Mass", "E_tot", "E_kin", "E_int", "E_pot", "E_pot_self",
             "E_pot_ext", "E_radcool", "Entropy", "p_x", "p_y", "p_z", "ang_x",
-            "ang_y", "ang_z");
+            "ang_y", "ang_z", "com_x", "com_y", "com_z");
     fflush(e->file_stats);
 
     char timestepsfileName[200] = "";
diff --git a/src/engine.h b/src/engine.h
index 90645b308529e49a370bf91f2b3d05195b2b08d8..2b5a4eee7cde974fc555460c6483d02b21bd6c04 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -261,6 +261,7 @@ void engine_unskip(struct engine *e);
 void engine_drift_all(struct engine *e);
 void engine_drift_top_multipoles(struct engine *e);
 void engine_reconstruct_multipoles(struct engine *e);
+void engine_print_stats(struct engine *e);
 void engine_dump_snapshot(struct engine *e);
 void engine_init(struct engine *e, struct space *s,
                  const struct swift_params *params, int nr_nodes, int nodeID,
diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index 9e06fec92f9667d21ce7dc196ba26f9870cb24f4..cce4fb03ee405fbdf2232184778a0f426e63014a 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -1167,4 +1167,365 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
 #endif
 }
 
+#ifdef WITH_VECTORIZATION
+static const vector const_viscosity_alpha_fac =
+    FILL_VEC(-0.25f * const_viscosity_alpha);
+
+/**
+ * @brief Force interaction computed using 1 vector
+ * (non-symmetric vectorized version).
+ */
+__attribute__((always_inline)) INLINE static void
+runner_iact_nonsym_1_vec_force(
+    vector *r2, vector *dx, vector *dy, vector *dz, vector vix, vector viy,
+    vector viz, vector pirho, vector grad_hi, vector piPOrho2, vector balsara_i,
+    vector ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj,
+    float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector hi_inv,
+    vector hj_inv, vector *a_hydro_xSum, vector *a_hydro_ySum,
+    vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum,
+    vector *entropy_dtSum, mask_t mask) {
+
+#ifdef WITH_VECTORIZATION
+
+  vector r, ri;
+  vector vjx, vjy, vjz, dvx, dvy, dvz;
+  vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj;
+  vector xi, xj;
+  vector hid_inv, hjd_inv;
+  vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
+  vector piax, piay, piaz;
+  vector pih_dt;
+  vector v_sig;
+  vector omega_ij, mu_ij, fac_mu, balsara;
+  vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
+
+  /* Fill vectors. */
+  vjx.v = vec_load(Vjx);
+  vjy.v = vec_load(Vjy);
+  vjz.v = vec_load(Vjz);
+  mj.v = vec_load(Mj);
+
+  pjrho.v = vec_load(Pjrho);
+  grad_hj.v = vec_load(Grad_hj);
+  pjPOrho2.v = vec_load(PjPOrho2);
+  balsara_j.v = vec_load(Balsara_j);
+  cj.v = vec_load(Cj);
+
+  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
+
+  /* Load stuff. */
+  balsara.v = vec_add(balsara_i.v, balsara_j.v);
+
+  /* Get the radius and inverse radius. */
+  ri = vec_reciprocal_sqrt(*r2);
+  r.v = vec_mul(r2->v, ri.v);
+
+  /* Get the kernel for hi. */
+  hid_inv = pow_dimension_plus_one_vec(hi_inv);
+  xi.v = vec_mul(r.v, hi_inv.v);
+  kernel_eval_dWdx_force_vec(&xi, &wi_dx);
+  wi_dr.v = vec_mul(hid_inv.v, wi_dx.v);
+
+  /* Get the kernel for hj. */
+  hjd_inv = pow_dimension_plus_one_vec(hj_inv);
+  xj.v = vec_mul(r.v, hj_inv.v);
+
+  /* Calculate the kernel for two particles. */
+  kernel_eval_dWdx_force_vec(&xj, &wj_dx);
+
+  wj_dr.v = vec_mul(hjd_inv.v, wj_dx.v);
+
+  /* Compute dv. */
+  dvx.v = vec_sub(vix.v, vjx.v);
+  dvy.v = vec_sub(viy.v, vjy.v);
+  dvz.v = vec_sub(viz.v, vjz.v);
+
+  /* Compute dv dot r. */
+  dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v)));
+
+  /* Compute the relative velocity. (This is 0 if the particles move away from
+   * each other and negative otherwise) */
+  omega_ij.v = vec_fmin(dvdr.v, vec_setzero());
+  mu_ij.v =
+      vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v)); /* This is 0 or negative */
+
+  /* Compute signal velocity */
+  v_sig.v = vec_fnma(vec_set1(3.f), mu_ij.v, vec_add(ci.v, cj.v));
+
+  /* Now construct the full viscosity term */
+  rho_ij.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho.v));
+  visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v,
+                           vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))),
+                   rho_ij.v);
+
+  /* Now, convolve with the kernel */
+  visc_term.v =
+      vec_mul(vec_set1(0.5f),
+              vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v)));
+
+  sph_term.v =
+      vec_mul(vec_fma(vec_mul(grad_hi.v, piPOrho2.v), wi_dr.v,
+                      vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))),
+              ri.v);
+
+  /* Eventually get the acceleration */
+  acc.v = vec_add(visc_term.v, sph_term.v);
+
+  /* Use the force, Luke! */
+  piax.v = vec_mul(mj.v, vec_mul(dx->v, acc.v));
+  piay.v = vec_mul(mj.v, vec_mul(dy->v, acc.v));
+  piaz.v = vec_mul(mj.v, vec_mul(dz->v, acc.v));
+
+  /* Get the time derivative for h. */
+  pih_dt.v =
+      vec_div(vec_mul(mj.v, vec_mul(dvdr.v, vec_mul(ri.v, wi_dr.v))), pjrho.v);
+
+  /* Change in entropy */
+  entropy_dt.v = vec_mul(mj.v, vec_mul(visc_term.v, dvdr.v));
+
+  /* Store the forces back on the particles. */
+  a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask);
+  a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask);
+  a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz.v, mask);
+  h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt.v, mask);
+  v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig.v, mask));
+  entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask);
+
+#else
+
+  error(
+      "The Gadget2 serial version of runner_iact_nonsym_force was called when "
+      "the vectorised version should have been used.");
+
+#endif
+}
+
+/**
+ * @brief Force interaction computed using 2 interleaved vectors
+ * (non-symmetric vectorized version).
+ */
+__attribute__((always_inline)) INLINE static void
+runner_iact_nonsym_2_vec_force(
+    float *R2, float *Dx, float *Dy, float *Dz, vector vix, vector viy,
+    vector viz, vector pirho, vector grad_hi, vector piPOrho2, vector balsara_i,
+    vector ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj,
+    float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector hi_inv,
+    float *Hj_inv, vector *a_hydro_xSum, vector *a_hydro_ySum,
+    vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum,
+    vector *entropy_dtSum, mask_t mask, mask_t mask_2, short mask_cond) {
+
+#ifdef WITH_VECTORIZATION
+
+  vector r, r2, ri;
+  vector dx, dy, dz, dvx, dvy, dvz;
+  vector vjx, vjy, vjz;
+  vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv;
+  vector ui, uj;
+  vector hid_inv, hjd_inv;
+  vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
+  vector piax, piay, piaz;
+  vector pih_dt;
+  vector v_sig;
+  vector omega_ij, mu_ij, fac_mu, balsara;
+  vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
+
+  vector r_2, r2_2, ri_2;
+  vector dx_2, dy_2, dz_2, dvx_2, dvy_2, dvz_2;
+  vector vjx_2, vjy_2, vjz_2;
+  vector pjrho_2, grad_hj_2, pjPOrho2_2, balsara_j_2, cj_2, mj_2, hj_inv_2;
+  vector ui_2, uj_2;
+  vector hjd_inv_2;
+  vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2;
+  vector piax_2, piay_2, piaz_2;
+  vector pih_dt_2;
+  vector v_sig_2;
+  vector omega_ij_2, mu_ij_2, balsara_2;
+  vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2;
+
+  /* Fill vectors. */
+  mj.v = vec_load(Mj);
+  mj_2.v = vec_load(&Mj[VEC_SIZE]);
+  vjx.v = vec_load(Vjx);
+  vjx_2.v = vec_load(&Vjx[VEC_SIZE]);
+  vjy.v = vec_load(Vjy);
+  vjy_2.v = vec_load(&Vjy[VEC_SIZE]);
+  vjz.v = vec_load(Vjz);
+  vjz_2.v = vec_load(&Vjz[VEC_SIZE]);
+  dx.v = vec_load(Dx);
+  dx_2.v = vec_load(&Dx[VEC_SIZE]);
+  dy.v = vec_load(Dy);
+  dy_2.v = vec_load(&Dy[VEC_SIZE]);
+  dz.v = vec_load(Dz);
+  dz_2.v = vec_load(&Dz[VEC_SIZE]);
+
+  /* Get the radius and inverse radius. */
+  r2.v = vec_load(R2);
+  r2_2.v = vec_load(&R2[VEC_SIZE]);
+  ri = vec_reciprocal_sqrt(r2);
+  ri_2 = vec_reciprocal_sqrt(r2_2);
+  r.v = vec_mul(r2.v, ri.v);
+  r_2.v = vec_mul(r2_2.v, ri_2.v);
+
+  /* Get remaining properties. */
+  pjrho.v = vec_load(Pjrho);
+  pjrho_2.v = vec_load(&Pjrho[VEC_SIZE]);
+  grad_hj.v = vec_load(Grad_hj);
+  grad_hj_2.v = vec_load(&Grad_hj[VEC_SIZE]);
+  pjPOrho2.v = vec_load(PjPOrho2);
+  pjPOrho2_2.v = vec_load(&PjPOrho2[VEC_SIZE]);
+  balsara_j.v = vec_load(Balsara_j);
+  balsara_j_2.v = vec_load(&Balsara_j[VEC_SIZE]);
+  cj.v = vec_load(Cj);
+  cj_2.v = vec_load(&Cj[VEC_SIZE]);
+  hj_inv.v = vec_load(Hj_inv);
+  hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]);
+
+  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
+
+  /* Find the balsara switch. */
+  balsara.v = vec_add(balsara_i.v, balsara_j.v);
+  balsara_2.v = vec_add(balsara_i.v, balsara_j_2.v);
+
+  /* Get the kernel for hi. */
+  hid_inv = pow_dimension_plus_one_vec(hi_inv);
+  ui.v = vec_mul(r.v, hi_inv.v);
+  ui_2.v = vec_mul(r_2.v, hi_inv.v);
+  kernel_eval_dWdx_force_vec(&ui, &wi_dx);
+  kernel_eval_dWdx_force_vec(&ui_2, &wi_dx_2);
+  wi_dr.v = vec_mul(hid_inv.v, wi_dx.v);
+  wi_dr_2.v = vec_mul(hid_inv.v, wi_dx_2.v);
+
+  /* Get the kernel for hj. */
+  hjd_inv = pow_dimension_plus_one_vec(hj_inv);
+  hjd_inv_2 = pow_dimension_plus_one_vec(hj_inv_2);
+  uj.v = vec_mul(r.v, hj_inv.v);
+  uj_2.v = vec_mul(r_2.v, hj_inv_2.v);
+
+  /* Calculate the kernel for two particles. */
+  kernel_eval_dWdx_force_vec(&uj, &wj_dx);
+  kernel_eval_dWdx_force_vec(&uj_2, &wj_dx_2);
+
+  wj_dr.v = vec_mul(hjd_inv.v, wj_dx.v);
+  wj_dr_2.v = vec_mul(hjd_inv_2.v, wj_dx_2.v);
+
+  /* Compute dv. */
+  dvx.v = vec_sub(vix.v, vjx.v);
+  dvx_2.v = vec_sub(vix.v, vjx_2.v);
+  dvy.v = vec_sub(viy.v, vjy.v);
+  dvy_2.v = vec_sub(viy.v, vjy_2.v);
+  dvz.v = vec_sub(viz.v, vjz.v);
+  dvz_2.v = vec_sub(viz.v, vjz_2.v);
+
+  /* Compute dv dot r. */
+  dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v)));
+  dvdr_2.v = vec_fma(dvx_2.v, dx_2.v,
+                     vec_fma(dvy_2.v, dy_2.v, vec_mul(dvz_2.v, dz_2.v)));
+
+  /* Compute the relative velocity. (This is 0 if the particles move away from
+   * each other and negative otherwise) */
+  omega_ij.v = vec_fmin(dvdr.v, vec_setzero());
+  omega_ij_2.v = vec_fmin(dvdr_2.v, vec_setzero());
+  mu_ij.v =
+      vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v)); /* This is 0 or negative */
+  mu_ij_2.v = vec_mul(
+      fac_mu.v, vec_mul(ri_2.v, omega_ij_2.v)); /* This is 0 or negative */
+
+  /* Compute signal velocity */
+  v_sig.v = vec_fnma(vec_set1(3.f), mu_ij.v, vec_add(ci.v, cj.v));
+  v_sig_2.v = vec_fnma(vec_set1(3.f), mu_ij_2.v, vec_add(ci.v, cj_2.v));
+
+  /* Now construct the full viscosity term */
+  rho_ij.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho.v));
+  rho_ij_2.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho_2.v));
+
+  visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v,
+                           vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))),
+                   rho_ij.v);
+  visc_2.v =
+      vec_div(vec_mul(const_viscosity_alpha_fac.v,
+                      vec_mul(v_sig_2.v, vec_mul(mu_ij_2.v, balsara_2.v))),
+              rho_ij_2.v);
+
+  /* Now, convolve with the kernel */
+  visc_term.v =
+      vec_mul(vec_set1(0.5f),
+              vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v)));
+  visc_term_2.v = vec_mul(
+      vec_set1(0.5f),
+      vec_mul(visc_2.v, vec_mul(vec_add(wi_dr_2.v, wj_dr_2.v), ri_2.v)));
+
+  vector grad_hi_mul_piPOrho2;
+  grad_hi_mul_piPOrho2.v = vec_mul(grad_hi.v, piPOrho2.v);
+
+  sph_term.v =
+      vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr.v,
+                      vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))),
+              ri.v);
+  sph_term_2.v =
+      vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr_2.v,
+                      vec_mul(grad_hj_2.v, vec_mul(pjPOrho2_2.v, wj_dr_2.v))),
+              ri_2.v);
+
+  /* Eventually get the acceleration */
+  acc.v = vec_add(visc_term.v, sph_term.v);
+  acc_2.v = vec_add(visc_term_2.v, sph_term_2.v);
+
+  /* Use the force, Luke! */
+  piax.v = vec_mul(mj.v, vec_mul(dx.v, acc.v));
+  piax_2.v = vec_mul(mj_2.v, vec_mul(dx_2.v, acc_2.v));
+  piay.v = vec_mul(mj.v, vec_mul(dy.v, acc.v));
+  piay_2.v = vec_mul(mj_2.v, vec_mul(dy_2.v, acc_2.v));
+  piaz.v = vec_mul(mj.v, vec_mul(dz.v, acc.v));
+  piaz_2.v = vec_mul(mj_2.v, vec_mul(dz_2.v, acc_2.v));
+
+  /* Get the time derivative for h. */
+  pih_dt.v =
+      vec_div(vec_mul(mj.v, vec_mul(dvdr.v, vec_mul(ri.v, wi_dr.v))), pjrho.v);
+  pih_dt_2.v =
+      vec_div(vec_mul(mj_2.v, vec_mul(dvdr_2.v, vec_mul(ri_2.v, wi_dr_2.v))),
+              pjrho_2.v);
+
+  /* Change in entropy */
+  entropy_dt.v = vec_mul(mj.v, vec_mul(visc_term.v, dvdr.v));
+  entropy_dt_2.v = vec_mul(mj_2.v, vec_mul(visc_term_2.v, dvdr_2.v));
+
+  /* Store the forces back on the particles. */
+  if (mask_cond) {
+    a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask);
+    a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax_2.v, mask_2);
+    a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask);
+    a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay_2.v, mask_2);
+    a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz.v, mask);
+    a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz_2.v, mask_2);
+    h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt.v, mask);
+    h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt_2.v, mask_2);
+    v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig.v, mask));
+    v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig_2.v, mask_2));
+    entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask);
+    entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt_2.v, mask_2);
+  } else {
+    a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax.v);
+    a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax_2.v);
+    a_hydro_ySum->v = vec_sub(a_hydro_ySum->v, piay.v);
+    a_hydro_ySum->v = vec_sub(a_hydro_ySum->v, piay_2.v);
+    a_hydro_zSum->v = vec_sub(a_hydro_zSum->v, piaz.v);
+    a_hydro_zSum->v = vec_sub(a_hydro_zSum->v, piaz_2.v);
+    h_dtSum->v = vec_sub(h_dtSum->v, pih_dt.v);
+    h_dtSum->v = vec_sub(h_dtSum->v, pih_dt_2.v);
+    v_sigSum->v = vec_fmax(v_sigSum->v, v_sig.v);
+    v_sigSum->v = vec_fmax(v_sigSum->v, v_sig_2.v);
+    entropy_dtSum->v = vec_add(entropy_dtSum->v, entropy_dt.v);
+    entropy_dtSum->v = vec_add(entropy_dtSum->v, entropy_dt_2.v);
+  }
+#else
+
+  error(
+      "The Gadget2 serial version of runner_iact_nonsym_force was called when "
+      "the vectorised version should have been used.");
+
+#endif
+}
+
+#endif
+
 #endif /* SWIFT_GADGET2_HYDRO_IACT_H */
diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h
index 5355c15f8f1a0d0c3d811c7039da04caf0522cc9..d3c905e50ea023ea63629505fd3566cbed3ed9b0 100644
--- a/src/kernel_hydro.h
+++ b/src/kernel_hydro.h
@@ -438,9 +438,7 @@ static const vector cond = FILL_VEC(0.5f);
 
 /**
  * @brief Computes the kernel function and its derivative for two particles
- * using vectors.
- *
- * Return 0 if $u > \\gamma = H/h$
+ * using vectors. The return value is undefined if $u > \\gamma = H/h$.
  *
  * @param u The ratio of the distance to the smoothing length $u = x/h$.
  * @param w (return) The value of the kernel function $W(x,h)$.
@@ -518,9 +516,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
 
 /**
  * @brief Computes the kernel function and its derivative for two particles
- * using interleaved vectors.
- *
- * Return 0 if $u > \\gamma = H/h$
+ * using interleaved vectors. The return value is undefined if $u > \\gamma =
+ * H/h$.
  *
  * @param u The ratio of the distance to the smoothing length $u = x/h$.
  * @param w (return) The value of the kernel function $W(x,h)$.
@@ -644,9 +641,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
 
 /**
  * @brief Computes the kernel function for two particles
- * using vectors.
- *
- * Return 0 if $u > \\gamma = H/h$
+ * using vectors. The return value is undefined if $u > \\gamma = H/h$.
  *
  * @param u The ratio of the distance to the smoothing length $u = x/h$.
  * @param w (return) The value of the kernel function $W(x,h)$.
@@ -704,6 +699,65 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
       vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
 }
 
+/**
+ * @brief Computes the kernel function derivative for two particles
+ * using vectors. The return value is undefined if $u > \\gamma = H/h$.
+ *
+ * @param u The ratio of the distance to the smoothing length $u = x/h$.
+ * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
+ */
+__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
+    vector *u, vector *dw_dx) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  vector x;
+  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
+
+#ifdef WENDLAND_C2_KERNEL
+  /* Init the iteration for Horner's scheme. */
+  dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations */
+  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
+
+  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
+
+  dw_dx->v = vec_mul(dw_dx->v, x.v);
+
+#elif defined(CUBIC_SPLINE_KERNEL)
+  vector dw_dx2;
+  mask_t mask_reg1, mask_reg2;
+
+  /* Form a mask for each part of the kernel. */
+  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));  /* 0 < x < 0.5 */
+  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
+
+  /* Work out w for both regions of the kernel and combine the results together
+   * using masks. */
+
+  /* Init the iteration for Horner's scheme. */
+  dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
+  dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations. */
+  dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
+  dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
+
+  /* Mask out unneeded values. */
+  dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
+  dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
+
+  /* Added both dwdx and dwdx2 together to form complete result. */
+  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
+#else
+#error "Vectorisation not supported for this kernel!!!"
+#endif
+
+  /* Return everything */
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
+}
+
 /**
  * @brief Computes the kernel function derivative for two particles
  * using vectors.
@@ -713,7 +767,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
  * @param u The ratio of the distance to the smoothing length $u = x/h$.
  * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
  */
-__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
+__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec(
     vector *u, vector *dw_dx) {
 
   /* Go to the range [0,1[ from [0,H[ */
@@ -855,6 +909,9 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
   /* Return everything */
   dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
                                        kernel_gamma_inv_dim_plus_one_vec.v));
+  dw_dx_2->v = vec_mul(
+      dw_dx_2->v,
+      vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
 }
 
 #endif /* WITH_VECTORIZATION */
diff --git a/src/runner.c b/src/runner.c
index bb5f2685555504c24a40a32e5cbf635a7d4a7f1f..69c1512479e07da0aacad0a9e28bcaa6aafce104 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1864,9 +1864,13 @@ void *runner_main(void *data) {
           else if (t->subtype == task_subtype_gradient)
             runner_doself1_gradient(r, ci);
 #endif
-          else if (t->subtype == task_subtype_force)
+          else if (t->subtype == task_subtype_force) {
+#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
+            runner_doself2_force_vec(r, ci);
+#else
             runner_doself2_force(r, ci);
-          else if (t->subtype == task_subtype_grav)
+#endif
+          } else if (t->subtype == task_subtype_grav)
             runner_doself_grav(r, ci, 1);
           else if (t->subtype == task_subtype_external_grav)
             runner_do_grav_external(r, ci, 1);
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index 0dd6b2cdfd430e471fc13087afb0b2fcde947b97..14eecaa2c0c126b721a400e63298e1d55a9c98da 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -2664,9 +2664,14 @@ void DOSUB_SELF2(struct runner *r, struct cell *ci, int gettimer) {
   }
 
   /* Otherwise, compute self-interaction. */
-  else
+  else {
+#if (DOSELF2 == runner_doself2_force) && defined(WITH_VECTORIZATION) && \
+    defined(GADGET2_SPH)
+    runner_doself2_force_vec(r, ci);
+#else
     DOSELF2(r, ci);
-
+#endif
+  }
   if (gettimer) TIMER_TOC(TIMER_DOSUB_SELF);
 }
 
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index 1557c1ec84863c3518f8671cdf0e9ca44732e94e..2f0b311218c86a60c5c0853c555cb48479249bce 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -27,6 +27,8 @@
 #include "active.h"
 
 #ifdef WITH_VECTORIZATION
+static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);
+
 /**
  * @brief Compute the vector remainder interactions from the secondary cache.
  *
@@ -241,14 +243,12 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
  * @param di_max Maximal position on the axis that can interact in cell ci
  * @param dj_min Minimal position on the axis that can interact in cell ci
  * @param max_index_i array to hold the maximum distances of pi particles into
- * cell
- * cj
+ * #cell cj
  * @param max_index_j array to hold the maximum distances of pj particles into
- * cell
- * cj
+ * #cell cj
  * @param init_pi first pi to interact with a pj particle
  * @param init_pj last pj to interact with a pi particle
- * @param e The #engine.
+ * @param max_active_bin The largest time-bin active during this step.
  */
 __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
     const struct cell *ci, const struct cell *cj,
@@ -256,7 +256,7 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
     const float dx_max, const float rshift, const double hi_max,
     const double hj_max, const double di_max, const double dj_min,
     int *max_index_i, int *max_index_j, int *init_pi, int *init_pj,
-    const struct engine *e) {
+    const timebin_t max_active_bin) {
 
   const struct part *restrict parts_i = ci->parts;
   const struct part *restrict parts_j = cj->parts;
@@ -271,7 +271,8 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
   while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min) {
     first_pi--;
     /* Store the index of the particle if it is active. */
-    if (part_is_active(&parts_i[sort_i[first_pi].i], e)) active_id = first_pi;
+    if (part_is_active_no_debug(&parts_i[sort_i[first_pi].i], max_active_bin))
+      active_id = first_pi;
   }
 
   /* Set the first active pi in range of any particle in cell j. */
@@ -318,7 +319,8 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
          sort_j[last_pj + 1].d - hj_max - dx_max < di_max) {
     last_pj++;
     /* Store the index of the particle if it is active. */
-    if (part_is_active(&parts_j[sort_j[last_pj].i], e)) active_id = last_pj;
+    if (part_is_active_no_debug(&parts_j[sort_j[last_pj].i], max_active_bin))
+      active_id = last_pj;
   }
 
   /* Set the last active pj in range of any particle in cell i. */
@@ -381,6 +383,8 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
   struct part *restrict parts = c->parts;
   const int count = c->count;
 
+  const timebin_t max_active_bin = e->max_active_bin;
+
   vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;
 
   TIMER_TIC
@@ -411,7 +415,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
     pi = &parts[pid];
 
     /* Is the ith particle active? */
-    if (!part_is_active(pi, e)) continue;
+    if (!part_is_active_no_debug(pi, max_active_bin)) continue;
 
     vector pix, piy, piz;
 
@@ -481,22 +485,22 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
 
       /* Compute the pairwise distance. */
-      vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
-      vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2;
-
-      v_dx_tmp.v = vec_sub(pix.v, pjx.v);
-      v_dx_tmp2.v = vec_sub(pix.v, pjx2.v);
-      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
-      v_dy_tmp2.v = vec_sub(piy.v, pjy2.v);
-      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
-      v_dz_tmp2.v = vec_sub(piz.v, pjz2.v);
-
-      v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
-      v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v);
-      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v);
-      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
-      v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
+      vector v_dx, v_dy, v_dz;
+      vector v_dx_2, v_dy_2, v_dz_2, v_r2_2;
+
+      v_dx.v = vec_sub(pix.v, pjx.v);
+      v_dx_2.v = vec_sub(pix.v, pjx2.v);
+      v_dy.v = vec_sub(piy.v, pjy.v);
+      v_dy_2.v = vec_sub(piy.v, pjy2.v);
+      v_dz.v = vec_sub(piz.v, pjz.v);
+      v_dz_2.v = vec_sub(piz.v, pjz2.v);
+
+      v_r2.v = vec_mul(v_dx.v, v_dx.v);
+      v_r2_2.v = vec_mul(v_dx_2.v, v_dx_2.v);
+      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
+      v_r2_2.v = vec_fma(v_dy_2.v, v_dy_2.v, v_r2_2.v);
+      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
+      v_r2_2.v = vec_fma(v_dz_2.v, v_dz_2.v, v_r2_2.v);
 
       /* Form a mask from r2 < hig2 and r2 > 0.*/
       mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2,
@@ -526,19 +530,18 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       /* If there are any interactions left pack interaction values into c2
        * cache. */
       if (doi_mask) {
-        storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp,
-                          cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum,
-                          &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-                          &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy,
-                          v_viz);
-      }
-      if (doi_mask2) {
-        storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2,
-                          &v_dy_tmp2, &v_dz_tmp2, cell_cache, &int_cache,
-                          &icount, &rhoSum, &rho_dhSum, &wcountSum,
+        storeInteractions(doi_mask, pjd, &v_r2, &v_dx, &v_dy, &v_dz, cell_cache,
+                          &int_cache, &icount, &rhoSum, &rho_dhSum, &wcountSum,
                           &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
                           &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
       }
+      if (doi_mask2) {
+        storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_2, &v_dy_2,
+                          &v_dz_2, cell_cache, &int_cache, &icount, &rhoSum,
+                          &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
+                          &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix,
+                          v_viy, v_viz);
+      }
     }
 
     /* Perform padded vector remainder interactions if any are present. */
@@ -583,6 +586,202 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
 #endif /* WITH_VECTORIZATION */
 }
 
+/**
+ * @brief Compute the force cell self-interaction (non-symmetric) using vector
+ * intrinsics with one particle pi at a time.
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+__attribute__((always_inline)) INLINE void runner_doself2_force_vec(
+    struct runner *r, struct cell *restrict c) {
+
+#ifdef WITH_VECTORIZATION
+  const struct engine *e = r->e;
+  struct part *restrict pi;
+  int count_align;
+  const int num_vec_proc = 1;
+
+  const timebin_t max_active_bin = e->max_active_bin;
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;
+  vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci;
+
+  TIMER_TIC;
+
+  if (!cell_is_active(c, e)) return;
+
+  if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell.");
+
+  /* Get the particle cache from the runner and re-allocate
+   * the cache if it is not big enough for the cell. */
+  struct cache *restrict cell_cache = &r->ci_cache;
+
+  if (cell_cache->count < count) {
+    cache_init(cell_cache, count);
+  }
+
+  /* Read the particles from the cell and store them locally in the cache. */
+  cache_read_force_particles(c, cell_cache);
+
+#ifdef SWIFT_DEBUG_CHECKS
+  for (int i = 0; i < count; i++) {
+    pi = &c->parts[i];
+    /* Check that particles have been drifted to the current time */
+    if (pi->ti_drift != e->ti_current)
+      error("Particle pi not drifted to current time");
+  }
+#endif
+
+  /* Loop over the particles in the cell. */
+  for (int pid = 0; pid < count; pid++) {
+
+    /* Get a pointer to the ith particle. */
+    pi = &parts[pid];
+
+    /* Is the ith particle active? */
+    if (!part_is_active_no_debug(pi, max_active_bin)) continue;
+
+    vector pix, piy, piz;
+
+    const float hi = cell_cache->h[pid];
+
+    /* Fill particle pi vectors. */
+    pix.v = vec_set1(cell_cache->x[pid]);
+    piy.v = vec_set1(cell_cache->y[pid]);
+    piz.v = vec_set1(cell_cache->z[pid]);
+    v_hi.v = vec_set1(hi);
+    v_vix.v = vec_set1(cell_cache->vx[pid]);
+    v_viy.v = vec_set1(cell_cache->vy[pid]);
+    v_viz.v = vec_set1(cell_cache->vz[pid]);
+
+    v_rhoi.v = vec_set1(cell_cache->rho[pid]);
+    v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]);
+    v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]);
+    v_balsara_i.v = vec_set1(cell_cache->balsara[pid]);
+    v_ci.v = vec_set1(cell_cache->soundspeed[pid]);
+
+    const float hig2 = hi * hi * kernel_gamma2;
+    v_hig2.v = vec_set1(hig2);
+
+    /* Reset cumulative sums of update vectors. */
+    vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum,
+        entropy_dtSum;
+
+    /* Get the inverse of hi. */
+    vector v_hi_inv;
+
+    v_hi_inv = vec_reciprocal(v_hi);
+
+    a_hydro_xSum.v = vec_setzero();
+    a_hydro_ySum.v = vec_setzero();
+    a_hydro_zSum.v = vec_setzero();
+    h_dtSum.v = vec_setzero();
+    v_sigSum.v = vec_set1(pi->force.v_sig);
+    entropy_dtSum.v = vec_setzero();
+
+    /* Pad cache if there is a serial remainder. */
+    count_align = count;
+    int rem = count % (num_vec_proc * VEC_SIZE);
+    if (rem != 0) {
+      int pad = (num_vec_proc * VEC_SIZE) - rem;
+
+      count_align += pad;
+
+      /* Set positions to the same as particle pi so when the r2 > 0 mask is
+       * applied these extra contributions are masked out.*/
+      for (int i = count; i < count_align; i++) {
+        cell_cache->x[i] = pix.f[0];
+        cell_cache->y[i] = piy.f[0];
+        cell_cache->z[i] = piz.f[0];
+        cell_cache->h[i] = 1.f;
+        cell_cache->rho[i] = 1.f;
+        cell_cache->grad_h[i] = 1.f;
+        cell_cache->pOrho2[i] = 1.f;
+        cell_cache->balsara[i] = 1.f;
+        cell_cache->soundspeed[i] = 1.f;
+      }
+    }
+
+    vector pjx, pjy, pjz, hj, hjg2;
+
+    /* Find all of particle pi's interacions and store needed values in the
+     * secondary cache.*/
+    for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
+
+      /* Load 1 set of vectors from the particle cache. */
+      pjx.v = vec_load(&cell_cache->x[pjd]);
+      pjy.v = vec_load(&cell_cache->y[pjd]);
+      pjz.v = vec_load(&cell_cache->z[pjd]);
+      hj.v = vec_load(&cell_cache->h[pjd]);
+      hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v);
+
+      /* Compute the pairwise distance. */
+      vector v_dx, v_dy, v_dz;
+
+      v_dx.v = vec_sub(pix.v, pjx.v);
+      v_dy.v = vec_sub(piy.v, pjy.v);
+      v_dz.v = vec_sub(piz.v, pjz.v);
+
+      v_r2.v = vec_mul(v_dx.v, v_dx.v);
+      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
+      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
+
+      /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */
+      mask_t v_doi_mask, v_doi_mask_self_check;
+      int doi_mask;
+
+      /* Form r2 > 0 mask.*/
+      vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero()));
+
+      /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */
+      vector v_h2;
+      v_h2.v = vec_fmax(v_hig2.v, hjg2.v);
+      vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v));
+
+      /* Combine all 3 masks and form integer mask. */
+      vec_combine_masks(v_doi_mask, v_doi_mask_self_check);
+      doi_mask = vec_form_int_mask(v_doi_mask);
+
+      /* If there are any interactions perform them. */
+      if (doi_mask) {
+        vector v_hj_inv;
+        v_hj_inv = vec_reciprocal(hj);
+
+        /* To stop floating point exceptions for when particle separations are
+         * 0.
+        */
+        v_r2.v = vec_add(v_r2.v, vec_set1(FLT_MIN));
+
+        runner_iact_nonsym_1_vec_force(
+            &v_r2, &v_dx, &v_dy, &v_dz, v_vix, v_viy, v_viz, v_rhoi, v_grad_hi,
+            v_pOrhoi2, v_balsara_i, v_ci, &cell_cache->vx[pjd],
+            &cell_cache->vy[pjd], &cell_cache->vz[pjd], &cell_cache->rho[pjd],
+            &cell_cache->grad_h[pjd], &cell_cache->pOrho2[pjd],
+            &cell_cache->balsara[pjd], &cell_cache->soundspeed[pjd],
+            &cell_cache->m[pjd], v_hi_inv, v_hj_inv, &a_hydro_xSum,
+            &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum,
+            v_doi_mask);
+      }
+
+    } /* Loop over all other particles. */
+
+    VEC_HADD(a_hydro_xSum, pi->a_hydro[0]);
+    VEC_HADD(a_hydro_ySum, pi->a_hydro[1]);
+    VEC_HADD(a_hydro_zSum, pi->a_hydro[2]);
+    VEC_HADD(h_dtSum, pi->force.h_dt);
+    VEC_HMAX(v_sigSum, pi->force.v_sig);
+    VEC_HADD(entropy_dtSum, pi->entropy_dt);
+
+  } /* loop over all particles. */
+
+  TIMER_TOC(timer_doself_force);
+#endif /* WITH_VECTORIZATION */
+}
+
 /**
  * @brief Compute the density interactions between a cell pair (non-symmetric)
  * using vector intrinsics.
@@ -599,6 +798,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
 #ifdef WITH_VECTORIZATION
   const struct engine *restrict e = r->e;
+  const timebin_t max_active_bin = e->max_active_bin;
 
   vector v_hi, v_vix, v_viy, v_viz, v_hig2;
 
@@ -661,7 +861,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   for (int pid = count_i - 1;
        pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
     struct part *restrict pi = &parts_i[sort_i[pid].i];
-    if (part_is_active(pi, e)) {
+    if (part_is_active_no_debug(pi, max_active_bin)) {
       numActive++;
       break;
     }
@@ -671,7 +871,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
     for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
          pjd++) {
       struct part *restrict pj = &parts_j[sort_j[pjd].i];
-      if (part_is_active(pj, e)) {
+      if (part_is_active_no_debug(pj, max_active_bin)) {
         numActive++;
         break;
       }
@@ -700,12 +900,12 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
   max_index_j = r->cj_cache.max_index;
 
   /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[].
-   */
+  */
   /* Also find the first pi that interacts with any particle in cj and the last
    * pj that interacts with any particle in ci. */
   populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max,
                               hj_max, di_max, dj_min, max_index_i, max_index_j,
-                              &first_pi, &last_pj, e);
+                              &first_pi, &last_pj, max_active_bin);
 
   /* Limits of the outer loops. */
   int first_pi_loop = first_pi;
@@ -733,7 +933,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
       /* Get a hold of the ith part in ci. */
       struct part *restrict pi = &parts_i[sort_i[pid].i];
-      if (!part_is_active(pi, e)) continue;
+      if (!part_is_active_no_debug(pi, max_active_bin)) continue;
 
       /* Set the cache index. */
       int ci_cache_idx = pid - first_pi_align;
@@ -801,8 +1001,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
         vector v_dx, v_dy, v_dz, v_r2;
 
 #ifdef SWIFT_DEBUG_CHECKS
-        if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0) {
-          error("Unaligned read!!! cj_cache_idx=%d", cj_cache_idx);
+        if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 ||
+            cj_cache_idx + (VEC_SIZE - 1) > (last_pj_align + 1 + VEC_SIZE)) {
+          error("Unaligned read!!! cj_cache_idx=%d, last_pj_align=%d",
+                cj_cache_idx, last_pj_align);
         }
 #endif
 
@@ -861,7 +1063,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
 
       /* Get a hold of the jth part in cj. */
       struct part *restrict pj = &parts_j[sort_j[pjd].i];
-      if (!part_is_active(pj, e)) continue;
+      if (!part_is_active_no_debug(pj, max_active_bin)) continue;
 
       /* Set the cache index. */
       int cj_cache_idx = pjd;
@@ -927,8 +1129,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
            ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) {
 
 #ifdef SWIFT_DEBUG_CHECKS
-        if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0) {
-          error("Unaligned read!!! ci_cache_idx=%d", ci_cache_idx);
+        if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0 ||
+            ci_cache_idx + (VEC_SIZE - 1) >
+                (count_i - first_pi_align + VEC_SIZE)) {
+          error(
+              "Unaligned read!!! ci_cache_idx=%d, first_pi_align=%d, "
+              "count_i=%d",
+              ci_cache_idx, first_pi_align, count_i);
         }
 #endif
 
@@ -969,7 +1176,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
       } /* loop over the parts in ci. */
 
       /* Perform horizontal adds on vector sums and store result in particle pj.
-       */
+      */
       VEC_HADD(rhoSum, pj->rho);
       VEC_HADD(rho_dhSum, pj->density.rho_dh);
       VEC_HADD(wcountSum, pj->density.wcount);
diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h
index 13c88cebaa47c5051b4119033a055e49a9a1079c..09dc76ef04df5d29ea32f4af24efdc09e433aa73 100644
--- a/src/runner_doiact_vec.h
+++ b/src/runner_doiact_vec.h
@@ -35,6 +35,7 @@
 
 /* Function prototypes. */
 void runner_doself1_density_vec(struct runner *r, struct cell *restrict c);
+void runner_doself2_force_vec(struct runner *r, struct cell *restrict c);
 void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci,
                                 struct cell *restrict cj, const int sid,
                                 const double *shift);
diff --git a/src/statistics.c b/src/statistics.c
index 5a3f1ff4f9a2232a14817e7e55fd2cff5bdcd80e..b06f087c70877fa5dda451300fcb4bf665b011be 100644
--- a/src/statistics.c
+++ b/src/statistics.c
@@ -74,6 +74,9 @@ void stats_add(struct statistics *a, const struct statistics *b) {
   a->ang_mom[0] += b->ang_mom[0];
   a->ang_mom[1] += b->ang_mom[1];
   a->ang_mom[2] += b->ang_mom[2];
+  a->centre_of_mass[0] += b->centre_of_mass[0];
+  a->centre_of_mass[1] += b->centre_of_mass[1];
+  a->centre_of_mass[2] += b->centre_of_mass[2];
 }
 
 /**
@@ -129,8 +132,8 @@ void stats_collect_part_mapper(void *map_data, int nr_parts, void *extra_data) {
     /* Get useful time variables */
     const integertime_t ti_begin =
         get_integer_time_begin(ti_current, p->time_bin);
-    const integertime_t ti_step = get_integer_timestep(p->time_bin);
-    const float dt = (ti_current - (ti_begin + ti_step / 2)) * timeBase;
+    const integertime_t ti_end = get_integer_time_end(ti_current, p->time_bin);
+    const float dt = (ti_current - ((ti_begin + ti_end) / 2)) * timeBase;
 
     /* Get the total acceleration */
     float a_tot[3] = {p->a_hydro[0], p->a_hydro[1], p->a_hydro[2]};
@@ -147,10 +150,17 @@ void stats_collect_part_mapper(void *map_data, int nr_parts, void *extra_data) {
 
     const float m = hydro_get_mass(p);
     const double x[3] = {p->x[0], p->x[1], p->x[2]};
+    const float rho = hydro_get_density(p);
+    const float u_int = hydro_get_internal_energy(p);
 
     /* Collect mass */
     stats.mass += m;
 
+    /* Collect centre of mass */
+    stats.centre_of_mass[0] += m * x[0];
+    stats.centre_of_mass[1] += m * x[1];
+    stats.centre_of_mass[2] += m * x[2];
+
     /* Collect momentum */
     stats.mom[0] += m * v[0];
     stats.mom[1] += m * v[1];
@@ -163,7 +173,7 @@ void stats_collect_part_mapper(void *map_data, int nr_parts, void *extra_data) {
 
     /* Collect energies. */
     stats.E_kin += 0.5f * m * (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
-    stats.E_int += m * hydro_get_internal_energy(p);
+    stats.E_int += m * u_int;
     stats.E_rad += cooling_get_radiated_energy(xp);
     if (gp != NULL) {
       stats.E_pot_self += m * gravity_get_potential(gp);
@@ -172,7 +182,7 @@ void stats_collect_part_mapper(void *map_data, int nr_parts, void *extra_data) {
     }
 
     /* Collect entropy */
-    stats.entropy += m * hydro_get_entropy(p);
+    stats.entropy += m * gas_entropy_from_internal_energy(rho, u_int);
   }
 
   /* Now write back to memory */
@@ -219,8 +229,8 @@ void stats_collect_gpart_mapper(void *map_data, int nr_gparts,
     /* Get useful variables */
     const integertime_t ti_begin =
         get_integer_time_begin(ti_current, gp->time_bin);
-    const integertime_t ti_step = get_integer_timestep(gp->time_bin);
-    const float dt = (ti_current - (ti_begin + ti_step / 2)) * timeBase;
+    const integertime_t ti_end = get_integer_time_end(ti_current, gp->time_bin);
+    const float dt = (ti_current - ((ti_begin + ti_end) / 2)) * timeBase;
 
     /* Extrapolate velocities */
     const float v[3] = {gp->v_full[0] + gp->a_grav[0] * dt,
@@ -233,6 +243,11 @@ void stats_collect_gpart_mapper(void *map_data, int nr_gparts,
     /* Collect mass */
     stats.mass += m;
 
+    /* Collect centre of mass */
+    stats.centre_of_mass[0] += m * x[0];
+    stats.centre_of_mass[1] += m * x[1];
+    stats.centre_of_mass[2] += m * x[2];
+
     /* Collect momentum */
     stats.mom[0] += m * v[0];
     stats.mom[1] += m * v[1];
@@ -279,6 +294,18 @@ void stats_collect(const struct space *s, struct statistics *stats) {
                    s->nr_gparts, sizeof(struct gpart), 0, &extra_data);
 }
 
+/**
+ * @brief Apply final opetations on the #stats.
+ *
+ * @param stats The #stats to work on.
+ */
+void stats_finalize(struct statistics *stats) {
+
+  stats->centre_of_mass[0] /= stats->mass;
+  stats->centre_of_mass[1] /= stats->mass;
+  stats->centre_of_mass[2] /= stats->mass;
+}
+
 /**
  * @brief Prints the content of a #statistics aggregator to a file
  *
@@ -294,11 +321,12 @@ void stats_print_to_file(FILE *file, const struct statistics *stats,
 
   fprintf(file,
           " %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e "
-          "%14e %14e %14e\n",
+          "%14e %14e %14e %14e %14e %14e\n",
           time, stats->mass, E_tot, stats->E_kin, stats->E_int, E_pot,
           stats->E_pot_self, stats->E_pot_ext, stats->E_rad, stats->entropy,
           stats->mom[0], stats->mom[1], stats->mom[2], stats->ang_mom[0],
-          stats->ang_mom[1], stats->ang_mom[2]);
+          stats->ang_mom[1], stats->ang_mom[2], stats->centre_of_mass[0],
+          stats->centre_of_mass[1], stats->centre_of_mass[2]);
   fflush(file);
 }
 
diff --git a/src/statistics.h b/src/statistics.h
index b007d1314c458254986862f99b7cb1669d995545..e8cddda1e855d156070b8a000cb15b515f127740 100644
--- a/src/statistics.h
+++ b/src/statistics.h
@@ -58,6 +58,9 @@ struct statistics {
   /*! Angular momentum */
   double ang_mom[3];
 
+  /*! Centre of mass */
+  double centre_of_mass[3];
+
   /*! Lock for threaded access */
   swift_lock_type lock;
 };
@@ -67,6 +70,7 @@ void stats_add(struct statistics* a, const struct statistics* b);
 void stats_print_to_file(FILE* file, const struct statistics* stats,
                          double time);
 void stats_init(struct statistics* s);
+void stats_finalize(struct statistics* s);
 
 #ifdef WITH_MPI
 extern MPI_Datatype statistics_mpi_type;
diff --git a/src/vector.h b/src/vector.h
index 6a7c6837989025785c1f9134004f2ebcc226a205..e463bbd2e62f0c6b51446d22a9805670b134e684 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -91,6 +91,8 @@
 #define vec_init_mask_true(mask) ({ mask = 0xFFFF; })
 #define vec_zero_mask(mask) ({ mask = 0; })
 #define vec_create_mask(mask, cond) ({ mask = cond; })
+#define vec_combine_masks(mask1, mask2) \
+  ({ mask1 = vec_mask_and(mask1, mask2); })
 #define vec_pad_mask(mask, pad) ({ mask = mask >> (pad); })
 #define vec_blend(mask, a, b) _mm512_mask_blend_ps(mask, a, b)
 #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
@@ -186,6 +188,8 @@
 #define vec_and_mask(a, mask) _mm256_and_ps(a, mask.v)
 #define vec_init_mask_true(mask) mask.m = vec_setint1(0xFFFFFFFF)
 #define vec_create_mask(mask, cond) mask.v = cond
+#define vec_combine_masks(mask1, mask2) \
+  ({ mask1.v = vec_mask_and(mask1, mask2); })
 #define vec_zero_mask(mask) mask.v = vec_setzero()
 #define vec_pad_mask(mask, pad) \
   for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0
diff --git a/src/xmf.c b/src/xmf.c
index 67682b4794ade773c39a748eddf765e392c74865..3c8bb257cca224a06e5b516be46e92dd0006183a 100644
--- a/src/xmf.c
+++ b/src/xmf.c
@@ -205,6 +205,7 @@ void xmf_write_groupfooter(FILE* xmfFile, enum part_type ptype) {
  */
 int xmf_precision(enum IO_DATA_TYPE type) {
   switch (type) {
+    case INT:
     case FLOAT:
       return 4;
       break;
@@ -233,6 +234,7 @@ const char* xmf_type(enum IO_DATA_TYPE type) {
     case DOUBLE:
       return "Float";
       break;
+    case INT:
     case ULONGLONG:
     case LONGLONG:
       return "Int";
diff --git a/tests/Makefile.am b/tests/Makefile.am
index bf4c205b5ab5000e246693c4d45772d16f67a5e5..553980a93e907e83b65bb4539ca49c8bc1b7207b 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -32,7 +32,7 @@ TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetr
 check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \
 		 testSPHStep testActivePair test27cells test125cells testParser \
                  testKernel testFFT testInteractions testMaths \
-                 testSymmetry testThreadpool benchmarkInteractions \
+                 testSymmetry testThreadpool \
                  testAdiabaticIndex testRiemannExact testRiemannTRRS \
                  testRiemannHLLC testMatrixInversion testDump testLogger \
 		 testVoronoi1D testVoronoi2D testVoronoi3D testPeriodicBC \
@@ -72,8 +72,6 @@ testFFT_SOURCES = testFFT.c
 
 testInteractions_SOURCES = testInteractions.c
 
-benchmarkInteractions_SOURCES = benchmarkInteractions.c
-
 testAdiabaticIndex_SOURCES = testAdiabaticIndex.c
 
 testRiemannExact_SOURCES = testRiemannExact.c
diff --git a/tests/benchmarkInteractions.c b/tests/benchmarkInteractions.c
deleted file mode 100644
index 5dc7a4ae85e62313a84d0c0be88b65a1c0bb59df..0000000000000000000000000000000000000000
--- a/tests/benchmarkInteractions.c
+++ /dev/null
@@ -1,525 +0,0 @@
-/*******************************************************************************
- * This file is part of SWIFT.
- * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- ******************************************************************************/
-
-#include <fenv.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include "swift.h"
-
-#define array_align sizeof(float) * VEC_SIZE
-#define ACC_THRESHOLD 1e-5
-
-#ifdef NONSYM_DENSITY
-#define IACT runner_iact_nonsym_density
-#define IACT_VEC runner_iact_nonsym_2_vec_density
-#define IACT_NAME "test_nonsym_density"
-#define NUM_VEC_PROC_INT 2
-#endif
-
-#ifdef SYM_DENSITY
-#define IACT runner_iact_density
-#define IACT_VEC runner_iact_vec_density
-#define IACT_NAME "test_sym_density"
-#endif
-
-#ifdef NONSYM_FORCE
-#define IACT runner_iact_nonsym_force
-#define IACT_VEC runner_iact_nonsym_vec_force
-#define IACT_NAME "test_nonsym_force"
-#endif
-
-#ifdef SYM_FORCE
-#define IACT runner_iact_force
-#define IACT_VEC runner_iact_vec_force
-#define IACT_NAME "test_sym_force"
-#endif
-
-#ifndef IACT
-#define IACT runner_iact_nonsym_density
-#define IACT_VEC runner_iact_nonsym_1_vec_density
-#define IACT_NAME "test_nonsym_density"
-#define NUM_VEC_PROC_INT 1
-#endif
-
-/**
- * @brief Constructs an array of particles in a valid state prior to
- * a IACT_NONSYM and IACT_NONSYM_VEC call.
- *
- * @param count No. of particles to create
- * @param offset The position of the particle offset from (0,0,0).
- * @param spacing Particle spacing.
- * @param h The smoothing length of the particles in units of the inter-particle
- *separation.
- * @param partId The running counter of IDs.
- */
-struct part *make_particles(size_t count, double *offset, double spacing,
-                            double h, long long *partId) {
-
-  struct part *particles;
-  if (posix_memalign((void **)&particles, part_align,
-                     count * sizeof(struct part)) != 0) {
-    error("couldn't allocate particles, no. of particles: %d", (int)count);
-  }
-  bzero(particles, count * sizeof(struct part));
-
-  /* Construct the particles */
-  struct part *p;
-
-  /* Set test particle at centre of unit sphere. */
-  p = &particles[0];
-
-  /* Place the test particle at the centre of a unit sphere. */
-  p->x[0] = 0.0f;
-  p->x[1] = 0.0f;
-  p->x[2] = 0.0f;
-
-  p->h = h;
-  p->id = ++(*partId);
-
-#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH)
-  p->mass = 1.0f;
-#endif
-
-  /* Place rest of particles around the test particle
-   * with random position within a unit sphere. */
-  for (size_t i = 1; i < count; ++i) {
-    p = &particles[i];
-
-    /* Randomise positions within a unit sphere. */
-    p->x[0] = random_uniform(-1.0, 1.0);
-    p->x[1] = random_uniform(-1.0, 1.0);
-    p->x[2] = random_uniform(-1.0, 1.0);
-
-    /* Randomise velocities. */
-    p->v[0] = random_uniform(-0.05, 0.05);
-    p->v[1] = random_uniform(-0.05, 0.05);
-    p->v[2] = random_uniform(-0.05, 0.05);
-
-    p->h = h;
-    p->id = ++(*partId);
-#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH)
-    p->mass = 1.0f;
-#endif
-  }
-  return particles;
-}
-
-/**
- * @brief Populates particle properties needed for the force calculation.
- */
-void prepare_force(struct part *parts, size_t count) {
-
-#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) && !defined(MINIMAL_SPH)
-  struct part *p;
-  for (size_t i = 0; i < count; ++i) {
-    p = &parts[i];
-    p->rho = i + 1;
-    p->force.balsara = random_uniform(0.0, 1.0);
-    p->force.P_over_rho2 = i + 1;
-    p->force.soundspeed = random_uniform(2.0, 3.0);
-    p->force.v_sig = 0.0f;
-    p->force.h_dt = 0.0f;
-  }
-#endif
-}
-
-/**
- * @brief Dumps all particle information to a file
- */
-void dump_indv_particle_fields(char *fileName, struct part *p) {
-
-  FILE *file = fopen(fileName, "a");
-
-  fprintf(file,
-          "%6llu %10f %10f %10f %10f %10f %10f %10e %10e %10e %13e %13e %13e "
-          "%13e %13e %13e %13e "
-          "%13e %13e %13e\n",
-          p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2],
-          p->a_hydro[0], p->a_hydro[1], p->a_hydro[2],
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-          0., 0.,
-#else
-          p->rho, p->density.rho_dh,
-#endif
-          p->density.wcount, p->density.wcount_dh, p->force.h_dt,
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-          0.,
-#else
-          p->force.v_sig,
-#endif
-#if defined(MINIMAL_SPH) || defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
-          0., 0., 0., 0.
-#else
-          p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
-          p->density.rot_v[2]
-#endif
-          );
-  fclose(file);
-}
-
-/**
- * @brief Creates a header for the output file
- */
-void write_header(char *fileName) {
-
-  FILE *file = fopen(fileName, "w");
-  /* Write header */
-  fprintf(file,
-          "# %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s %13s %13s %13s "
-          "%13s %13s %13s %13s"
-          "%13s %13s %13s %13s\n",
-          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "a_x", "a_y",
-          "a_z", "rho", "rho_dh", "wcount", "wcount_dh", "dh/dt", "v_sig",
-          "div_v", "curl_vx", "curl_vy", "curl_vz", "dS/dt");
-  fprintf(file, "\n# PARTICLES BEFORE INTERACTION:\n");
-  fclose(file);
-}
-
-/**
- * @brief Compares the vectorised result against
- * the serial result of the interaction.
- *
- * @param serial_test_part Particle that has been updated serially
- * @param serial_parts Particle array that has been interacted serially
- * @param vec_test_part Particle that has been updated using vectors
- * @param vec_parts Particle array to be interacted using vectors
- * @param count No. of particles that have been interacted
- *
- * @return Non-zero value if difference found, 0 otherwise
- */
-int check_results(struct part serial_test_part, struct part *serial_parts,
-                  struct part vec_test_part, struct part *vec_parts,
-                  int count) {
-  int result = 0;
-  result += compare_particles(serial_test_part, vec_test_part, ACC_THRESHOLD);
-
-  for (int i = 0; i < count; i++)
-    result += compare_particles(serial_parts[i], vec_parts[i], ACC_THRESHOLD);
-
-  return result;
-}
-
-/*
- * @brief Calls the serial and vectorised version of the non-symmetrical density
- * interaction.
- *
- * @param test_part Particle that will be updated
- * @param parts Particle array to be interacted
- * @param count No. of particles to be interacted
- * @param serial_inter_func Serial interaction function to be called
- * @param vec_inter_func Vectorised interaction function to be called
- * @param runs No. of times to call interactions
- *
- */
-void test_interactions(struct part test_part, struct part *parts, size_t count,
-                       char *filePrefix, int runs) {
-
-  ticks serial_time = 0;
-#ifdef WITH_VECTORIZATION
-  ticks vec_time = 0;
-#endif
-
-  FILE *file;
-  char serial_filename[200] = "";
-  char vec_filename[200] = "";
-
-  strcpy(serial_filename, filePrefix);
-  strcpy(vec_filename, filePrefix);
-  sprintf(serial_filename + strlen(serial_filename), "_serial.dat");
-  sprintf(vec_filename + strlen(vec_filename), "_vec.dat");
-
-  write_header(serial_filename);
-  write_header(vec_filename);
-
-  struct part pi_serial, pi_vec;
-  struct part pj_serial[count], pj_vec[count];
-
-  float r2[count] __attribute__((aligned(array_align)));
-  float dx[3 * count] __attribute__((aligned(array_align)));
-
-#ifdef WITH_VECTORIZATION
-  struct part *piq[count], *pjq[count];
-  for (size_t k = 0; k < count; k++) {
-    piq[k] = NULL;
-    pjq[k] = NULL;
-  }
-
-  float r2q[count] __attribute__((aligned(array_align)));
-  float hiq[count] __attribute__((aligned(array_align)));
-  float dxq[count] __attribute__((aligned(array_align)));
-
-  float dyq[count] __attribute__((aligned(array_align)));
-  float dzq[count] __attribute__((aligned(array_align)));
-  float mjq[count] __attribute__((aligned(array_align)));
-  float vixq[count] __attribute__((aligned(array_align)));
-  float viyq[count] __attribute__((aligned(array_align)));
-  float vizq[count] __attribute__((aligned(array_align)));
-  float vjxq[count] __attribute__((aligned(array_align)));
-  float vjyq[count] __attribute__((aligned(array_align)));
-  float vjzq[count] __attribute__((aligned(array_align)));
-#endif
-
-  /* Call serial interaction a set number of times. */
-  for (int k = 0; k < runs; k++) {
-    /* Reset particle to initial setup */
-    pi_serial = test_part;
-    for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i];
-
-    /* Only dump data on first run. */
-    if (k == 0) {
-      /* Dump state of particles before serial interaction. */
-      dump_indv_particle_fields(serial_filename, &pi_serial);
-      for (size_t i = 0; i < count; i++)
-        dump_indv_particle_fields(serial_filename, &pj_serial[i]);
-    }
-
-    /* Perform serial interaction */
-    for (size_t i = 0; i < count; i++) {
-      /* Compute the pairwise distance. */
-      r2[i] = 0.0f;
-      for (int k = 0; k < 3; k++) {
-        int ind = (3 * i) + k;
-        dx[ind] = pi_serial.x[k] - pj_serial[i].x[k];
-        r2[i] += dx[ind] * dx[ind];
-      }
-    }
-
-    const ticks tic = getticks();
-/* Perform serial interaction */
-#ifdef __ICC
-#pragma novector
-#endif
-    for (size_t i = 0; i < count; i++) {
-      IACT(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h, &pi_serial,
-           &pj_serial[i]);
-    }
-    serial_time += getticks() - tic;
-  }
-
-  file = fopen(serial_filename, "a");
-  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
-  fclose(file);
-
-  /* Dump result of serial interaction. */
-  dump_indv_particle_fields(serial_filename, &pi_serial);
-  for (size_t i = 0; i < count; i++)
-    dump_indv_particle_fields(serial_filename, &pj_serial[i]);
-
-  /* Call vector interaction a set number of times. */
-  for (int k = 0; k < runs; k++) {
-    /* Reset particle to initial setup */
-    pi_vec = test_part;
-    for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i];
-
-    /* Setup arrays for vector interaction. */
-    for (size_t i = 0; i < count; i++) {
-      /* Compute the pairwise distance. */
-      float r2 = 0.0f;
-      float dx[3];
-      for (int k = 0; k < 3; k++) {
-        dx[k] = pi_vec.x[k] - pj_vec[i].x[k];
-        r2 += dx[k] * dx[k];
-      }
-
-#ifdef WITH_VECTORIZATION
-      r2q[i] = r2;
-      dxq[i] = dx[0];
-      hiq[i] = pi_vec.h;
-      piq[i] = &pi_vec;
-      pjq[i] = &pj_vec[i];
-
-      dyq[i] = dx[1];
-      dzq[i] = dx[2];
-      mjq[i] = pj_vec[i].mass;
-      vixq[i] = pi_vec.v[0];
-      viyq[i] = pi_vec.v[1];
-      vizq[i] = pi_vec.v[2];
-      vjxq[i] = pj_vec[i].v[0];
-      vjyq[i] = pj_vec[i].v[1];
-      vjzq[i] = pj_vec[i].v[2];
-#endif
-    }
-
-    /* Only dump data on first run. */
-    if (k == 0) {
-#ifdef WITH_VECTORIZATION
-      /* Dump state of particles before vector interaction. */
-      dump_indv_particle_fields(vec_filename, piq[0]);
-      for (size_t i = 0; i < count; i++)
-        dump_indv_particle_fields(vec_filename, pjq[i]);
-#endif
-    }
-
-/* Perform vector interaction. */
-#ifdef WITH_VECTORIZATION
-    vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec;
-    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
-        curlvySum, curlvzSum;
-
-    rhoSum.v = vec_set1(0.f);
-    rho_dhSum.v = vec_set1(0.f);
-    wcountSum.v = vec_set1(0.f);
-    wcount_dhSum.v = vec_set1(0.f);
-    div_vSum.v = vec_set1(0.f);
-    curlvxSum.v = vec_set1(0.f);
-    curlvySum.v = vec_set1(0.f);
-    curlvzSum.v = vec_set1(0.f);
-
-    hi_vec.v = vec_load(&hiq[0]);
-    vix_vec.v = vec_load(&vixq[0]);
-    viy_vec.v = vec_load(&viyq[0]);
-    viz_vec.v = vec_load(&vizq[0]);
-
-    hi_inv_vec = vec_reciprocal(hi_vec);
-
-    mask_t mask;
-    vec_init_mask_true(mask);
-#if (NUM_VEC_PROC_INT == 2)
-    mask_t mask2;
-    vec_init_mask_true(mask2);
-#endif
-    const ticks vec_tic = getticks();
-
-    for (size_t i = 0; i < count; i += NUM_VEC_PROC_INT * VEC_SIZE) {
-
-/* Interleave two vectors for interaction. */
-#if (NUM_VEC_PROC_INT == 2)
-      IACT_VEC(&(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec),
-               (vix_vec), (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]),
-               &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, &wcountSum,
-               &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum,
-               mask, mask2, 0);
-#else /* Only use one vector for interaction. */
-      vector r2, dx, dy, dz;
-      r2.v = vec_load(&(r2q[i]));
-      dx.v = vec_load(&(dxq[i]));
-      dy.v = vec_load(&(dyq[i]));
-      dz.v = vec_load(&(dzq[i]));
-
-      IACT_VEC(&r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec),
-               (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]),
-               &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
-               &curlvxSum, &curlvySum, &curlvzSum, mask);
-#endif
-    }
-
-    VEC_HADD(rhoSum, piq[0]->rho);
-    VEC_HADD(rho_dhSum, piq[0]->density.rho_dh);
-    VEC_HADD(wcountSum, piq[0]->density.wcount);
-    VEC_HADD(wcount_dhSum, piq[0]->density.wcount_dh);
-    VEC_HADD(div_vSum, piq[0]->density.div_v);
-    VEC_HADD(curlvxSum, piq[0]->density.rot_v[0]);
-    VEC_HADD(curlvySum, piq[0]->density.rot_v[1]);
-    VEC_HADD(curlvzSum, piq[0]->density.rot_v[2]);
-
-    vec_time += getticks() - vec_tic;
-#endif
-  }
-
-  file = fopen(vec_filename, "a");
-  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
-  fclose(file);
-
-#ifdef WITH_VECTORIZATION
-  /* Dump result of serial interaction. */
-  dump_indv_particle_fields(vec_filename, piq[0]);
-  for (size_t i = 0; i < count; i++)
-    dump_indv_particle_fields(vec_filename, pjq[i]);
-#endif
-
-#ifdef WITH_VECTORIZATION
-  /* Check serial results against the vectorised results. */
-  if (check_results(pi_serial, pj_serial, pi_vec, pj_vec, count))
-    message("Differences found...");
-#endif
-
-  message("The serial interactions took     : %15lli ticks.",
-          serial_time / runs);
-#ifdef WITH_VECTORIZATION
-  message("The vectorised interactions took : %15lli ticks.", vec_time / runs);
-  message("Speed up: %15fx.", (double)(serial_time) / vec_time);
-#endif
-}
-
-/* And go... */
-int main(int argc, char *argv[]) {
-  size_t runs = 10000;
-  double h = 1.0, spacing = 0.5;
-  double offset[3] = {0.0, 0.0, 0.0};
-  size_t count = 256;
-
-  /* Get some randomness going */
-  srand(0);
-
-  char c;
-  while ((c = getopt(argc, argv, "h:s:n:r:")) != -1) {
-    switch (c) {
-      case 'h':
-        sscanf(optarg, "%lf", &h);
-        break;
-      case 's':
-        sscanf(optarg, "%lf", &spacing);
-      case 'n':
-        sscanf(optarg, "%zu", &count);
-        break;
-      case 'r':
-        sscanf(optarg, "%zu", &runs);
-        break;
-      case '?':
-        error("Unknown option.");
-        break;
-    }
-  }
-
-  if (h < 0 || spacing < 0) {
-    printf(
-        "\nUsage: %s [OPTIONS...]\n"
-        "\nGenerates a particle array with equal particle separation."
-        "\nThese are then interacted using runner_iact_density and "
-        "runner_iact_vec_density."
-        "\n\nOptions:"
-        "\n-h DISTANCE=1.2348 - Smoothing length in units of <x>"
-        "\n-s SPACING=0.5     - Spacing between particles"
-        "\n-n NUMBER=9        - No. of particles",
-        argv[0]);
-    exit(1);
-  }
-
-  /* Correct count so that VEC_SIZE of particles interact with the test
-   * particle. */
-  count = count - (count % VEC_SIZE) + 1;
-
-  /* Build the infrastructure */
-  static long long partId = 0;
-  struct part test_particle;
-  struct part *particles = make_particles(count, offset, spacing, h, &partId);
-
-#if defined(NONSYM_FORCE) || defined(SYM_FORCE)
-  prepare_force(particles, count);
-#endif
-
-  test_particle = particles[0];
-  /* Call the non-sym density test. */
-  message("Testing %s interaction...", IACT_NAME);
-  test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs);
-
-  return 0;
-}
diff --git a/tests/test125cells.c b/tests/test125cells.c
index 2b35f7b5d509a7f966d07d6da82616869f29ab66..87325b4e9a2e254a2549b93b1031bf8af4538295 100644
--- a/tests/test125cells.c
+++ b/tests/test125cells.c
@@ -31,6 +31,18 @@
 /* Local headers. */
 #include "swift.h"
 
+#if defined(WITH_VECTORIZATION)
+#define DOSELF2 runner_doself2_force_vec
+#define DOSELF2_NAME "runner_doself2_force_vec"
+#define DOPAIR2_NAME "runner_dopair2_force"
+#endif
+
+#ifndef DOSELF2
+#define DOSELF2 runner_doself2_force
+#define DOSELF2_NAME "runner_doself2_density"
+#define DOPAIR2_NAME "runner_dopair2_force"
+#endif
+
 enum velocity_field {
   velocity_zero,
   velocity_const,
@@ -436,6 +448,7 @@ void runner_dopair1_branch_density(struct runner *r, struct cell *ci,
 void runner_doself1_density(struct runner *r, struct cell *ci);
 void runner_dopair2_force(struct runner *r, struct cell *ci, struct cell *cj);
 void runner_doself2_force(struct runner *r, struct cell *ci);
+void runner_doself2_force_vec(struct runner *r, struct cell *ci);
 
 /* And go... */
 int main(int argc, char *argv[]) {
@@ -516,6 +529,8 @@ int main(int argc, char *argv[]) {
   }
 
   /* Help users... */
+  message("DOSELF2 function called: %s", DOSELF2_NAME);
+  message("DOPAIR2 function called: %s", DOPAIR2_NAME);
   message("Adiabatic index: ga = %f", hydro_gamma);
   message("Hydro implementation: %s", SPH_IMPLEMENTATION);
   message("Smoothing length: h = %f", h * size);
@@ -602,8 +617,12 @@ int main(int argc, char *argv[]) {
       malloc(main_cell->count * sizeof(struct solution_part));
   get_solution(main_cell, solution, rho, vel, press, size);
 
+  ticks timings[27];
+  for (int i = 0; i < 27; i++) timings[i] = 0;
+
   /* Start the test */
   ticks time = 0;
+  ticks self_force_time = 0;
   for (size_t n = 0; n < runs; ++n) {
 
     const ticks tic = getticks();
@@ -674,6 +693,7 @@ int main(int argc, char *argv[]) {
 /* Do the force calculation */
 #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION))
 
+    int ctr = 0;
     /* Do the pairs (for the central 27 cells) */
     for (int i = 1; i < 4; i++) {
       for (int j = 1; j < 4; j++) {
@@ -681,13 +701,32 @@ int main(int argc, char *argv[]) {
 
           struct cell *cj = cells[i * 25 + j * 5 + k];
 
-          if (main_cell != cj) runner_dopair2_force(&runner, main_cell, cj);
+          if (main_cell != cj) {
+
+            const ticks sub_tic = getticks();
+
+            runner_dopair2_force(&runner, main_cell, cj);
+
+            const ticks sub_toc = getticks();
+            timings[ctr++] += sub_toc - sub_tic;
+          }
         }
       }
     }
 
+#ifdef WITH_VECTORIZATION
+    /* Initialise the cache. */
+    runner.ci_cache.count = 0;
+    cache_init(&runner.ci_cache, 512);
+#endif
+
+    ticks self_tic = getticks();
+
     /* And now the self-interaction for the main cell */
-    runner_doself2_force(&runner, main_cell);
+    DOSELF2(&runner, main_cell);
+
+    self_force_time += getticks() - self_tic;
+    timings[26] += getticks() - self_tic;
 #endif
 
     /* Finally, give a gentle kick */
@@ -702,7 +741,6 @@ int main(int argc, char *argv[]) {
       dump_particle_fields(outputFileName, main_cell, solution, 0);
     }
 
-    /* Reset stuff */
     for (int i = 0; i < 125; ++i) {
       for (int n = 0; n < cells[i]->count; ++n)
         hydro_init_part(&cells[i]->parts[n], &space.hs);
@@ -710,7 +748,22 @@ int main(int argc, char *argv[]) {
   }
 
   /* Output timing */
-  message("SWIFT calculation took       : %15lli ticks.", time / runs);
+  ticks corner_time = timings[0] + timings[2] + timings[6] + timings[8] +
+                      timings[17] + timings[19] + timings[23] + timings[25];
+
+  ticks edge_time = timings[1] + timings[3] + timings[5] + timings[7] +
+                    timings[9] + timings[11] + timings[14] + timings[16] +
+                    timings[18] + timings[20] + timings[22] + timings[24];
+
+  ticks face_time = timings[4] + timings[10] + timings[12] + timings[13] +
+                    timings[15] + timings[21];
+
+  message("Corner calculations took       : %15lli ticks.", corner_time / runs);
+  message("Edge calculations took         : %15lli ticks.", edge_time / runs);
+  message("Face calculations took         : %15lli ticks.", face_time / runs);
+  message("Self calculations took         : %15lli ticks.",
+          self_force_time / runs);
+  message("SWIFT calculation took         : %15lli ticks.", time / runs);
 
   for (int j = 0; j < 125; ++j)
     reset_particles(cells[j], &space.hs, vel, press, size, rho);
diff --git a/tests/testInteractions.c b/tests/testInteractions.c
index 54d1f38733a1f1647331166f1a37b40ed3511419..5bf036b17dce6755fc2b6f77aaea64851d613bb1 100644
--- a/tests/testInteractions.c
+++ b/tests/testInteractions.c
@@ -125,23 +125,28 @@ void dump_indv_particle_fields(char *fileName, struct part *p) {
 
   FILE *file = fopen(fileName, "a");
 
-  /* Write header */
   fprintf(file,
-          "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e "
-          "%13e %13e %13e\n",
-          p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2],
+          "%6llu %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f "
+          "%8.5f "
+          "%8.5f %8.5f %13e %13e %13e %13e %13e %8.5f %8.5f\n",
+          p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], p->h,
           hydro_get_density(p),
-#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
+#if defined(MINIMAL_SPH) || defined(SHADOWFAX_SPH)
           0.f,
 #else
-          p->density.rho_dh,
+          p->density.div_v,
 #endif
-          p->density.wcount, p->density.wcount_dh,
-#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH)
-          p->density.div_v, p->density.rot_v[0], p->density.rot_v[1],
-          p->density.rot_v[2]
+          hydro_get_entropy(p), hydro_get_internal_energy(p),
+          hydro_get_pressure(p), hydro_get_soundspeed(p), p->a_hydro[0],
+          p->a_hydro[1], p->a_hydro[2], p->force.h_dt,
+#if defined(GADGET2_SPH)
+          p->force.v_sig, p->entropy_dt, 0.f
+#elif defined(DEFAULT_SPH)
+          p->force.v_sig, 0.f, p->force.u_dt
+#elif defined(MINIMAL_SPH)
+          p->force.v_sig, 0.f, p->u_dt
 #else
-          0., 0., 0., 0.
+          0.f, 0.f, 0.f
 #endif
           );
 
@@ -156,10 +161,12 @@ void write_header(char *fileName) {
   FILE *file = fopen(fileName, "w");
   /* Write header */
   fprintf(file,
-          "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s "
-          "%13s %13s %13s\n",
-          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh",
-          "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz");
+          "# %4s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %13s %13s "
+          "%13s %13s %13s %8s %8s\n",
+          "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "h", "rho",
+          "div_v", "S", "u", "P", "c", "a_x", "a_y", "a_z", "h_dt", "v_sig",
+          "dS/dt", "du/dt");
+
   fclose(file);
 }
 
@@ -212,7 +219,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
   strcpy(serial_filename, filePrefix);
   strcpy(vec_filename, filePrefix);
   sprintf(serial_filename + strlen(serial_filename), "_serial.dat");
-  sprintf(vec_filename + strlen(vec_filename), "_vec.dat");
+  sprintf(vec_filename + strlen(vec_filename), "_%d_vec.dat", num_vec_proc);
 
   write_header(serial_filename);
   write_header(vec_filename);
@@ -388,6 +395,265 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
   message("Speed up: %15fx.", (double)(serial_time) / vec_time);
 }
 
+/*
+ * @brief Calls the serial and vectorised version of the non-symmetrical force
+ * interaction.
+ *
+ * @param test_part Particle that will be updated
+ * @param parts Particle array to be interacted
+ * @param count No. of particles to be interacted
+ * @param serial_inter_func Serial interaction function to be called
+ * @param vec_inter_func Vectorised interaction function to be called
+ * @param runs No. of times to call interactions
+ *
+ */
+void test_force_interactions(struct part test_part, struct part *parts,
+                             size_t count, char *filePrefix, int runs,
+                             int num_vec_proc) {
+
+  ticks serial_time = 0;
+  ticks vec_time = 0;
+
+  FILE *file;
+  char serial_filename[200] = "";
+  char vec_filename[200] = "";
+
+  strcpy(serial_filename, filePrefix);
+  strcpy(vec_filename, filePrefix);
+  sprintf(serial_filename + strlen(serial_filename), "_serial.dat");
+  sprintf(vec_filename + strlen(vec_filename), "_%d_vec.dat", num_vec_proc);
+
+  write_header(serial_filename);
+  write_header(vec_filename);
+
+  struct part pi_serial, pi_vec;
+  struct part pj_serial[count], pj_vec[count];
+
+  float r2[count] __attribute__((aligned(array_align)));
+  float dx[3 * count] __attribute__((aligned(array_align)));
+
+  struct part *piq[count], *pjq[count];
+  for (size_t k = 0; k < count; k++) {
+    piq[k] = NULL;
+    pjq[k] = NULL;
+  }
+
+  float r2q[count] __attribute__((aligned(array_align)));
+  float dxq[count] __attribute__((aligned(array_align)));
+  float dyq[count] __attribute__((aligned(array_align)));
+  float dzq[count] __attribute__((aligned(array_align)));
+
+  float hiq[count] __attribute__((aligned(array_align)));
+  float vixq[count] __attribute__((aligned(array_align)));
+  float viyq[count] __attribute__((aligned(array_align)));
+  float vizq[count] __attribute__((aligned(array_align)));
+  float rhoiq[count] __attribute__((aligned(array_align)));
+  float grad_hiq[count] __attribute__((aligned(array_align)));
+  float pOrhoi2q[count] __attribute__((aligned(array_align)));
+  float balsaraiq[count] __attribute__((aligned(array_align)));
+  float ciq[count] __attribute__((aligned(array_align)));
+
+  float hj_invq[count] __attribute__((aligned(array_align)));
+  float mjq[count] __attribute__((aligned(array_align)));
+  float vjxq[count] __attribute__((aligned(array_align)));
+  float vjyq[count] __attribute__((aligned(array_align)));
+  float vjzq[count] __attribute__((aligned(array_align)));
+  float rhojq[count] __attribute__((aligned(array_align)));
+  float grad_hjq[count] __attribute__((aligned(array_align)));
+  float pOrhoj2q[count] __attribute__((aligned(array_align)));
+  float balsarajq[count] __attribute__((aligned(array_align)));
+  float cjq[count] __attribute__((aligned(array_align)));
+
+  /* Call serial interaction a set number of times. */
+  for (int k = 0; k < runs; k++) {
+    /* Reset particle to initial setup */
+    pi_serial = test_part;
+    for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i];
+
+    /* Only dump data on first run. */
+    if (k == 0) {
+      /* Dump state of particles before serial interaction. */
+      dump_indv_particle_fields(serial_filename, &pi_serial);
+      for (size_t i = 0; i < count; i++)
+        dump_indv_particle_fields(serial_filename, &pj_serial[i]);
+    }
+
+    /* Perform serial interaction */
+    for (size_t i = 0; i < count; i++) {
+      /* Compute the pairwise distance. */
+      r2[i] = 0.0f;
+      for (int k = 0; k < 3; k++) {
+        int ind = (3 * i) + k;
+        dx[ind] = pi_serial.x[k] - pj_serial[i].x[k];
+        r2[i] += dx[ind] * dx[ind];
+      }
+    }
+
+    const ticks tic = getticks();
+/* Perform serial interaction */
+#ifdef __ICC
+#pragma novector
+#endif
+    for (size_t i = 0; i < count; i++) {
+      runner_iact_nonsym_force(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h,
+                               &pi_serial, &pj_serial[i]);
+    }
+    serial_time += getticks() - tic;
+  }
+
+  file = fopen(serial_filename, "a");
+  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
+  fclose(file);
+
+  /* Dump result of serial interaction. */
+  dump_indv_particle_fields(serial_filename, &pi_serial);
+  for (size_t i = 0; i < count; i++)
+    dump_indv_particle_fields(serial_filename, &pj_serial[i]);
+
+  /* Call vector interaction a set number of times. */
+  for (int k = 0; k < runs; k++) {
+    /* Reset particle to initial setup */
+    pi_vec = test_part;
+    for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i];
+
+    /* Setup arrays for vector interaction. */
+    for (size_t i = 0; i < count; i++) {
+      /* Compute the pairwise distance. */
+      float r2 = 0.0f;
+      float dx[3];
+      for (int k = 0; k < 3; k++) {
+        dx[k] = pi_vec.x[k] - pj_vec[i].x[k];
+        r2 += dx[k] * dx[k];
+      }
+
+      piq[i] = &pi_vec;
+      pjq[i] = &pj_vec[i];
+
+      r2q[i] = r2;
+      dxq[i] = dx[0];
+      dyq[i] = dx[1];
+      dzq[i] = dx[2];
+
+      hiq[i] = pi_vec.h;
+      vixq[i] = pi_vec.v[0];
+      viyq[i] = pi_vec.v[1];
+      vizq[i] = pi_vec.v[2];
+      rhoiq[i] = pi_vec.rho;
+      grad_hiq[i] = pi_vec.force.f;
+      pOrhoi2q[i] = pi_vec.force.P_over_rho2;
+      balsaraiq[i] = pi_vec.force.balsara;
+      ciq[i] = pi_vec.force.soundspeed;
+
+      hj_invq[i] = 1.f / pj_vec[i].h;
+      mjq[i] = pj_vec[i].mass;
+      vjxq[i] = pj_vec[i].v[0];
+      vjyq[i] = pj_vec[i].v[1];
+      vjzq[i] = pj_vec[i].v[2];
+      rhojq[i] = pj_vec[i].rho;
+      grad_hjq[i] = pj_vec[i].force.f;
+      pOrhoj2q[i] = pj_vec[i].force.P_over_rho2;
+      balsarajq[i] = pj_vec[i].force.balsara;
+      cjq[i] = pj_vec[i].force.soundspeed;
+    }
+
+    /* Only dump data on first run. */
+    if (k == 0) {
+      /* Dump state of particles before vector interaction. */
+      dump_indv_particle_fields(vec_filename, piq[0]);
+      for (size_t i = 0; i < count; i++)
+        dump_indv_particle_fields(vec_filename, pjq[i]);
+    }
+
+    /* Perform vector interaction. */
+    vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec, rhoi_vec, grad_hi_vec,
+        pOrhoi2_vec, balsara_i_vec, ci_vec;
+    vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum,
+        entropy_dtSum;
+
+    a_hydro_xSum.v = vec_setzero();
+    a_hydro_ySum.v = vec_setzero();
+    a_hydro_zSum.v = vec_setzero();
+    h_dtSum.v = vec_setzero();
+    v_sigSum.v = vec_setzero();
+    entropy_dtSum.v = vec_setzero();
+
+    hi_vec.v = vec_load(&hiq[0]);
+    vix_vec.v = vec_load(&vixq[0]);
+    viy_vec.v = vec_load(&viyq[0]);
+    viz_vec.v = vec_load(&vizq[0]);
+    rhoi_vec.v = vec_load(&rhoiq[0]);
+    grad_hi_vec.v = vec_load(&grad_hiq[0]);
+    pOrhoi2_vec.v = vec_load(&pOrhoi2q[0]);
+    balsara_i_vec.v = vec_load(&balsaraiq[0]);
+    ci_vec.v = vec_load(&ciq[0]);
+
+    hi_inv_vec = vec_reciprocal(hi_vec);
+
+    mask_t mask, mask2;
+    vec_init_mask_true(mask);
+    vec_init_mask_true(mask2);
+
+    const ticks vec_tic = getticks();
+
+    for (size_t i = 0; i < count; i += num_vec_proc * VEC_SIZE) {
+
+      if (num_vec_proc == 2) {
+        runner_iact_nonsym_2_vec_force(
+            &(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (vix_vec), (viy_vec),
+            (viz_vec), rhoi_vec, grad_hi_vec, pOrhoi2_vec, balsara_i_vec,
+            ci_vec, &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(rhojq[i]),
+            &(grad_hjq[i]), &(pOrhoj2q[i]), &(balsarajq[i]), &(cjq[i]),
+            &(mjq[i]), hi_inv_vec, &(hj_invq[i]), &a_hydro_xSum, &a_hydro_ySum,
+            &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, mask, mask2, 0);
+      } else { /* Only use one vector for interaction. */
+
+        vector r2, dx, dy, dz, hj, hj_inv;
+        r2.v = vec_load(&(r2q[i]));
+        dx.v = vec_load(&(dxq[i]));
+        dy.v = vec_load(&(dyq[i]));
+        dz.v = vec_load(&(dzq[i]));
+        hj.v = vec_load(&hj_invq[i]);
+        hj_inv = vec_reciprocal(hj);
+
+        runner_iact_nonsym_1_vec_force(
+            &r2, &dx, &dy, &dz, vix_vec, viy_vec, viz_vec, rhoi_vec,
+            grad_hi_vec, pOrhoi2_vec, balsara_i_vec, ci_vec, &(vjxq[i]),
+            &(vjyq[i]), &(vjzq[i]), &(rhojq[i]), &(grad_hjq[i]), &(pOrhoj2q[i]),
+            &(balsarajq[i]), &(cjq[i]), &(mjq[i]), hi_inv_vec, hj_inv,
+            &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum,
+            &entropy_dtSum, mask);
+      }
+    }
+
+    VEC_HADD(a_hydro_xSum, piq[0]->a_hydro[0]);
+    VEC_HADD(a_hydro_ySum, piq[0]->a_hydro[1]);
+    VEC_HADD(a_hydro_zSum, piq[0]->a_hydro[2]);
+    VEC_HADD(h_dtSum, piq[0]->force.h_dt);
+    VEC_HMAX(v_sigSum, piq[0]->force.v_sig);
+    VEC_HADD(entropy_dtSum, piq[0]->entropy_dt);
+
+    vec_time += getticks() - vec_tic;
+  }
+
+  file = fopen(vec_filename, "a");
+  fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n");
+  fclose(file);
+
+  /* Dump result of serial interaction. */
+  dump_indv_particle_fields(vec_filename, piq[0]);
+  for (size_t i = 0; i < count; i++)
+    dump_indv_particle_fields(vec_filename, pjq[i]);
+
+  /* Check serial results against the vectorised results. */
+  if (check_results(pi_serial, pj_serial, pi_vec, pj_vec, count))
+    message("Differences found...");
+
+  message("The serial interactions took     : %15lli ticks.",
+          serial_time / runs);
+  message("The vectorised interactions took : %15lli ticks.", vec_time / runs);
+  message("Speed up: %15fx.", (double)(serial_time) / vec_time);
+}
+
 /* And go... */
 int main(int argc, char *argv[]) {
   size_t runs = 10000;
@@ -449,6 +715,13 @@ int main(int argc, char *argv[]) {
   test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs,
                     2);
 
+  prepare_force(particles, count);
+
+  test_force_interactions(test_particle, &particles[1], count - 1,
+                          "test_nonsym_force", runs, 1);
+  test_force_interactions(test_particle, &particles[1], count - 1,
+                          "test_nonsym_force", runs, 2);
+
   return 0;
 }
 
diff --git a/tests/testInteractions.sh.in b/tests/testInteractions.sh.in
index 4b002c56e37eff417c673ddac2e44b3edf17683a..2a0bba267dd3ba8468177ec5f91af108aba0f1e5 100644
--- a/tests/testInteractions.sh.in
+++ b/tests/testInteractions.sh.in
@@ -2,7 +2,7 @@
 
 echo ""
 
-rm -f test_nonsym_density_serial.dat test_nonsym_density_vec.dat
+rm -f test_nonsym_density_serial.dat test_nonsym_density_1_vec.dat test_nonsym_density_2_vec.dat test_nonsym_force_1_vec.dat test_nonsym_force_2_vec.dat
 
 echo "Running ./testInteractions"
 
@@ -13,15 +13,42 @@ if [ $? != 0 ]; then
 else
   if [ -e test_nonsym_density_serial.dat ]
   then
-    if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_vec.dat @srcdir@/tolerance_testInteractions.dat
+    if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_1_vec.dat @srcdir@/tolerance_testInteractions.dat
     then
-      echo "Accuracy test passed"
+      echo "Calculating density using 1 vector accuracy test passed"
     else
-      echo "Accuracy test failed"
+      echo "Calculating density using 1 vector accuracy test failed"
+      exit 1
+    fi
+    if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_2_vec.dat @srcdir@/tolerance_testInteractions.dat
+    then
+      echo "Calculating density using 2 vectors accuracy test passed"
+    else
+      echo "Calculating density using 2 vectors accuracy test failed"
+      exit 1
+    fi
+  else
+    echo "Error Missing density test output file"
+    exit 1
+  fi
+  if [ -e test_nonsym_force_serial.dat ]
+  then
+    if python @srcdir@/difffloat.py test_nonsym_force_serial.dat test_nonsym_force_1_vec.dat @srcdir@/tolerance_testInteractions.dat
+    then
+      echo "Calculating force using 1 vector accuracy test passed"
+    else
+      echo "Calculating force using 1 vector accuracy test failed"
+      exit 1
+    fi
+    if python @srcdir@/difffloat.py test_nonsym_force_serial.dat test_nonsym_force_2_vec.dat @srcdir@/tolerance_testInteractions.dat
+    then
+      echo "Calculating force using 2 vectors accuracy test passed"
+    else
+      echo "Calculating force using 2 vectors accuracy test failed"
       exit 1
     fi
   else
-    echo "Error Missing test output file"
+    echo "Error Missing force test output file"
     exit 1
   fi
 fi
diff --git a/tests/tolerance_testInteractions.dat b/tests/tolerance_testInteractions.dat
index ebb376bf26bfdc0fb2107ab720bbf9eca5a35bce..0f11d03507b23c76b5703e118eede1359fe2afba 100644
--- a/tests/tolerance_testInteractions.dat
+++ b/tests/tolerance_testInteractions.dat
@@ -1,4 +1,4 @@
-#   ID      pos_x      pos_y      pos_z        v_x        v_y        v_z           rho        rho_dh        wcount     wcount_dh         div_v       curl_vx       curl_vy       curl_vz
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-5	      4e-5	    4e-4       1e-2		 1e-5	     6e-6	   6e-6		 6e-6
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1.2e-4	  1e-4       2e-4		 2e-4	     1e-4	   1e-4	 	 1e-4
-    0	    1e-6       1e-6	  1e-6 	       1e-6 	  1e-6	     1e-6	   1e-6	      1e-6	    1e-6       1e-6		 1e-6	     1e-6	   1e-6		 1e-6
+#   ID    pos_x    pos_y    pos_z      v_x      v_y      v_z        h      rho    div_v        S        u        P        c      a_x      a_y      a_z     h_dt    v_sig    dS/dt    du/dt
+    0	  1e-4	   1e-4	    1e-4       1e-4	1e-4	 1e-4	    1e-4   1e-4	  1e-4	       1e-4	1e-4	 1e-4	  1e-4	 1e-4	  1e-4	   1e-4	   1e-4	   1e-4	    1e-4     1e-4
+    0	  1e-4	   1e-4	    1e-4       1e-4	1e-4	 1e-4	    1e-4   1e-4	  1e-4	       1e-4	1e-4	 1e-4	  1e-4	 1e-4	  1e-4	   1e-4	   1e-4	   1e-4	    1e-4     1e-4
+    0	  1e-6	   1e-6	    1e-6       1e-6	1e-6	 1e-6	    1e-6   1e-6	  1e-6	       1e-6	1e-6	 1e-6	  1e-6	 1e-5	  1e-5	   1e-5	   1e-5	   1e-5	    1e-5     1e-5