diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index ba34b070ecebee0f5f88bed99303f7866fd8e5c9..c37d2f52c6332a9328f47eae49602c68de39d070 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -27,7 +27,31 @@
 /* This object's header. */
 #include "runner_doiact_vec.h"
 
-void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
+#ifdef WITH_VECTORIZATION
+static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);
+
+//static void printFloatVector(vector v, char *label, int length) {
+//
+//  int i;
+//  printf("%s:[", label);
+//  for (i = 0; i < length; i++) {
+//    printf("%f, ", v.f[i]);
+//  }
+//  printf("]\n");
+//}
+
+//static void printIntVector(vector v, char *label, int length) {
+//
+//  int i;
+//  printf("%s:[", label);
+//  for (i = 0; i < length; i++) {
+//    printf("%d, ", v.i[i]);
+//  }
+//  printf("]\n");
+//}
+
+#endif
+
 #ifdef WITH_VECTORIZATION
 /**
  * @brief Compute the vector remainder interactions from the secondary cache.
@@ -263,6 +287,303 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
   }
 }
 
+/**
+ * @brief Compute the vector remainder interactions from the secondary cache.
+ *
+ * @param int_cache (return) secondary #cache of interactions between two
+ * particles.
+ * @param icount Interaction count.
+ * @param rhoSum (return) #vector holding the cumulative sum of the density
+ * update on pi.
+ * @param rho_dhSum (return) #vector holding the cumulative sum of the density
+ * gradient update on pi.
+ * @param wcountSum (return) #vector holding the cumulative sum of the wcount
+ * update on pi.
+ * @param wcount_dhSum (return) #vector holding the cumulative sum of the wcount
+ * gradient update on pi.
+ * @param div_vSum (return) #vector holding the cumulative sum of the divergence
+ * update on pi.
+ * @param curlvxSum (return) #vector holding the cumulative sum of the curl of
+ * vx update on pi.
+ * @param curlvySum (return) #vector holding the cumulative sum of the curl of
+ * vy update on pi.
+ * @param curlvzSum (return) #vector holding the cumulative sum of the curl of
+ * vz update on pi.
+ * @param v_hi_inv #vector of 1/h for pi.
+ * @param v_vix #vector of x velocity of pi.
+ * @param v_viy #vector of y velocity of pi.
+ * @param v_viz #vector of z velocity of pi.
+ * @param icount_align (return) Interaction count after the remainder
+ * interactions have been performed, should be a multiple of the vector length.
+ */
+__attribute__((always_inline)) INLINE static void calcRemForceInteractions(
+    struct c2_cache *const int_cache, const int icount, vector *a_hydro_xSum,
+    vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum,
+    vector *v_sigSum, vector *entropy_dtSum,
+    vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz,
+    vector v_rhoi, vector v_grad_hi, vector v_pOrhoi2, vector v_balsara_i, vector v_ci,
+    int *icount_align) {
+
+#ifdef HAVE_AVX512_F
+  KNL_MASK_16 knl_mask, knl_mask2;
+#endif
+  vector int_mask;//, int_mask2;
+
+  /* Work out the number of remainder interactions and pad secondary cache. */
+  *icount_align = icount;
+  int rem = icount % (1 * VEC_SIZE);
+  if (rem != 0) {
+    int pad = (1 * VEC_SIZE) - rem;
+    *icount_align += pad;
+
+/* Initialise masks to true. */
+#ifdef HAVE_AVX512_F
+    knl_mask = 0xFFFF;
+    knl_mask2 = 0xFFFF;
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
+#else
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    //int_mask2.m = vec_setint1(0xFFFFFFFF);
+#endif
+    /* Pad secondary cache so that there are no contributions in the interaction
+     * function. */
+    for (int i = icount; i < *icount_align; i++) {
+      int_cache->mq[i] = 0.f;
+      int_cache->r2q[i] = 1.f;
+      int_cache->dxq[i] = 0.f;
+      int_cache->dyq[i] = 0.f;
+      int_cache->dzq[i] = 0.f;
+      int_cache->vxq[i] = 0.f;
+      int_cache->vyq[i] = 0.f;
+      int_cache->vzq[i] = 0.f;
+      int_cache->rhoq[i] = 1.f;
+      int_cache->grad_hq[i] = 1.f;
+      int_cache->pOrho2q[i] = 1.f;
+      int_cache->balsaraq[i] = 1.f;
+      int_cache->soundspeedq[i] = 1.f;
+      int_cache->h_invq[i] = 1.f;
+    }
+
+    /* Zero parts of mask that represent the padded values.*/
+    if (pad < VEC_SIZE) {
+#ifdef HAVE_AVX512_F
+      knl_mask2 = knl_mask2 >> pad;
+#else
+      for (int i = VEC_SIZE - pad; i < VEC_SIZE; i++) int_mask.i[i] = 0;
+#endif
+    } else {
+#ifdef HAVE_AVX512_F
+      knl_mask = knl_mask >> (VEC_SIZE - rem);
+      knl_mask2 = 0;
+#else
+      error("Pad should not be greater than VEC_SIZE.");
+      for (int i = rem; i < VEC_SIZE; i++) int_mask.i[i] = 0;
+      //int_mask2.v = vec_setzero();
+#endif
+    }
+
+    /* Perform remainder interaction and remove remainder from aligned
+     * interaction count. */
+    *icount_align = icount - rem;
+
+    vector r2, dx, dy, dz;
+    vector vjx, vjy, vjz, mj, rhoj, grad_hj, pOrhoj2, balsara_j, cj, hj_inv;
+    r2.v = vec_load(&int_cache->r2q[*icount_align]);
+    dx.v = vec_load(&int_cache->dxq[*icount_align]);
+    dy.v = vec_load(&int_cache->dyq[*icount_align]);
+    dz.v = vec_load(&int_cache->dzq[*icount_align]);
+    vjx.v = vec_load(&int_cache->vxq[*icount_align]);
+    vjy.v = vec_load(&int_cache->vyq[*icount_align]);
+    vjz.v = vec_load(&int_cache->vzq[*icount_align]);
+    mj.v = vec_load(&int_cache->mq[*icount_align]);
+    
+    rhoj.v = vec_load(&int_cache->rhoq[*icount_align]);
+    grad_hj.v = vec_load(&int_cache->grad_hq[*icount_align]);
+    balsara_j.v = vec_load(&int_cache->balsaraq[*icount_align]);
+    cj.v = vec_load(&int_cache->soundspeedq[*icount_align]);
+    hj_inv.v = vec_load(&int_cache->h_invq[*icount_align]);
+
+    runner_iact_nonsym_1_vec_force_2(
+        &r2, &dx, &dy, &dz, &v_vix, &v_viy, &v_viz, &v_rhoi, &v_grad_hi, &v_pOrhoi2, &v_balsara_i, &v_ci,
+        &vjx, &vjy, &vjz, &rhoj, &grad_hj, &pOrhoj2, &balsara_j, &cj, &mj, v_hi_inv, hj_inv,
+        a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask
+#ifdef HAVE_AVX512_F
+        ,knl_mask, knl_mask2);
+#else
+        );
+#endif
+  }
+}
+
+/**
+ * @brief Left-packs the values needed by an interaction into the secondary
+ * cache (Supports AVX, AVX2 and AVX512 instruction sets).
+ *
+ * @param mask Contains which particles need to interact.
+ * @param pjd Index of the particle to store into.
+ * @param v_r2 #vector of the separation between two particles squared.
+ * @param v_dx #vector of the x separation between two particles.
+ * @param v_dy #vector of the y separation between two particles.
+ * @param v_dz #vector of the z separation between two particles.
+ * @param v_mj #vector of the mass of particle pj.
+ * @param v_vjx #vector of x velocity of pj.
+ * @param v_vjy #vector of y velocity of pj.
+ * @param v_vjz #vector of z velocity of pj.
+ * @param cell_cache #cache of all particles in the cell.
+ * @param int_cache (return) secondary #cache of interactions between two
+ * particles.
+ * @param icount Interaction count.
+ * @param rhoSum #vector holding the cumulative sum of the density update on pi.
+ * @param rho_dhSum #vector holding the cumulative sum of the density gradient
+ * update on pi.
+ * @param wcountSum #vector holding the cumulative sum of the wcount update on
+ * pi.
+ * @param wcount_dhSum #vector holding the cumulative sum of the wcount gradient
+ * update on pi.
+ * @param div_vSum #vector holding the cumulative sum of the divergence update
+ * on pi.
+ * @param curlvxSum #vector holding the cumulative sum of the curl of vx update
+ * on pi.
+ * @param curlvySum #vector holding the cumulative sum of the curl of vy update
+ * on pi.
+ * @param curlvzSum #vector holding the cumulative sum of the curl of vz update
+ * on pi.
+ * @param v_hi_inv #vector of 1/h for pi.
+ * @param v_vix #vector of x velocity of pi.
+ * @param v_viy #vector of y velocity of pi.
+ * @param v_viz #vector of z velocity of pi.
+ */
+__attribute__((always_inline)) INLINE static void storeForceInteractions(
+    const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
+    vector *v_dz, vector *v_mj, vector *v_vjx, vector *v_vjy, vector *v_vjz,
+    vector *v_rhoj, vector *v_grad_hj, vector *v_pOrhoj2, vector *v_balsara_j, vector *v_cj, vector *v_hj_inv,
+    const struct cache *const cell_cache, struct c2_cache *const int_cache,
+    int *icount, vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum,
+    vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum,
+    vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz, vector *v_rhoi, vector *v_grad_hi, vector *v_pOrhoi2, vector *v_balsara_i, vector *v_ci) {
+
+/* Left-pack values needed into the secondary cache using the interaction mask.
+ */
+#if defined(HAVE_AVX2) || defined(HAVE_AVX512_F)
+  int pack = 0;
+
+#ifdef HAVE_AVX512_F
+  pack += __builtin_popcount(mask);
+  VEC_LEFT_PACK(v_r2->v, mask, &int_cache->r2q[*icount]);
+  VEC_LEFT_PACK(v_dx->v, mask, &int_cache->dxq[*icount]);
+  VEC_LEFT_PACK(v_dy->v, mask, &int_cache->dyq[*icount]);
+  VEC_LEFT_PACK(v_dz->v, mask, &int_cache->dzq[*icount]);
+  VEC_LEFT_PACK(v_mj->v, mask, &int_cache->mq[*icount]);
+  VEC_LEFT_PACK(v_vjx->v, mask, &int_cache->vxq[*icount]);
+  VEC_LEFT_PACK(v_vjy->v, mask, &int_cache->vyq[*icount]);
+  VEC_LEFT_PACK(v_vjz->v, mask, &int_cache->vzq[*icount]);
+  
+  VEC_LEFT_PACK(v_rhoj->v, mask, &int_cache->rhoq[*icount]);
+  VEC_LEFT_PACK(v_grad_hj->v, mask, &int_cache->grad_hq[*icount]);
+  VEC_LEFT_PACK(v_pOrhoj2->v, mask, &int_cache->pOrho2q[*icount]);
+  VEC_LEFT_PACK(v_balsara_j->v, mask, &int_cache->balsaraq[*icount]);
+  VEC_LEFT_PACK(v_cj->v, mask, &int_cache->soundspeedq[*icount]);
+  VEC_LEFT_PACK(v_hj_inv->v, mask, &int_cache->h_invq[*icount]);
+#else
+  vector v_mask;
+  VEC_FORM_PACKED_MASK(mask, v_mask.m, pack);
+
+  VEC_LEFT_PACK(v_r2->v, v_mask.m, &int_cache->r2q[*icount]);
+  VEC_LEFT_PACK(v_dx->v, v_mask.m, &int_cache->dxq[*icount]);
+  VEC_LEFT_PACK(v_dy->v, v_mask.m, &int_cache->dyq[*icount]);
+  VEC_LEFT_PACK(v_dz->v, v_mask.m, &int_cache->dzq[*icount]);
+  VEC_LEFT_PACK(v_mj->v, v_mask.m, &int_cache->mq[*icount]);
+  VEC_LEFT_PACK(v_vjx->v, v_mask.m, &int_cache->vxq[*icount]);
+  VEC_LEFT_PACK(v_vjy->v, v_mask.m, &int_cache->vyq[*icount]);
+  VEC_LEFT_PACK(v_vjz->v, v_mask.m, &int_cache->vzq[*icount]);
+
+  VEC_LEFT_PACK(v_rhoj->v, v_mask.m, &int_cache->rhoq[*icount]);
+  VEC_LEFT_PACK(v_grad_hj->v, v_mask.m, &int_cache->grad_hq[*icount]);
+  VEC_LEFT_PACK(v_pOrhoj2->v, v_mask.m, &int_cache->pOrho2q[*icount]);
+  VEC_LEFT_PACK(v_balsara_j->v, v_mask.m, &int_cache->balsaraq[*icount]);
+  VEC_LEFT_PACK(v_cj->v, v_mask.m, &int_cache->soundspeedq[*icount]);
+  VEC_LEFT_PACK(v_hj_inv->v, v_mask.m, &int_cache->h_invq[*icount]);
+
+#endif /* HAVE_AVX512_F */
+
+  (*icount) += pack;
+#else
+  /* Quicker to do it serially in AVX rather than use intrinsics. */
+  for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
+    if (mask & (1 << bit_index)) {
+      /* Add this interaction to the queue. */
+      int_cache->r2q[*icount] = v_r2->f[bit_index];
+      int_cache->dxq[*icount] = v_dx->f[bit_index];
+      int_cache->dyq[*icount] = v_dy->f[bit_index];
+      int_cache->dzq[*icount] = v_dz->f[bit_index];
+      int_cache->mq[*icount] = cell_cache->m[pjd + bit_index];
+      int_cache->vxq[*icount] = cell_cache->vx[pjd + bit_index];
+      int_cache->vyq[*icount] = cell_cache->vy[pjd + bit_index];
+      int_cache->vzq[*icount] = cell_cache->vz[pjd + bit_index];
+      
+      int_cache->rhoq[*icount] = cell_cache->rho[pjd + bit_index];
+      int_cache->grad_hq[*icount] = cell_cache->grad_h[pjd + bit_index];
+      int_cache->pOrho2q[*icount] = cell_cache->pOrho2[pjd + bit_index];
+      int_cache->balsaraq[*icount] = cell_cache->balsara[pjd + bit_index];
+      int_cache->soundspeedq[*icount] = cell_cache->soundspeed[pjd + bit_index];
+      int_cache->h_invq[*icount] = v_hj_inv->f[bit_index];
+
+      (*icount)++;
+    }
+  }
+
+#endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */
+
+  /* Flush the c2 cache if it has reached capacity. */
+  if (*icount >= (C2_CACHE_SIZE - (1 * VEC_SIZE))) {
+
+    error("Flushing interaction cache...");
+
+    int icount_align = *icount;
+
+    /* Peform remainder interactions. */
+    calcRemForceInteractions(int_cache, *icount, a_hydro_xSum, a_hydro_ySum, a_hydro_zSum,
+                             h_dtSum, v_sigSum, entropy_dtSum, v_hi_inv,
+                             v_vix, v_viy, v_viz, *v_rhoi, *v_grad_hi, *v_pOrhoi2, *v_balsara_i, *v_ci,
+                             &icount_align);
+
+
+    vector int_mask;//, int_mask2;
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    //int_mask2.m = vec_setint1(0xFFFFFFFF);
+
+    /* Perform interactions. */
+    for (int pjd = 0; pjd < icount_align; pjd += (1 * VEC_SIZE)) {
+      vector r2, dx, dy, dz;
+      vector vjx, vjy, vjz, mj, rhoj, grad_hj, pOrhoj2, balsara_j, cj, hj_inv;
+      r2.v = vec_load(&int_cache->r2q[pjd]);
+      dx.v = vec_load(&int_cache->dxq[pjd]);
+      dy.v = vec_load(&int_cache->dyq[pjd]);
+      dz.v = vec_load(&int_cache->dzq[pjd]);
+      vjx.v = vec_load(&int_cache->vxq[pjd]);
+      vjy.v = vec_load(&int_cache->vyq[pjd]);
+      vjz.v = vec_load(&int_cache->vzq[pjd]);
+      mj.v = vec_load(&int_cache->mq[pjd]);
+      
+      rhoj.v = vec_load(&int_cache->rhoq[pjd]);
+      grad_hj.v = vec_load(&int_cache->grad_hq[pjd]);
+      balsara_j.v = vec_load(&int_cache->balsaraq[pjd]);
+      cj.v = vec_load(&int_cache->soundspeedq[pjd]);
+      hj_inv.v = vec_load(&int_cache->h_invq[pjd]);
+
+      runner_iact_nonsym_1_vec_force_2(
+        &r2, &dx, &dy, &dz, &v_vix, &v_viy, &v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci,
+        &vjx, &vjy, &vjz, &rhoj, &grad_hj, &pOrhoj2, &balsara_j, &cj, &mj, v_hi_inv, hj_inv,
+        a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask);
+
+    }
+
+    /* Reset interaction count. */
+    *icount = 0;
+  }
+}
+
 /* @brief Populates the arrays max_di and max_dj with the maximum distances of
  * particles into their neighbouring cells. Also finds the first pi that
  * interacts with any particle in cj and the last pj that interacts with any
@@ -970,6 +1291,777 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec_2(
 #endif /* WITH_VECTORIZATION */
 }
 
+/**
+ * @brief Compute the cell self-interaction (non-symmetric) using vector
+ * intrinsics with one particle pi at a time.
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+__attribute__((always_inline)) INLINE void runner_doself2_force_vec(
+    struct runner *r, struct cell *restrict c) {
+
+#ifdef WITH_VECTORIZATION
+  //static int intCount = 0;
+  const struct engine *e = r->e;
+  int doi_mask;
+  struct part *restrict pi;
+  int count_align;
+  int num_vec_proc = 1;//NUM_VEC_PROC;
+
+  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  vector v_hi, v_hig2, v_r2;
+
+  //TIMER_TIC
+
+  if (!cell_is_active(c, e)) return;
+
+  if (!cell_is_drifted(c, e)) cell_drift_particles(c, e);
+
+  /* Get the particle cache from the runner and re-allocate
+   * the cache if it is not big enough for the cell. */
+  struct cache *restrict cell_cache = &r->ci_cache;
+
+  if (cell_cache->count < count) {
+    cache_init(cell_cache, count);
+  }
+
+  /* Read the particles from the cell and store them locally in the cache. */
+  cache_read_particles(c, cell_cache);
+
+  /* Create secondary cache to store particle interactions. */
+  //struct c2_cache int_cache;
+  //int icount = 0, icount_align = 0;
+
+  /* Loop over the particles in the cell. */
+  for (int pid = 0; pid < count; pid++) {
+
+    /* Get a pointer to the ith particle. */
+    pi = &parts[pid];
+
+    /* Is the ith particle active? */
+    if (!part_is_active(pi, e)) continue;
+
+    vector pix, piy, piz;
+
+    const float hi = cell_cache->h[pid];
+
+    /* Fill particle pi vectors. */
+    pix.v = vec_set1(cell_cache->x[pid]);
+    piy.v = vec_set1(cell_cache->y[pid]);
+    piz.v = vec_set1(cell_cache->z[pid]);
+    v_hi.v = vec_set1(hi);
+
+    const float hig2 = hi * hi * kernel_gamma2;
+    v_hig2.v = vec_set1(hig2);
+
+    /* Reset cumulative sums of update vectors. */
+    vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum;
+
+    /* Get the inverse of hi. */
+    vector v_hi_inv;
+
+    v_hi_inv = vec_reciprocal(v_hi);
+
+    a_hydro_xSum.v = vec_setzero();
+    a_hydro_ySum.v = vec_setzero();
+    a_hydro_zSum.v = vec_setzero();
+    h_dtSum.v = vec_setzero();
+    v_sigSum.v = vec_set1(pi->force.v_sig);
+    entropy_dtSum.v = vec_setzero();
+
+    /* Pad cache if there is a serial remainder. */
+    count_align = count;
+    int rem = count % (num_vec_proc * VEC_SIZE);
+    if (rem != 0) {
+      int pad = (num_vec_proc * VEC_SIZE) - rem;
+
+      count_align += pad;
+
+      /* Set positions to the same as particle pi so when the r2 > 0 mask is
+       * applied these extra contributions are masked out.*/
+      for (int i = count; i < count_align; i++) {
+        cell_cache->x[i] = pix.f[0];
+        cell_cache->y[i] = piy.f[0];
+        cell_cache->z[i] = piz.f[0];
+      }
+    }
+
+    vector pjx, pjy, pjz;
+    //vector pjvx, pjvy, pjvz, mj;
+    vector hj, hjg2;
+    //vector pjx2, pjy2, pjz2;
+    //vector pjvx2, pjvy2, pjvz2, mj2, hj_2, hjg2_2;
+
+    for(int k=0; k<VEC_SIZE; k++)
+      piq[k] = pi;
+
+    /* Find all of particle pi's interacions and store needed values in the
+     * secondary cache.*/
+    for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
+
+      /* Load 2 sets of vectors from the particle cache. */
+      //pjx.v = vec_load(&cell_cache->x[pjd]);
+      //pjy.v = vec_load(&cell_cache->y[pjd]);
+      //pjz.v = vec_load(&cell_cache->z[pjd]);
+      //pjvx.v = vec_load(&cell_cache->vx[pjd]);
+      //pjvy.v = vec_load(&cell_cache->vy[pjd]);
+      //pjvz.v = vec_load(&cell_cache->vz[pjd]);
+      //mj.v = vec_load(&cell_cache->m[pjd]);
+    
+      hj.v = vec_load(&cell_cache->h[pjd]);
+      hjg2.v = vec_mul(vec_mul(hj.v,hj.v), kernel_gamma2_vec.v);
+
+      //pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
+      //pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
+      //pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
+      //pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]);
+      //pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]);
+      //pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]);
+      //mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]);
+
+      //hj_2.v = vec_load(&cell_cache->h[pjd + VEC_SIZE]);
+      //hjg2_2.v = vec_mul(vec_mul(hj_2.v,hj_2.v), kernel_gamma2_vec.v);
+
+      vector v_hj_inv;
+
+      v_hj_inv = vec_reciprocal(hj);
+
+      /* Compute the pairwise distance. */
+      vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
+      //vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2;
+
+      v_dx_tmp.v = vec_sub(pix.v, pjx.v);
+      //v_dx_tmp2.v = vec_sub(pix.v, pjx2.v);
+      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
+      //v_dy_tmp2.v = vec_sub(piy.v, pjy2.v);
+      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
+      //v_dz_tmp2.v = vec_sub(piz.v, pjz2.v);
+
+      v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
+      //v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v);
+      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
+      //v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v);
+      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
+      //v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
+
+/* Form a mask from r2 < hig2 and r2 > 0.*/
+#ifdef HAVE_AVX512_F
+      // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check;
+      KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check;
+
+      doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
+      doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
+
+      doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+
+      doi_mask = doi_mask & doi_mask_check;
+      doi_mask2 = doi_mask2 & doi_mask2_check;
+
+#else
+      vector v_doi_mask, v_doi_mask_check, v_doi_N3_mask;
+      //vector v_doi_mask2, v_doi_mask2_check, v_doi_N3_mask2;
+      //int doi_mask2;
+
+      /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */
+      v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero());
+      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+      v_doi_N3_mask.v = vec_cmp_lt(v_r2.v, hjg2.v);
+
+      /* Form r2 > 0 mask and r2 < hig2 mask. */
+      //v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      //v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+      //v_doi_N3_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2_2.v);
+
+      v_doi_mask.v = vec_and(vec_add(v_doi_mask.v, v_doi_N3_mask.v), v_doi_mask_check.v);
+      
+      /* Combine two masks and form integer mask. */
+      doi_mask = vec_cmp_result(v_doi_mask.v);
+      //doi_mask2 = vec_cmp_result(vec_add(vec_and(v_doi_mask2.v, v_doi_mask2_check.v), v_doi_N3_mask2.v));
+      
+#endif /* HAVE_AVX512_F */
+
+      for(int k=0; k<VEC_SIZE; k++)
+        pjq[k] = &parts[pjd + k];
+
+      /* If there are any interactions left pack interaction values into c2
+       * cache. */
+      if (doi_mask) {
+        for(int k=0; k<VEC_SIZE; k++) {
+          if( v_r2.f[k] == 0.f) v_r2.f[k] = 1.f;
+        }
+        
+        //intCount += __builtin_popcount(doi_mask);
+
+        runner_iact_nonsym_1_vec_force(&v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp,
+                          v_hi_inv, v_hj_inv, piq, pjq, 
+                          &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum,
+                          &h_dtSum, &v_sigSum, &entropy_dtSum, v_doi_mask);
+      }
+      
+    }
+    
+    VEC_HADD(a_hydro_xSum, pi->a_hydro[0]);
+    VEC_HADD(a_hydro_ySum, pi->a_hydro[1]);
+    VEC_HADD(a_hydro_zSum, pi->a_hydro[2]);
+    VEC_HADD(h_dtSum, pi->force.h_dt);
+    /* TODO: Implement a horizontal max of a vector. */
+    for(int k=0; k<VEC_SIZE; k++)
+      pi->force.v_sig = max(pi->force.v_sig, v_sigSum.f[k]);
+    VEC_HADD(entropy_dtSum, pi->entropy_dt);
+    
+  } /* loop over all particles. */
+
+  //TIMER_TOC(timer_doself_force);
+#endif /* WITH_VECTORIZATION */
+}
+
+/**
+ * @brief Compute the cell self-interaction (non-symmetric) using vector
+ * intrinsics with one particle pi at a time.
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+__attribute__((always_inline)) INLINE void runner_doself2_force_vec_2(
+    struct runner *r, struct cell *restrict c) {
+
+#ifdef WITH_VECTORIZATION
+  //static int intCount = 0;
+  const struct engine *e = r->e;
+  int doi_mask;
+  struct part *restrict pi;
+  int count_align;
+  int num_vec_proc = 1;//NUM_VEC_PROC;
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;
+  vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci;
+
+  //TIMER_TIC
+
+  if (!cell_is_active(c, e)) return;
+
+  if (!cell_is_drifted(c, e)) cell_drift_particles(c, e);
+
+  /* Get the particle cache from the runner and re-allocate
+   * the cache if it is not big enough for the cell. */
+  struct cache *restrict cell_cache = &r->ci_cache;
+
+  if (cell_cache->count < count) {
+    cache_init(cell_cache, count);
+  }
+
+  /* Read the particles from the cell and store them locally in the cache. */
+  cache_read_particles(c, cell_cache);
+
+  /* Create secondary cache to store particle interactions. */
+  //struct c2_cache int_cache;
+  //int icount = 0, icount_align = 0;
+
+  /* Loop over the particles in the cell. */
+  for (int pid = 0; pid < count; pid++) {
+
+    /* Get a pointer to the ith particle. */
+    pi = &parts[pid];
+
+    /* Is the ith particle active? */
+    if (!part_is_active(pi, e)) continue;
+
+    vector pix, piy, piz;
+
+    const float hi = cell_cache->h[pid];
+
+    /* Fill particle pi vectors. */
+    pix.v = vec_set1(cell_cache->x[pid]);
+    piy.v = vec_set1(cell_cache->y[pid]);
+    piz.v = vec_set1(cell_cache->z[pid]);
+    v_hi.v = vec_set1(hi);
+    v_vix.v = vec_set1(cell_cache->vx[pid]);
+    v_viy.v = vec_set1(cell_cache->vy[pid]);
+    v_viz.v = vec_set1(cell_cache->vz[pid]);
+    
+    v_rhoi.v = vec_set1(cell_cache->rho[pid]);
+    v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]);
+    v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]);
+    v_balsara_i.v = vec_set1(cell_cache->balsara[pid]);
+    v_ci.v = vec_set1(cell_cache->soundspeed[pid]);
+
+    const float hig2 = hi * hi * kernel_gamma2;
+    v_hig2.v = vec_set1(hig2);
+
+    /* Reset cumulative sums of update vectors. */
+    vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum;
+
+    /* Get the inverse of hi. */
+    vector v_hi_inv;
+
+    v_hi_inv = vec_reciprocal(v_hi);
+
+    a_hydro_xSum.v = vec_setzero();
+    a_hydro_ySum.v = vec_setzero();
+    a_hydro_zSum.v = vec_setzero();
+    h_dtSum.v = vec_setzero();
+    v_sigSum.v = vec_set1(pi->force.v_sig);
+    entropy_dtSum.v = vec_setzero();
+
+    /* Pad cache if there is a serial remainder. */
+    count_align = count;
+    int rem = count % (num_vec_proc * VEC_SIZE);
+    if (rem != 0) {
+      int pad = (num_vec_proc * VEC_SIZE) - rem;
+
+      count_align += pad;
+
+      /* Set positions to the same as particle pi so when the r2 > 0 mask is
+       * applied these extra contributions are masked out.*/
+      for (int i = count; i < count_align; i++) {
+        cell_cache->x[i] = pix.f[0];
+        cell_cache->y[i] = piy.f[0];
+        cell_cache->z[i] = piz.f[0];
+      }
+    }
+
+    vector pjx, pjy, pjz;
+    vector pjvx, pjvy, pjvz, mj, hj, hjg2;
+    vector v_rhoj, v_grad_hj, v_pOrhoj2, v_balsara_j, v_cj;
+    //vector pjx2, pjy2, pjz2;
+    //vector pjvx2, pjvy2, pjvz2, mj2, hj_2, hjg2_2;
+
+    /* Find all of particle pi's interacions and store needed values in the
+     * secondary cache.*/
+    for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
+
+      /* Load 2 sets of vectors from the particle cache. */
+      pjx.v = vec_load(&cell_cache->x[pjd]);
+      pjy.v = vec_load(&cell_cache->y[pjd]);
+      pjz.v = vec_load(&cell_cache->z[pjd]);
+      pjvx.v = vec_load(&cell_cache->vx[pjd]);
+      pjvy.v = vec_load(&cell_cache->vy[pjd]);
+      pjvz.v = vec_load(&cell_cache->vz[pjd]);
+      mj.v = vec_load(&cell_cache->m[pjd]);
+    
+      hj.v = vec_load(&cell_cache->h[pjd]);
+      hjg2.v = vec_mul(vec_mul(hj.v,hj.v), kernel_gamma2_vec.v);
+
+      v_rhoj.v = vec_load(&cell_cache->rho[pjd]);
+      v_grad_hj.v = vec_load(&cell_cache->grad_h[pjd]);
+      v_pOrhoj2.v = vec_load(&cell_cache->pOrho2[pjd]);
+      v_balsara_j.v = vec_load(&cell_cache->balsara[pjd]);
+      v_cj.v = vec_load(&cell_cache->soundspeed[pjd]);
+
+      //pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
+      //pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
+      //pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
+      //pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]);
+      //pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]);
+      //pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]);
+      //mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]);
+
+      //hj_2.v = vec_load(&cell_cache->h[pjd + VEC_SIZE]);
+      //hjg2_2.v = vec_mul(vec_mul(hj_2.v,hj_2.v), kernel_gamma2_vec.v);
+
+      vector v_hj_inv;
+
+      v_hj_inv = vec_reciprocal(hj);
+
+      /* Compute the pairwise distance. */
+      vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
+      //vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2;
+
+      v_dx_tmp.v = vec_sub(pix.v, pjx.v);
+      //v_dx_tmp2.v = vec_sub(pix.v, pjx2.v);
+      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
+      //v_dy_tmp2.v = vec_sub(piy.v, pjy2.v);
+      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
+      //v_dz_tmp2.v = vec_sub(piz.v, pjz2.v);
+
+      v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
+      //v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v);
+      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
+      //v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v);
+      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
+      //v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
+
+/* Form a mask from r2 < hig2 and r2 > 0.*/
+#ifdef HAVE_AVX512_F
+      // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check;
+      KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check;
+
+      doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
+      doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
+
+      doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+
+      doi_mask = doi_mask & doi_mask_check;
+      doi_mask2 = doi_mask2 & doi_mask2_check;
+
+#else
+      vector v_doi_mask, v_doi_mask_check, v_doi_N3_mask;
+      //vector v_doi_mask2, v_doi_mask2_check, v_doi_N3_mask2;
+      //int doi_mask2;
+
+      /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */
+      v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero());
+      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+      v_doi_N3_mask.v = vec_cmp_lt(v_r2.v, hjg2.v);
+
+      /* Form r2 > 0 mask and r2 < hig2 mask. */
+      //v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      //v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+      //v_doi_N3_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2_2.v);
+
+      v_doi_mask.v = vec_and(vec_add(v_doi_mask.v, v_doi_N3_mask.v), v_doi_mask_check.v);
+      
+      /* Combine two masks and form integer mask. */
+      doi_mask = vec_cmp_result(v_doi_mask.v);
+      //doi_mask2 = vec_cmp_result(vec_add(vec_and(v_doi_mask2.v, v_doi_mask2_check.v), v_doi_N3_mask2.v));
+      
+#endif /* HAVE_AVX512_F */
+
+      /* If there are any interactions left pack interaction values into c2
+       * cache. */
+      if (doi_mask) {
+        for(int k=0; k<VEC_SIZE; k++) {
+          if( v_r2.f[k] == 0.f) v_r2.f[k] = 1.f;
+        }
+        
+        //intCount += __builtin_popcount(doi_mask);
+
+        runner_iact_nonsym_1_vec_force_2(&v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp,
+                          &v_vix, &v_viy, &v_viz, &v_rhoi, &v_grad_hi, &v_pOrhoi2, &v_balsara_i, &v_ci,
+                          &pjvx, &pjvy, &pjvz, &v_rhoj, &v_grad_hj, &v_pOrhoj2, &v_balsara_j, &v_cj, &mj,
+                          v_hi_inv, v_hj_inv, 
+                          &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum,
+                          &h_dtSum, &v_sigSum, &entropy_dtSum, v_doi_mask);
+      }
+      
+    }
+
+    
+    VEC_HADD(a_hydro_xSum, pi->a_hydro[0]);
+    VEC_HADD(a_hydro_ySum, pi->a_hydro[1]);
+    VEC_HADD(a_hydro_zSum, pi->a_hydro[2]);
+    VEC_HADD(h_dtSum, pi->force.h_dt);
+    /* TODO: Implement a horizontal max of a vector. */
+    for(int k=0; k<VEC_SIZE; k++)
+      pi->force.v_sig = max(pi->force.v_sig, v_sigSum.f[k]);
+    VEC_HADD(entropy_dtSum, pi->entropy_dt);
+    
+  } /* loop over all particles. */
+
+  //message("No. of force interactions: %d", intCount);
+  //TIMER_TOC(timer_doself_force);
+#endif /* WITH_VECTORIZATION */
+}
+
+/**
+ * @brief Compute the cell self-interaction (non-symmetric) using vector
+ * intrinsics with one particle pi at a time.
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+__attribute__((always_inline)) INLINE void runner_doself2_force_vec_3(
+    struct runner *r, struct cell *restrict c) {
+
+#ifdef WITH_VECTORIZATION
+  static int intCount = 0;
+  const struct engine *e = r->e;
+  int doi_mask;
+  struct part *restrict pi;
+  int count_align;
+  int num_vec_proc = 1;//NUM_VEC_PROC;
+
+  struct part *restrict parts = c->parts;
+  const int count = c->count;
+
+  vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;
+  vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci;
+
+  //TIMER_TIC
+
+  if (!cell_is_active(c, e)) return;
+
+  if (!cell_is_drifted(c, e)) cell_drift_particles(c, e);
+
+  /* Get the particle cache from the runner and re-allocate
+   * the cache if it is not big enough for the cell. */
+  struct cache *restrict cell_cache = &r->ci_cache;
+
+  if (cell_cache->count < count) {
+    cache_init(cell_cache, count);
+  }
+
+  /* Read the particles from the cell and store them locally in the cache. */
+  cache_read_particles(c, cell_cache);
+
+  /* Create secondary cache to store particle interactions. */
+  struct c2_cache int_cache;
+  int icount = 0, icount_align = 0;
+
+  /* Loop over the particles in the cell. */
+  for (int pid = 0; pid < count; pid++) {
+
+    /* Get a pointer to the ith particle. */
+    pi = &parts[pid];
+
+    /* Is the ith particle active? */
+    if (!part_is_active(pi, e)) continue;
+
+    vector pix, piy, piz;
+
+    const float hi = cell_cache->h[pid];
+
+    /* Fill particle pi vectors. */
+    pix.v = vec_set1(cell_cache->x[pid]);
+    piy.v = vec_set1(cell_cache->y[pid]);
+    piz.v = vec_set1(cell_cache->z[pid]);
+    v_hi.v = vec_set1(hi);
+    v_vix.v = vec_set1(cell_cache->vx[pid]);
+    v_viy.v = vec_set1(cell_cache->vy[pid]);
+    v_viz.v = vec_set1(cell_cache->vz[pid]);
+    
+    v_rhoi.v = vec_set1(cell_cache->rho[pid]);
+    v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]);
+    v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]);
+    v_balsara_i.v = vec_set1(cell_cache->balsara[pid]);
+    v_ci.v = vec_set1(cell_cache->soundspeed[pid]);
+
+    const float hig2 = hi * hi * kernel_gamma2;
+    v_hig2.v = vec_set1(hig2);
+
+    /* Reset cumulative sums of update vectors. */
+    vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum;
+
+    /* Get the inverse of hi. */
+    vector v_hi_inv;
+
+    v_hi_inv = vec_reciprocal(v_hi);
+
+    a_hydro_xSum.v = vec_setzero();
+    a_hydro_ySum.v = vec_setzero();
+    a_hydro_zSum.v = vec_setzero();
+    h_dtSum.v = vec_setzero();
+    v_sigSum.v = vec_set1(pi->force.v_sig);
+    entropy_dtSum.v = vec_setzero();
+
+    /* Pad cache if there is a serial remainder. */
+    count_align = count;
+    int rem = count % (num_vec_proc * VEC_SIZE);
+    if (rem != 0) {
+      int pad = (num_vec_proc * VEC_SIZE) - rem;
+
+      count_align += pad;
+
+      /* Set positions to the same as particle pi so when the r2 > 0 mask is
+       * applied these extra contributions are masked out.*/
+      for (int i = count; i < count_align; i++) {
+        cell_cache->x[i] = pix.f[0];
+        cell_cache->y[i] = piy.f[0];
+        cell_cache->z[i] = piz.f[0];
+      }
+    }
+
+    vector pjx, pjy, pjz;
+    vector pjvx, pjvy, pjvz, mj, hj, hjg2;
+    vector v_rhoj, v_grad_hj, v_pOrhoj2, v_balsara_j, v_cj;
+    //vector pjx2, pjy2, pjz2;
+    //vector pjvx2, pjvy2, pjvz2, mj2, hj_2, hjg2_2;
+
+    /* Find all of particle pi's interacions and store needed values in the
+     * secondary cache.*/
+    for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
+
+      /* Load 2 sets of vectors from the particle cache. */
+      pjx.v = vec_load(&cell_cache->x[pjd]);
+      pjy.v = vec_load(&cell_cache->y[pjd]);
+      pjz.v = vec_load(&cell_cache->z[pjd]);
+      pjvx.v = vec_load(&cell_cache->vx[pjd]);
+      pjvy.v = vec_load(&cell_cache->vy[pjd]);
+      pjvz.v = vec_load(&cell_cache->vz[pjd]);
+      mj.v = vec_load(&cell_cache->m[pjd]);
+    
+      hj.v = vec_load(&cell_cache->h[pjd]);
+      hjg2.v = vec_mul(vec_mul(hj.v,hj.v), kernel_gamma2_vec.v);
+
+      v_rhoj.v = vec_load(&cell_cache->rho[pjd]);
+      v_grad_hj.v = vec_load(&cell_cache->grad_h[pjd]);
+      v_pOrhoj2.v = vec_load(&cell_cache->pOrho2[pjd]);
+      v_balsara_j.v = vec_load(&cell_cache->balsara[pjd]);
+      v_cj.v = vec_load(&cell_cache->soundspeed[pjd]);
+
+      //pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
+      //pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
+      //pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
+      //pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]);
+      //pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]);
+      //pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]);
+      //mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]);
+
+      //hj_2.v = vec_load(&cell_cache->h[pjd + VEC_SIZE]);
+      //hjg2_2.v = vec_mul(vec_mul(hj_2.v,hj_2.v), kernel_gamma2_vec.v);
+
+      vector v_hj_inv;
+
+      v_hj_inv = vec_reciprocal(hj);
+
+      /* Compute the pairwise distance. */
+      vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
+      //vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2;
+
+      v_dx_tmp.v = vec_sub(pix.v, pjx.v);
+      //v_dx_tmp2.v = vec_sub(pix.v, pjx2.v);
+      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
+      //v_dy_tmp2.v = vec_sub(piy.v, pjy2.v);
+      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
+      //v_dz_tmp2.v = vec_sub(piz.v, pjz2.v);
+
+      v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
+      //v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v);
+      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
+      //v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v);
+      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
+      //v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
+
+/* Form a mask from r2 < hig2 and r2 > 0.*/
+#ifdef HAVE_AVX512_F
+      // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check;
+      KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check;
+
+      doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
+      doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
+
+      doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+
+      doi_mask = doi_mask & doi_mask_check;
+      doi_mask2 = doi_mask2 & doi_mask2_check;
+
+#else
+      vector v_doi_mask, v_doi_mask_check, v_doi_N3_mask;
+      //vector v_doi_mask2, v_doi_mask2_check, v_doi_N3_mask2;
+      //int doi_mask2;
+
+      /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */
+      v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero());
+      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+      v_doi_N3_mask.v = vec_cmp_lt(v_r2.v, hjg2.v);
+
+      /* Form r2 > 0 mask and r2 < hig2 mask. */
+      //v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      //v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+      //v_doi_N3_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2_2.v);
+
+      v_doi_mask.v = vec_and(vec_add(v_doi_mask.v, v_doi_N3_mask.v), v_doi_mask_check.v);
+      
+      /* Combine two masks and form integer mask. */
+      doi_mask = vec_cmp_result(v_doi_mask.v);
+      //doi_mask2 = vec_cmp_result(vec_add(vec_and(v_doi_mask2.v, v_doi_mask2_check.v), v_doi_N3_mask2.v));
+      
+#endif /* HAVE_AVX512_F */
+
+      /* If there are any interactions left pack interaction values into c2
+       * cache. */
+      if (doi_mask) {
+        
+        //for(int k=0; k<VEC_SIZE; k++) {
+        //  if( v_r2.f[k] == 0.f) v_r2.f[k] = 1.f;
+        //}
+        storeForceInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp,
+                          &mj, &pjvx, &pjvy, &pjvz, &v_rhoj, &v_grad_hj, &v_pOrhoj2, &v_balsara_j, &v_cj, &v_hj_inv, cell_cache, &int_cache,
+                          &icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum,
+                          &h_dtSum, &v_sigSum, &entropy_dtSum,
+                          v_hi_inv, v_vix, v_viy, v_viz, &v_rhoi, &v_grad_hi, &v_pOrhoi2, &v_balsara_i, &v_ci);
+      }
+
+    } /* Loop over all other particles. */
+
+    //printFloatVector(a_hydro_xSum,"a_hydro_xSum before rem",8);
+    /* Perform padded vector remainder interactions if any are present. */
+    calcRemForceInteractions(&int_cache, icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum,
+                             &h_dtSum, &v_sigSum, &entropy_dtSum, v_hi_inv,
+                             v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci,
+                             &icount_align);
+    //printFloatVector(a_hydro_xSum,"a_hydro_xSum after rem",8);
+
+    /* Initialise masks to true in case remainder interactions have been
+     * performed. */
+    vector int_mask;//, int_mask2;
+#ifdef HAVE_AVX512_F
+    KNL_MASK_16 knl_mask = 0xFFFF;
+    KNL_MASK_16 knl_mask2 = 0xFFFF;
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
+#else
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    //int_mask2.m = vec_setint1(0xFFFFFFFF);
+#endif
+
+    /* Perform interaction with 2 vectors. */
+    for (int pjd = 0; pjd < icount_align; pjd += (1 * VEC_SIZE)) {
+      
+      vector r2, dx, dy, dz;
+      
+      vector vjx, vjy, vjz, mj, rhoj, grad_hj, pOrhoj2, balsara_j, cj, hj_inv;
+      
+      r2.v = vec_load(&int_cache.r2q[pjd]);
+      dx.v = vec_load(&int_cache.dxq[pjd]);
+      dy.v = vec_load(&int_cache.dyq[pjd]);
+      dz.v = vec_load(&int_cache.dzq[pjd]);
+
+      vjx.v = vec_load(&int_cache.vxq[pjd]);
+      vjy.v = vec_load(&int_cache.vyq[pjd]);
+      vjz.v = vec_load(&int_cache.vzq[pjd]);
+      mj.v = vec_load(&int_cache.mq[pjd]);
+      
+      rhoj.v = vec_load(&int_cache.rhoq[pjd]);
+      grad_hj.v = vec_load(&int_cache.grad_hq[pjd]);
+      pOrhoj2.v = vec_load(&int_cache.pOrho2q[pjd]);
+      balsara_j.v = vec_load(&int_cache.balsaraq[pjd]);
+      cj.v = vec_load(&int_cache.soundspeedq[pjd]);
+      hj_inv.v = vec_load(&int_cache.h_invq[pjd]);
+
+      runner_iact_nonsym_1_vec_force_2(
+        &r2, &dx, &dy, &dz, &v_vix, &v_viy, &v_viz, &v_rhoi, &v_grad_hi, &v_pOrhoi2, &v_balsara_i, &v_ci,
+        &vjx, &vjy, &vjz, &rhoj, &grad_hj, &pOrhoj2, &balsara_j, &cj, &mj, v_hi_inv, hj_inv,
+        &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, int_mask
+#ifdef HAVE_AVX512_F
+          knl_mask, knl_mask2);
+#else
+          );
+#endif
+    }
+    
+    VEC_HADD(a_hydro_xSum, pi->a_hydro[0]);
+    VEC_HADD(a_hydro_ySum, pi->a_hydro[1]);
+    VEC_HADD(a_hydro_zSum, pi->a_hydro[2]);
+    VEC_HADD(h_dtSum, pi->force.h_dt);
+    /* TODO: Implement a horizontal max of a vector. */
+    for(int k=0; k<VEC_SIZE; k++)
+      pi->force.v_sig = max(pi->force.v_sig, v_sigSum.f[k]);
+    VEC_HADD(entropy_dtSum, pi->entropy_dt);
+
+    intCount += icount;
+
+    message("No. of interactions for particle %lld: %d", pi->id, icount);
+    /* Reset interaction count. */
+    icount = 0;
+  } /* loop over all particles. */
+
+  message("No. of force interactions: %d", intCount);
+  //TIMER_TOC(timer_doself_force);
+#endif /* WITH_VECTORIZATION */
+}
+
 /**
  * @brief Compute the density interactions between a cell pair (non-symmetric)
  * using vector intrinsics.