diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index f96e305a720e512da960b18fee4a4f3b37c2dc50..131c7898bb449338e3395f8a5dbef5d98e48f05c 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -225,6 +225,8 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
       (*icount)++;
     }
   }
+#endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */
+
   /* Flush the c2 cache if it has reached capacity. */
   if (*icount >= (C2_CACHE_SIZE - (NUM_VEC_PROC * VEC_SIZE))) {
 
@@ -254,7 +256,6 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
     *icount = 0;
   }
 
-#endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */
 }
 #endif /* WITH_VECTORIZATION */ 
 
@@ -882,13 +883,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
   struct c2_cache int_cache;
   int num_vec_proc = 1;
 
-  vector v_hi, v_vix, v_viy, v_viz, v_hig2;//, v_r2;
-
-  float r2q[VEC_SIZE] __attribute__((aligned(16)));
-  float hiq[VEC_SIZE] __attribute__((aligned(16)));
-  float hjq[VEC_SIZE] __attribute__((aligned(16)));
-  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
-  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
+  vector v_hi, v_vix, v_viy, v_viz, v_hig2;
 
   TIMER_TIC;
 
@@ -949,8 +944,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
     struct part *restrict pi = &parts_i[sort_i[pid].i];
     if (!part_is_active(pi, e)) continue;
 
-    //int ci_cache_idx = sort_i[pid].i;
-    int ci_cache_idx = pid;
+    int ci_cache_idx = pid; //sort_i[pid].i;
 
     const float hi = ci_cache->h[ci_cache_idx];
     const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
@@ -1017,8 +1011,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
     for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) {
 
       /* Get the cache index to the jth particle. */
-      //int cj_cache_idx = sort_j[pjd].i;
-      int cj_cache_idx = pjd;
+      int cj_cache_idx = pjd; //sort_j[pjd].i;
 
       vector v_dx, v_dy, v_dz, v_r2;
 
@@ -1117,67 +1110,167 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
     /* Get a hold of the jth part in cj. */
     struct part *restrict pj = &parts_j[sort_j[pjd].i];
     if (!part_is_active(pj, e)) continue;
-    const float hj = pj->h;
+
+    int cj_cache_idx = pjd;
+
+    const float hj = cj_cache.h[cj_cache_idx];
     const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
     if (dj > di_max) continue;
 
-    double pjx[3];
-    for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
     const float hjg2 = hj * hj * kernel_gamma2;
 
-    /* Loop over the parts in ci. */
-    for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
+    vector pjx, pjy, pjz;
+    vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2;
 
-      /* Get a pointer to the jth particle. */
-      struct part *restrict pi = &parts_i[sort_i[pid].i];
+    /* Fill particle pi vectors. */
+    pjx.v = vec_set1(cj_cache.x[cj_cache_idx]);
+    pjy.v = vec_set1(cj_cache.y[cj_cache_idx]);
+    pjz.v = vec_set1(cj_cache.z[cj_cache_idx]);
+    v_hj.v = vec_set1(hj);
+    v_vjx.v = vec_set1(cj_cache.vx[cj_cache_idx]);
+    v_vjy.v = vec_set1(cj_cache.vy[cj_cache_idx]);
+    v_vjz.v = vec_set1(cj_cache.vz[cj_cache_idx]);
 
-      /* Compute the pairwise distance. */
-      float r2 = 0.0f;
-      float dx[3];
-      for (int k = 0; k < 3; k++) {
-        dx[k] = pjx[k] - pi->x[k];
-        r2 += dx[k] * dx[k];
-      }
+    v_hjg2.v = vec_set1(hjg2);
 
-      /* Hit or miss? */
-      if (r2 < hjg2) {
+    /* Reset cumulative sums of update vectors. */
+    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
+        curlvySum, curlvzSum;
 
-#ifndef WITH_VECTORIZATION
+    /* Get the inverse of hj. */
+    vector v_hj_inv;
 
-        runner_iact_nonsym_density(r2, dx, hj, pi->h, pj, pi);
+    v_hj_inv = vec_reciprocal(v_hj);
 
-#else
+    rhoSum.v = vec_setzero();
+    rho_dhSum.v = vec_setzero();
+    wcountSum.v = vec_setzero();
+    wcount_dhSum.v = vec_setzero();
+    div_vSum.v = vec_setzero();
+    curlvxSum.v = vec_setzero();
+    curlvySum.v = vec_setzero();
+    curlvzSum.v = vec_setzero();
 
-        /* Add this interaction to the queue. */
-        r2q[icount] = r2;
-        dxq[3 * icount + 0] = dx[0];
-        dxq[3 * icount + 1] = dx[1];
-        dxq[3 * icount + 2] = dx[2];
-        hiq[icount] = hj;
-        hjq[icount] = pi->h;
-        piq[icount] = pj;
-        pjq[icount] = pi;
-        icount += 1;
-
-        /* Flush? */
-        if (icount == VEC_SIZE) {
-          runner_iact_nonsym_vec_density(r2q, dxq, hiq, hjq, piq, pjq);
-          icount = 0;
-        }
+    int exit_iteration = 0;
+    for (int pid = count_i - 1; pid >= 0; pid--) {
+      if(sort_i[pid].d <= dj) exit_iteration = pid;
+    }
 
-#endif
+    /* Pad cache if there is a serial remainder. */
+    int exit_iteration_align = exit_iteration;
+    int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
+    if (rem != 0) {
+      int pad = (num_vec_proc * VEC_SIZE) - rem;
+
+      exit_iteration_align -= pad;
+      /* Set positions to the same as particle pi so when the r2 > 0 mask is
+       * applied these extra contributions are masked out.*/
+      for (int i = exit_iteration; i >= exit_iteration_align; i--) {
+        ci_cache->x[i] = pjx.f[0];
+        ci_cache->y[i] = pjy.f[0];
+        ci_cache->z[i] = pjz.f[0];
       }
+    }
+
+    vector pix, piy, piz;
+    vector pivx, pivy, pivz, mi;
+
+    /* Loop over the parts in ci. */
+    //for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
+    for (int pid = count_i - 1; pid >= 0; pid -= VEC_SIZE) {
+
+      /* Get the cache index to the ith particle. */
+      int ci_cache_idx = pid; //sort_i[pid].i;
+
+      vector v_dx, v_dy, v_dz, v_r2;
+
+      /* Load 2 sets of vectors from the particle cache. */
+      pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
+      piy.v = vec_load(&ci_cache->y[ci_cache_idx]);
+      piz.v = vec_load(&ci_cache->z[ci_cache_idx]);
+      pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]);
+      pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]);
+      pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]);
+      mi.v = vec_load(&ci_cache->m[ci_cache_idx]);
+
+      /* Compute the pairwise distance. */
+      v_dx.v = vec_sub(pjx.v, pix.v);
+      v_dy.v = vec_sub(pjy.v, piy.v);
+      v_dz.v = vec_sub(pjz.v, piz.v);
+
+      v_r2.v = vec_mul(v_dx.v, v_dx.v);
+      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
+      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
+
+      vector v_doj_mask, v_doj_mask_check;
+      int doj_mask;
+
+      /* Form r2 > 0 mask and r2 < hig2 mask. */
+      v_doj_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero());
+      v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v);
 
+      /* Combine two masks and form integer mask. */
+      doj_mask = vec_cmp_result(vec_and(v_doj_mask.v, v_doj_mask_check.v));
+
+      /* If there are any interactions left pack interaction values into c2
+       * cache. */
+      if (doj_mask)
+        storeInteractions(doj_mask, ci_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz,
+                          &mi, &pivx, &pivy, &pivz, ci_cache, &int_cache,
+                          &icount, &rhoSum, &rho_dhSum, &wcountSum,
+                          &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
+                          &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz);
+      
     } /* loop over the parts in cj. */
 
-  } /* loop over the parts in ci. */
+    /* Perform padded vector remainder interactions if any are present. */
+    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum,
+                        &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
+                        &curlvySum, &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz,
+                        &icount_align);
+    
+    /* Initialise masks to true in case remainder interactions have been
+     * performed. */
+    vector int_mask, int_mask2;
+#ifdef HAVE_AVX512_F
+    KNL_MASK_16 knl_mask = 0xFFFF;
+    KNL_MASK_16 knl_mask2 = 0xFFFF;
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
+#else
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
+#endif
 
-#ifdef WITH_VECTORIZATION
-  /* Pick up any leftovers. */
-  if (icount > 0)
-    for (int k = 0; k < icount; k++)
-      runner_iact_nonsym_density(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
+    /* Perform interaction with 2 vectors. */
+    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
+      runner_iact_nonsym_2_vec_density(
+          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
+          &int_cache.dzq[pjd], v_hj_inv, v_vjx, v_vjy, v_vjz,
+          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
+          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
+          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
+#ifdef HAVE_AVX512_F
+          knl_mask, knl_mask2);
+#else
+      0, 0);
 #endif
+    }
+
+    /* Perform horizontal adds on vector sums and store result in particle pi.
+     */
+    VEC_HADD(rhoSum, pj->rho);
+    VEC_HADD(rho_dhSum, pj->density.rho_dh);
+    VEC_HADD(wcountSum, pj->density.wcount);
+    VEC_HADD(wcount_dhSum, pj->density.wcount_dh);
+    VEC_HADD(div_vSum, pj->density.div_v);
+    VEC_HADD(curlvxSum, pj->density.rot_v[0]);
+    VEC_HADD(curlvySum, pj->density.rot_v[1]);
+    VEC_HADD(curlvzSum, pj->density.rot_v[2]);
+
+    icount = 0;
+
+  } /* loop over the parts in ci. */
 
   TIMER_TOC(timer_dopair_density);