diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index 0b44a08ebcdb4426120c69c7587a4757da5fc2ff..552aa8e9deef12c10f0561e12816e1b49af0266b 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -975,45 +975,17 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec_2(
 float max_di[MAX_NO_OF_PARTS] __attribute__((aligned(sizeof(VEC_SIZE * sizeof(float))))); /* max distance into ci */
 float max_dj[MAX_NO_OF_PARTS] __attribute__((aligned(sizeof(VEC_SIZE * sizeof(float))))); /* max distance into cj */
 
-FILE *faceIntFile;
-FILE *edgeIntFile;
-FILE *cornerIntFile;
-
-/** C2_CACHE VERSION
- * @brief Compute the interactions between a cell pair (non-symmetric).
- *  
- * @param r The #runner.
- * @param ci The first #cell.
- * @param cj The second #cell.
- */
 void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *cj) {
 
 #ifdef WITH_VECTORIZATION
   const struct engine *restrict e = r->e;
 
-  static int faceIntCount = 0;
-  static int faceCtr = 0;
-  static int edgeIntCount = 0;
-  static int edgeCtr = 0;
-  static int cornerIntCount = 0;
-  static int cornerCtr = 0;
-  static int numFaceTested = 0;
-  static int numEdgeTested = 0;
-  static int numCornerTested = 0;
-  int icount = 0, icount_align = 0;
-  struct c2_cache int_cache;
   int num_vec_proc = 1;
 
   vector v_hi, v_vix, v_viy, v_viz, v_hig2;
 
   TIMER_TIC;
 
-  if(faceCtr + edgeCtr + cornerCtr == 0) {
-    faceIntFile = fopen("particle_interactions_face.dat","w"); 
-    edgeIntFile = fopen("particle_interactions_edge.dat","w"); 
-    cornerIntFile = fopen("particle_interactions_corner.dat","w"); 
-  }
-
   /* Anything to do here? */
   if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
 
@@ -1026,10 +998,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
   double shift[3] = {0.0, 0.0, 0.0};
   const int sid = space_getsid(e->s, &ci, &cj, shift);
 
-  int face = (sid == 4 || sid == 10 || sid == 12);
-  int edge = (sid == 1 || sid == 3 || sid == 5 || sid == 7 || sid == 9 || sid == 11);
-  int corner = (sid == 0 || sid == 2 || sid == 6 || sid == 8);
-
   /* Have the cells been sorted? */
   if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
     error("Trying to interact unsorted cells.");
@@ -1062,18 +1030,38 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
     cache_init(&cj_cache, count_j);
   }
 
-  cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift);
-
+  int first_pi, last_pj;
+  
   /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
   /* For particles in ci */  
-  populate_max_d(ci, cj, sort_i, sort_j, ci_cache, &cj_cache, dx_max, rshift, max_di, max_dj);
+  populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di, max_dj, &first_pi, &last_pj);
 
   float di, dj;
 
   int max_ind_j = count_j - 1;
+  int max_ind_i = 0;
+
+  dj = sort_j[max_ind_j].d;
+  while(max_ind_j > 0 && max_di[count_i - 1] < dj) {
+    max_ind_j--;
+
+    dj = sort_j[max_ind_j].d;
+  }
+
+  di = sort_i[max_ind_i].d;
+  while(max_ind_i < count_i - 1 && max_dj[0] > di) {
+    max_ind_i++;
+
+    di = sort_i[max_ind_i].d;
+  }
+
+  last_pj = max(last_pj, max_ind_j); 
+  first_pi = min(first_pi, max_ind_i);
+ 
+  cache_read_two_cells_sorted_2(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift, first_pi, last_pj, num_vec_proc);
 
   /* Loop over the parts in ci. */
-  for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) {
+  for (int pid = count_i - 1; pid >= first_pi && max_ind_j >= 0; pid--) {
 
     /* Get a hold of the ith part in ci. */
     struct part *restrict pi = &parts_i[sort_i[pid].i];
@@ -1136,7 +1124,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
     }
 
     vector pjx, pjy, pjz;
-    vector pjvx, pjvy, pjvz, mj;
 
     /* Loop over the parts in cj. */
     for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) {
@@ -1150,10 +1137,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
       pjx.v = vec_load(&cj_cache.x[cj_cache_idx]);
       pjy.v = vec_load(&cj_cache.y[cj_cache_idx]);
       pjz.v = vec_load(&cj_cache.z[cj_cache_idx]);
-      pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]);
-      pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]);
-      pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]);
-      mj.v = vec_load(&cj_cache.m[cj_cache_idx]);
+      //pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]);
+      //pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]);
+      //pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]);
+      //mj.v = vec_load(&cj_cache.m[cj_cache_idx]);
 
       /* Compute the pairwise distance. */
       v_dx.v = vec_sub(pix.v, pjx.v);
@@ -1173,50 +1160,19 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
       /* Form integer mask. */
       doi_mask = vec_cmp_result(v_doi_mask.v);
 
-      /* If there are any interactions left pack interaction values into c2
-       * cache. */
-      if (doi_mask)
-        storeInteractions(doi_mask, cj_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz,
-                          &mj, &pjvx, &pjvy, &pjvz, &cj_cache, &int_cache,
-                          &icount, &rhoSum, &rho_dhSum, &wcountSum,
-                          &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
-                          &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
-      
-    } /* loop over the parts in cj. */
-
-    /* Perform padded vector remainder interactions if any are present. */
-    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum,
-                        &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-                        &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
-                        &icount_align);
-    
-    /* Initialise masks to true in case remainder interactions have been
-     * performed. */
-    vector int_mask, int_mask2;
-#ifdef HAVE_AVX512_F
-    KNL_MASK_16 knl_mask = 0xFFFF;
-    KNL_MASK_16 knl_mask2 = 0xFFFF;
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#else
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#endif
-
-    /* Perform interaction with 2 vectors. */
-    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
-      runner_iact_nonsym_2_vec_density(
-          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
-          &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
-          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
-          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
-          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
+      if(doi_mask)
+        runner_iact_nonsym_intrinsic_vec_density(
+          &v_r2, &v_dx, &v_dy,&v_dz, v_hi_inv, v_vix, v_viy, v_viz,
+          &cj_cache.vx[cj_cache_idx], &cj_cache.vy[cj_cache_idx], &cj_cache.vz[cj_cache_idx],
+          &cj_cache.m[cj_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
+          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doi_mask,
 #ifdef HAVE_AVX512_F
-          knl_mask, knl_mask2);
+          knl_mask);
 #else
-      0, 0);
+          0);
 #endif
-    }
+            
+    } /* loop over the parts in cj. */
 
     /* Perform horizontal adds on vector sums and store result in particle pi.
      */
@@ -1229,42 +1185,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
     VEC_HADD(curlvySum, pi->density.rot_v[1]);
     VEC_HADD(curlvzSum, pi->density.rot_v[2]);
 
-    if(face) {
-      faceIntCount += icount;
-      fprintf(faceIntFile,"%d\n",icount);
-      numFaceTested++;
-    }
-    else if(edge) {
-      edgeIntCount += icount;
-      fprintf(edgeIntFile,"%d\n",icount);
-      numEdgeTested++;
-    }
-    else if(corner) {
-      cornerIntCount += icount;
-      fprintf(cornerIntFile,"%d\n",icount);
-      numCornerTested++;
-    }
-
-    icount = 0;
-
   } /* loop over the parts in ci. */
 
-  if(face) {
-    faceCtr++;
-    message("Total number of face interactions: %d, average per particle: %f, number tested: %d.", faceIntCount, ((float)faceIntCount) / ((float)numFaceTested), numFaceTested);
-  }
-  else if(edge) {
-    edgeCtr++;
-    message("Total number of edge interactions: %d, average per particle: %f, number tested: %d", edgeIntCount, ((float)edgeIntCount) / ((float)numEdgeTested), numEdgeTested);
-  }
-  else if(corner) {
-    cornerCtr++;
-    message("Total number of corner interactions: %d, average per particle: %f, number tested: %d", cornerIntCount, ((float)cornerIntCount) / ((float)numCornerTested), numCornerTested);
-  }
-
-  int max_ind_i = 0;
   /* Loop over the parts in cj. */
-  for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) {
+  for (int pjd = 0; pjd <= last_pj && max_ind_i < count_i; pjd++) {
 
     /* Get a hold of the jth part in cj. */
     struct part *restrict pj = &parts_j[sort_j[pjd].i];
@@ -1328,11 +1252,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
     }
 
     vector pix, piy, piz;
-    vector pivx, pivy, pivz, mi;
+    //vector pivx, pivy, pivz, mi;
 
     /* Loop over the parts in ci. */
-    //for (int pid = count_i - 1; pid >= 0; pid -= VEC_SIZE) {
-    for (int pid = exit_iteration_align; pid < count_i; pid += (num_vec_proc * VEC_SIZE)) {
+    for (int pid = exit_iteration_align; pid < count_i; pid += VEC_SIZE) {
 
       /* Get the cache index to the ith particle. */
       int ci_cache_idx = pid; //sort_i[pid].i;
@@ -1343,10 +1266,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
       pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
       piy.v = vec_load(&ci_cache->y[ci_cache_idx]);
       piz.v = vec_load(&ci_cache->z[ci_cache_idx]);
-      pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]);
-      pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]);
-      pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]);
-      mi.v = vec_load(&ci_cache->m[ci_cache_idx]);
+      //pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]);
+      //pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]);
+      //pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]);
+      //mi.v = vec_load(&ci_cache->m[ci_cache_idx]);
 
       /* Compute the pairwise distance. */
       v_dx.v = vec_sub(pjx.v, pix.v);
@@ -1366,50 +1289,20 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
       /* Form integer mask. */
       doj_mask = vec_cmp_result(v_doj_mask.v);
 
-      /* If there are any interactions left pack interaction values into c2
-       * cache. */
+      /* Perform interaction with 2 vectors. */
       if (doj_mask)
-        storeInteractions(doj_mask, ci_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz,
-                          &mi, &pivx, &pivy, &pivz, ci_cache, &int_cache,
-                          &icount, &rhoSum, &rho_dhSum, &wcountSum,
-                          &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
-                          &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz);
-      
-    } /* loop over the parts in cj. */
-
-    /* Perform padded vector remainder interactions if any are present. */
-    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum,
-                        &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
-                        &curlvySum, &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz,
-                        &icount_align);
-    
-    /* Initialise masks to true in case remainder interactions have been
-     * performed. */
-    vector int_mask, int_mask2;
-#ifdef HAVE_AVX512_F
-    KNL_MASK_16 knl_mask = 0xFFFF;
-    KNL_MASK_16 knl_mask2 = 0xFFFF;
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#else
-    int_mask.m = vec_setint1(0xFFFFFFFF);
-    int_mask2.m = vec_setint1(0xFFFFFFFF);
-#endif
-
-    /* Perform interaction with 2 vectors. */
-    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
-      runner_iact_nonsym_2_vec_density(
-          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
-          &int_cache.dzq[pjd], v_hj_inv, v_vjx, v_vjy, v_vjz,
-          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
-          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
-          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
+        runner_iact_nonsym_intrinsic_vec_density(
+          &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz,
+          &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], &ci_cache->vz[ci_cache_idx],
+          &ci_cache->m[ci_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
+          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doj_mask,
 #ifdef HAVE_AVX512_F
-          knl_mask, knl_mask2);
+          knl_mask);
 #else
-      0, 0);
+      0);
 #endif
-    }
+        
+    } /* loop over the parts in cj. */
 
     /* Perform horizontal adds on vector sums and store result in particle pi.
      */
@@ -1422,8 +1315,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
     VEC_HADD(curlvySum, pj->density.rot_v[1]);
     VEC_HADD(curlvzSum, pj->density.rot_v[2]);
 
-    icount = 0;
-
   } /* loop over the parts in ci. */
 
   TIMER_TOC(timer_dopair_density);
@@ -1431,17 +1322,45 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
 #endif /* WITH_VECTORIZATION */
 }
 
+FILE *faceIntFile;
+FILE *edgeIntFile;
+FILE *cornerIntFile;
+
+/** C2_CACHE VERSION
+ * @brief Compute the interactions between a cell pair (non-symmetric).
+ *  
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
 void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell *cj) {
 
 #ifdef WITH_VECTORIZATION
   const struct engine *restrict e = r->e;
 
+  static int faceIntCount = 0;
+  static int faceCtr = 0;
+  static int edgeIntCount = 0;
+  static int edgeCtr = 0;
+  static int cornerIntCount = 0;
+  static int cornerCtr = 0;
+  static int numFaceTested = 0;
+  static int numEdgeTested = 0;
+  static int numCornerTested = 0;
+  int icount = 0, icount_align = 0;
+  struct c2_cache int_cache;
   int num_vec_proc = 1;
 
   vector v_hi, v_vix, v_viy, v_viz, v_hig2;
 
   TIMER_TIC;
 
+  if(faceCtr + edgeCtr + cornerCtr == 0) {
+    faceIntFile = fopen("particle_interactions_face.dat","w"); 
+    edgeIntFile = fopen("particle_interactions_edge.dat","w"); 
+    cornerIntFile = fopen("particle_interactions_corner.dat","w"); 
+  }
+
   /* Anything to do here? */
   if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
 
@@ -1454,6 +1373,10 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
   double shift[3] = {0.0, 0.0, 0.0};
   const int sid = space_getsid(e->s, &ci, &cj, shift);
 
+  int face = (sid == 4 || sid == 10 || sid == 12);
+  int edge = (sid == 1 || sid == 3 || sid == 5 || sid == 7 || sid == 9 || sid == 11);
+  int corner = (sid == 0 || sid == 2 || sid == 6 || sid == 8);
+
   /* Have the cells been sorted? */
   if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
     error("Trying to interact unsorted cells.");
@@ -1486,38 +1409,18 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
     cache_init(&cj_cache, count_j);
   }
 
-  int first_pi, last_pj;
-  
+  cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift);
+
   /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
   /* For particles in ci */  
-  populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di, max_dj, &first_pi, &last_pj);
+  populate_max_d(ci, cj, sort_i, sort_j, ci_cache, &cj_cache, dx_max, rshift, max_di, max_dj);
 
   float di, dj;
 
   int max_ind_j = count_j - 1;
-  int max_ind_i = 0;
-
-  dj = sort_j[max_ind_j].d;
-  while(max_ind_j > 0 && max_di[count_i - 1] < dj) {
-    max_ind_j--;
-
-    dj = sort_j[max_ind_j].d;
-  }
-
-  di = sort_i[max_ind_i].d;
-  while(max_ind_i < count_i - 1 && max_dj[0] > di) {
-    max_ind_i++;
-
-    di = sort_i[max_ind_i].d;
-  }
-
-  last_pj = max(last_pj, max_ind_j); 
-  first_pi = min(first_pi, max_ind_i);
- 
-  cache_read_two_cells_sorted_2(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift, first_pi, last_pj, num_vec_proc);
 
   /* Loop over the parts in ci. */
-  for (int pid = count_i - 1; pid >= first_pi && max_ind_j >= 0; pid--) {
+  for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) {
 
     /* Get a hold of the ith part in ci. */
     struct part *restrict pi = &parts_i[sort_i[pid].i];
@@ -1580,6 +1483,7 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
     }
 
     vector pjx, pjy, pjz;
+    vector pjvx, pjvy, pjvz, mj;
 
     /* Loop over the parts in cj. */
     for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) {
@@ -1593,10 +1497,10 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
       pjx.v = vec_load(&cj_cache.x[cj_cache_idx]);
       pjy.v = vec_load(&cj_cache.y[cj_cache_idx]);
       pjz.v = vec_load(&cj_cache.z[cj_cache_idx]);
-      //pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]);
-      //pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]);
-      //pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]);
-      //mj.v = vec_load(&cj_cache.m[cj_cache_idx]);
+      pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]);
+      pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]);
+      pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]);
+      mj.v = vec_load(&cj_cache.m[cj_cache_idx]);
 
       /* Compute the pairwise distance. */
       v_dx.v = vec_sub(pix.v, pjx.v);
@@ -1616,19 +1520,50 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
       /* Form integer mask. */
       doi_mask = vec_cmp_result(v_doi_mask.v);
 
-      if(doi_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
-          &v_r2, &v_dx, &v_dy,&v_dz, v_hi_inv, v_vix, v_viy, v_viz,
-          &cj_cache.vx[cj_cache_idx], &cj_cache.vy[cj_cache_idx], &cj_cache.vz[cj_cache_idx],
-          &cj_cache.m[cj_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
-          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doi_mask,
+      /* If there are any interactions left pack interaction values into c2
+       * cache. */
+      if (doi_mask)
+        storeInteractions(doi_mask, cj_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz,
+                          &mj, &pjvx, &pjvy, &pjvz, &cj_cache, &int_cache,
+                          &icount, &rhoSum, &rho_dhSum, &wcountSum,
+                          &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
+                          &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
+      
+    } /* loop over the parts in cj. */
+
+    /* Perform padded vector remainder interactions if any are present. */
+    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum,
+                        &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
+                        &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
+                        &icount_align);
+    
+    /* Initialise masks to true in case remainder interactions have been
+     * performed. */
+    vector int_mask, int_mask2;
 #ifdef HAVE_AVX512_F
-          knl_mask);
+    KNL_MASK_16 knl_mask = 0xFFFF;
+    KNL_MASK_16 knl_mask2 = 0xFFFF;
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
 #else
-          0);
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
 #endif
-            
-    } /* loop over the parts in cj. */
+
+    /* Perform interaction with 2 vectors. */
+    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
+      runner_iact_nonsym_2_vec_density(
+          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
+          &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
+          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
+          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
+          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
+#ifdef HAVE_AVX512_F
+          knl_mask, knl_mask2);
+#else
+      0, 0);
+#endif
+    }
 
     /* Perform horizontal adds on vector sums and store result in particle pi.
      */
@@ -1641,10 +1576,42 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
     VEC_HADD(curlvySum, pi->density.rot_v[1]);
     VEC_HADD(curlvzSum, pi->density.rot_v[2]);
 
+    if(face) {
+      faceIntCount += icount;
+      fprintf(faceIntFile,"%d\n",icount);
+      numFaceTested++;
+    }
+    else if(edge) {
+      edgeIntCount += icount;
+      fprintf(edgeIntFile,"%d\n",icount);
+      numEdgeTested++;
+    }
+    else if(corner) {
+      cornerIntCount += icount;
+      fprintf(cornerIntFile,"%d\n",icount);
+      numCornerTested++;
+    }
+
+    icount = 0;
+
   } /* loop over the parts in ci. */
 
+  if(face) {
+    faceCtr++;
+    message("Total number of face interactions: %d, average per particle: %f, number tested: %d.", faceIntCount, ((float)faceIntCount) / ((float)numFaceTested), numFaceTested);
+  }
+  else if(edge) {
+    edgeCtr++;
+    message("Total number of edge interactions: %d, average per particle: %f, number tested: %d", edgeIntCount, ((float)edgeIntCount) / ((float)numEdgeTested), numEdgeTested);
+  }
+  else if(corner) {
+    cornerCtr++;
+    message("Total number of corner interactions: %d, average per particle: %f, number tested: %d", cornerIntCount, ((float)cornerIntCount) / ((float)numCornerTested), numCornerTested);
+  }
+
+  int max_ind_i = 0;
   /* Loop over the parts in cj. */
-  for (int pjd = 0; pjd <= last_pj && max_ind_i < count_i; pjd++) {
+  for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) {
 
     /* Get a hold of the jth part in cj. */
     struct part *restrict pj = &parts_j[sort_j[pjd].i];
@@ -1708,10 +1675,11 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
     }
 
     vector pix, piy, piz;
-    //vector pivx, pivy, pivz, mi;
+    vector pivx, pivy, pivz, mi;
 
     /* Loop over the parts in ci. */
-    for (int pid = exit_iteration_align; pid < count_i; pid += VEC_SIZE) {
+    //for (int pid = count_i - 1; pid >= 0; pid -= VEC_SIZE) {
+    for (int pid = exit_iteration_align; pid < count_i; pid += (num_vec_proc * VEC_SIZE)) {
 
       /* Get the cache index to the ith particle. */
       int ci_cache_idx = pid; //sort_i[pid].i;
@@ -1722,10 +1690,10 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
       pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
       piy.v = vec_load(&ci_cache->y[ci_cache_idx]);
       piz.v = vec_load(&ci_cache->z[ci_cache_idx]);
-      //pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]);
-      //pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]);
-      //pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]);
-      //mi.v = vec_load(&ci_cache->m[ci_cache_idx]);
+      pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]);
+      pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]);
+      pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]);
+      mi.v = vec_load(&ci_cache->m[ci_cache_idx]);
 
       /* Compute the pairwise distance. */
       v_dx.v = vec_sub(pjx.v, pix.v);
@@ -1745,20 +1713,50 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
       /* Form integer mask. */
       doj_mask = vec_cmp_result(v_doj_mask.v);
 
-      /* Perform interaction with 2 vectors. */
+      /* If there are any interactions left pack interaction values into c2
+       * cache. */
       if (doj_mask)
-        runner_iact_nonsym_intrinsic_vec_density(
-          &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz,
-          &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], &ci_cache->vz[ci_cache_idx],
-          &ci_cache->m[ci_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
-          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doj_mask,
+        storeInteractions(doj_mask, ci_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz,
+                          &mi, &pivx, &pivy, &pivz, ci_cache, &int_cache,
+                          &icount, &rhoSum, &rho_dhSum, &wcountSum,
+                          &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
+                          &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz);
+      
+    } /* loop over the parts in cj. */
+
+    /* Perform padded vector remainder interactions if any are present. */
+    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum,
+                        &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
+                        &curlvySum, &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz,
+                        &icount_align);
+    
+    /* Initialise masks to true in case remainder interactions have been
+     * performed. */
+    vector int_mask, int_mask2;
 #ifdef HAVE_AVX512_F
-          knl_mask);
+    KNL_MASK_16 knl_mask = 0xFFFF;
+    KNL_MASK_16 knl_mask2 = 0xFFFF;
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
 #else
-      0);
+    int_mask.m = vec_setint1(0xFFFFFFFF);
+    int_mask2.m = vec_setint1(0xFFFFFFFF);
 #endif
-        
-    } /* loop over the parts in cj. */
+
+    /* Perform interaction with 2 vectors. */
+    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
+      runner_iact_nonsym_2_vec_density(
+          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
+          &int_cache.dzq[pjd], v_hj_inv, v_vjx, v_vjy, v_vjz,
+          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
+          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
+          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
+#ifdef HAVE_AVX512_F
+          knl_mask, knl_mask2);
+#else
+      0, 0);
+#endif
+    }
 
     /* Perform horizontal adds on vector sums and store result in particle pi.
      */
@@ -1771,6 +1769,8 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
     VEC_HADD(curlvySum, pj->density.rot_v[1]);
     VEC_HADD(curlvzSum, pj->density.rot_v[2]);
 
+    icount = 0;
+
   } /* loop over the parts in ci. */
 
   TIMER_TOC(timer_dopair_density);
diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h
index 42b5cc2587fa325f880f74e7e0e213f9d4ba0137..3c07645dffe7e761087b5e1d0dca0d86711acf95 100644
--- a/src/runner_doiact_vec.h
+++ b/src/runner_doiact_vec.h
@@ -38,5 +38,6 @@
 void runner_doself1_density_vec(struct runner *r, struct cell *restrict c);
 void runner_doself1_density_vec_2(struct runner *r, struct cell *restrict c);
 void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci, struct cell *restrict cj);
+void runner_dopair1_density_vec_2(struct runner *r, struct cell *restrict ci, struct cell *restrict cj);
 
 #endif /* SWIFT_RUNNER_VEC_H */