diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index e02f41811b93bf34b9af5c5ffc822c5267260263..33cd2b181f18d3b749261dd6093836cffe7fc335 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -419,17 +419,17 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2;
 
       v_dx_tmp.v = vec_sub(pix.v, pjx.v);
-      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
-      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
       v_dx_tmp2.v = vec_sub(pix.v, pjx2.v);
+      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
       v_dy_tmp2.v = vec_sub(piy.v, pjy2.v);
+      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
       v_dz_tmp2.v = vec_sub(piz.v, pjz2.v);
 
       v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
-      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
-      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
       v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v);
+      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
       v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v);
+      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
       v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
 
 /* Form a mask from r2 < hig2 and r2 > 0.*/
@@ -1307,7 +1307,7 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
 #ifdef WITH_VECTORIZATION
   const struct engine *restrict e = r->e;
 
-  int num_vec_proc = 1;
+  int num_vec_proc = 2;
 
   vector v_hi, v_vix, v_viy, v_viz, v_hig2;
 
@@ -1373,6 +1373,7 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
 
     /* Get a hold of the ith part in ci. */
     struct part *restrict pi = &parts_i[sort_i[pid].i];
+    //struct part *restrict pi2 = &parts_i[sort_i[pid - 1].i];
     if (!part_is_active(pi, e)) continue;
 
     dj = sort_j[max_ind_j].d;
@@ -1432,19 +1433,24 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
     }
 
     vector pjx, pjy, pjz;
+    vector pjx2, pjy2, pjz2;
 
     /* Loop over the parts in cj. */
-    for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) {
+    for (int pjd = 0; pjd < exit_iteration_align; pjd += (num_vec_proc * VEC_SIZE)) {
 
       /* Get the cache index to the jth particle. */
       int cj_cache_idx = pjd; //sort_j[pjd].i;
 
       vector v_dx, v_dy, v_dz, v_r2;
+      vector v_dx2, v_dy2, v_dz2, v_r2_2;
 
       /* Load 2 sets of vectors from the particle cache. */
       pjx.v = vec_load(&cj_cache.x[cj_cache_idx]);
+      pjx2.v = vec_load(&cj_cache.x[cj_cache_idx + VEC_SIZE]);
       pjy.v = vec_load(&cj_cache.y[cj_cache_idx]);
+      pjy2.v = vec_load(&cj_cache.y[cj_cache_idx + VEC_SIZE]);
       pjz.v = vec_load(&cj_cache.z[cj_cache_idx]);
+      pjz2.v = vec_load(&cj_cache.z[cj_cache_idx + VEC_SIZE]);
       //pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]);
       //pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]);
       //pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]);
@@ -1452,21 +1458,29 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
 
       /* Compute the pairwise distance. */
       v_dx.v = vec_sub(pix.v, pjx.v);
+      v_dx2.v = vec_sub(pix.v, pjx2.v);
       v_dy.v = vec_sub(piy.v, pjy.v);
+      v_dy2.v = vec_sub(piy.v, pjy2.v);
       v_dz.v = vec_sub(piz.v, pjz.v);
+      v_dz2.v = vec_sub(piz.v, pjz2.v);
 
       v_r2.v = vec_mul(v_dx.v, v_dx.v);
+      v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v);
       v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
+      v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v);
       v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
+      v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v);
 
-      vector v_doi_mask;
-      int doi_mask;
+      vector v_doi_mask, v_doi_mask2;
+      int doi_mask, doi_mask2;
 
       /* Form r2 < hig2 mask. */
       v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+      v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
 
       /* Form integer mask. */
       doi_mask = vec_cmp_result(v_doi_mask.v);
+      doi_mask2 = vec_cmp_result(v_doi_mask2.v);
 
       if(doi_mask)
         runner_iact_nonsym_intrinsic_vec_density(
@@ -1479,7 +1493,17 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
 #else
           0);
 #endif
-            
+      if(doi_mask2)
+        runner_iact_nonsym_intrinsic_vec_density(
+          &v_r2_2, &v_dx2, &v_dy2,&v_dz2, v_hi_inv, v_vix, v_viy, v_viz,
+          &cj_cache.vx[cj_cache_idx + VEC_SIZE], &cj_cache.vy[cj_cache_idx + VEC_SIZE], &cj_cache.vz[cj_cache_idx + VEC_SIZE],
+          &cj_cache.m[cj_cache_idx + VEC_SIZE], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
+          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doi_mask2,
+#ifdef HAVE_AVX512_F
+          knl_mask);
+#else
+          0);
+#endif     
     } /* loop over the parts in cj. */
 
     /* Perform horizontal adds on vector sums and store result in particle pi.
@@ -1561,20 +1585,25 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
     }
 
     vector pix, piy, piz;
+    vector pix2, piy2, piz2;
     //vector pivx, pivy, pivz, mi;
 
     /* Loop over the parts in ci. */
-    for (int pid = count_i - 1; pid >= 0; pid -= VEC_SIZE) {
+    for (int pid = count_i - 1; pid >= 0; pid -= (num_vec_proc * VEC_SIZE)) {
 
       /* Get the cache index to the ith particle. */
       int ci_cache_idx = pid; //sort_i[pid].i;
 
       vector v_dx, v_dy, v_dz, v_r2;
+      vector v_dx2, v_dy2, v_dz2, v_r2_2;
 
       /* Load 2 sets of vectors from the particle cache. */
       pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
+      pix2.v = vec_load(&ci_cache->x[ci_cache_idx - VEC_SIZE]);
       piy.v = vec_load(&ci_cache->y[ci_cache_idx]);
+      piy2.v = vec_load(&ci_cache->y[ci_cache_idx - VEC_SIZE]);
       piz.v = vec_load(&ci_cache->z[ci_cache_idx]);
+      piz2.v = vec_load(&ci_cache->z[ci_cache_idx - VEC_SIZE]);
       //pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]);
       //pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]);
       //pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]);
@@ -1582,21 +1611,29 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
 
       /* Compute the pairwise distance. */
       v_dx.v = vec_sub(pjx.v, pix.v);
+      v_dx2.v = vec_sub(pjx.v, pix2.v);
       v_dy.v = vec_sub(pjy.v, piy.v);
+      v_dy2.v = vec_sub(pjy.v, piy2.v);
       v_dz.v = vec_sub(pjz.v, piz.v);
+      v_dz2.v = vec_sub(pjz.v, piz2.v);
 
       v_r2.v = vec_mul(v_dx.v, v_dx.v);
+      v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v);
       v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
+      v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v);
       v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
+      v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v);
 
-      vector v_doj_mask;
-      int doj_mask;
+      vector v_doj_mask, v_doj_mask2;
+      int doj_mask, doj_mask2;
 
       /* Form r2 < hig2 mask. */
       v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v);
+      v_doj_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2.v);
 
       /* Form integer mask. */
       doj_mask = vec_cmp_result(v_doj_mask.v);
+      doj_mask2 = vec_cmp_result(v_doj_mask2.v);
 
       /* Perform interaction with 2 vectors. */
       if (doj_mask)
@@ -1610,7 +1647,17 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
 #else
       0);
 #endif
-        
+       if (doj_mask2)
+        runner_iact_nonsym_intrinsic_vec_density(
+          &v_r2_2, &v_dx2, &v_dy2, &v_dz2, v_hj_inv, v_vjx, v_vjy, v_vjz,
+          &ci_cache->vx[ci_cache_idx - VEC_SIZE], &ci_cache->vy[ci_cache_idx - VEC_SIZE], &ci_cache->vz[ci_cache_idx - VEC_SIZE],
+          &ci_cache->m[ci_cache_idx - VEC_SIZE], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
+          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doj_mask2,
+#ifdef HAVE_AVX512_F
+          knl_mask);
+#else
+      0);
+#endif 
     } /* loop over the parts in cj. */
 
     /* Perform horizontal adds on vector sums and store result in particle pi.