diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index 6daa814044338c9eaaa32dca1377fb8a06b29d82..24ecd47ed272d5e3137328418d42850ba5edb3c1 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -1486,13 +1486,6 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
       //pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]);
       //mj.v = vec_load(&cj_cache.m[cj_cache_idx]);
 
-      pjx.v = vec_load(&cj_cache.x[cj_cache_idx]);
-      pjx2.v = vec_load(&cj_cache.x[cj_cache_idx + VEC_SIZE]);
-      pjy.v = vec_load(&cj_cache.y[cj_cache_idx]);
-      pjy2.v = vec_load(&cj_cache.y[cj_cache_idx + VEC_SIZE]);
-      pjz.v = vec_load(&cj_cache.z[cj_cache_idx]);
-      pjz2.v = vec_load(&cj_cache.z[cj_cache_idx + VEC_SIZE]);
-
       /* Compute the pairwise distance. */
       v_dx.v = vec_sub(pix.v, pjx.v);
       v_dx2.v = vec_sub(pix.v, pjx2.v);
@@ -1613,10 +1606,11 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
 
   int max_ind_i = 0;
   /* Loop over the parts in cj. */
-  for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) {
+  for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd+=2) {
 
     /* Get a hold of the jth part in cj. */
     struct part *restrict pj = &parts_j[sort_j[pjd].i];
+    struct part *restrict pj2 = &parts_j[sort_j[pjd + 1].i];
     if (!part_is_active(pj, e)) continue;
 
     di = sort_i[max_ind_i].d;
@@ -1630,13 +1624,17 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
     int cj_cache_idx = pjd;
 
     const float hj = cj_cache.h[cj_cache_idx];
+    const float hj_2 = cj_cache.h[cj_cache_idx + 1];
     const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
     if (dj > di_max) continue;
 
     const float hjg2 = hj * hj * kernel_gamma2;
+    const float hjg2_2 = hj_2 * hj_2 * kernel_gamma2;
 
     vector pjx, pjy, pjz;
+    vector pjx2, pjy2, pjz2;
     vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2;
+    vector v_hj_2, v_vjx2, v_vjy2, v_vjz2, v_hjg2_2;
 
     /* Fill particle pi vectors. */
     pjx.v = vec_set1(cj_cache.x[cj_cache_idx]);
@@ -1649,14 +1647,29 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
 
     v_hjg2.v = vec_set1(hjg2);
 
+    pjx2.v = vec_set1(cj_cache.x[cj_cache_idx + 1]);
+    pjy2.v = vec_set1(cj_cache.y[cj_cache_idx + 1]);
+    pjz2.v = vec_set1(cj_cache.z[cj_cache_idx + 1]);
+    v_hj_2.v = vec_set1(hj_2);
+    v_vjx2.v = vec_set1(cj_cache.vx[cj_cache_idx + 1]);
+    v_vjy2.v = vec_set1(cj_cache.vy[cj_cache_idx + 1]);
+    v_vjz2.v = vec_set1(cj_cache.vz[cj_cache_idx + 1]);
+
+    v_hjg2_2.v = vec_set1(hjg2_2);
+
     /* Reset cumulative sums of update vectors. */
     vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
         curlvySum, curlvzSum;
 
+    vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2, curlvxSum2,
+        curlvySum2, curlvzSum2;
+    
     /* Get the inverse of hj. */
     vector v_hj_inv;
+    vector v_hj_inv_2;
 
     v_hj_inv = vec_reciprocal(v_hj);
+    v_hj_inv_2 = vec_reciprocal(v_hj_2);
 
     rhoSum.v = vec_setzero();
     rho_dhSum.v = vec_setzero();
@@ -1667,6 +1680,15 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
     curlvySum.v = vec_setzero();
     curlvzSum.v = vec_setzero();
 
+    rhoSum2.v = vec_setzero();
+    rho_dhSum2.v = vec_setzero();
+    wcountSum2.v = vec_setzero();
+    wcount_dhSum2.v = vec_setzero();
+    div_vSum2.v = vec_setzero();
+    curlvxSum2.v = vec_setzero();
+    curlvySum2.v = vec_setzero();
+    curlvzSum2.v = vec_setzero();
+
     /* Pad cache if there is a serial remainder. */
     int exit_iteration_align = exit_iteration;
     int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
@@ -1688,6 +1710,8 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
 
       vector v_dx, v_dy, v_dz, v_r2;
       vector v_dx2, v_dy2, v_dz2, v_r2_2;
+      vector v2_dx, v2_dy, v2_dz, v2_r2;
+      vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2;
 
       /* Load 2 sets of vectors from the particle cache. */
       pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
@@ -1704,28 +1728,51 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
       /* Compute the pairwise distance. */
       v_dx.v = vec_sub(pjx.v, pix.v);
       v_dx2.v = vec_sub(pjx.v, pix2.v);
+      v2_dx.v = vec_sub(pjx2.v, pix.v);
+      v2_dx2.v = vec_sub(pjx2.v, pix2.v);
+      
       v_dy.v = vec_sub(pjy.v, piy.v);
       v_dy2.v = vec_sub(pjy.v, piy2.v);
+      v2_dy.v = vec_sub(pjy2.v, piy.v);
+      v2_dy2.v = vec_sub(pjy2.v, piy2.v);
+      
       v_dz.v = vec_sub(pjz.v, piz.v);
       v_dz2.v = vec_sub(pjz.v, piz2.v);
+      v2_dz.v = vec_sub(pjz2.v, piz.v);
+      v2_dz2.v = vec_sub(pjz2.v, piz2.v);
 
       v_r2.v = vec_mul(v_dx.v, v_dx.v);
       v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v);
+      v2_r2.v = vec_mul(v2_dx.v, v2_dx.v);
+      v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v);
+      
       v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
       v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v);
+      v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v);
+      v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v);
+      
       v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
       v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v);
+      v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v);
+      v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v);
 
       vector v_doj_mask, v_doj_mask2;
       int doj_mask, doj_mask2;
 
+      vector v2_doj_mask, v2_doj_mask2;
+      int doj2_mask, doj2_mask2;
+
       /* Form r2 < hig2 mask. */
       v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v);
       v_doj_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2.v);
+      v2_doj_mask.v = vec_cmp_lt(v2_r2.v, v_hjg2_2.v);
+      v2_doj_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hjg2_2.v);
 
       /* Form integer mask. */
       doj_mask = vec_cmp_result(v_doj_mask.v);
       doj_mask2 = vec_cmp_result(v_doj_mask2.v);
+      doj2_mask = vec_cmp_result(v2_doj_mask.v);
+      doj2_mask2 = vec_cmp_result(v2_doj_mask2.v);
 
       /* Perform interaction with 2 vectors. */
       if (doj_mask)
@@ -1749,6 +1796,28 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
           knl_mask);
 #else
       0);
+#endif
+      if (doj2_mask)
+        runner_iact_nonsym_intrinsic_vec_density(
+          &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hj_inv_2, v_vjx2, v_vjy2, v_vjz2,
+          &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], &ci_cache->vz[ci_cache_idx],
+          &ci_cache->m[ci_cache_idx], &rhoSum2, &rho_dhSum2, &wcountSum2, &wcount_dhSum2,
+          &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2, v2_doj_mask,
+#ifdef HAVE_AVX512_F
+          knl_mask);
+#else
+      0);
+#endif
+       if (doj2_mask2)
+        runner_iact_nonsym_intrinsic_vec_density(
+          &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hj_inv_2, v_vjx2, v_vjy2, v_vjz2,
+          &ci_cache->vx[ci_cache_idx - VEC_SIZE], &ci_cache->vy[ci_cache_idx - VEC_SIZE], &ci_cache->vz[ci_cache_idx - VEC_SIZE],
+          &ci_cache->m[ci_cache_idx - VEC_SIZE], &rhoSum2, &rho_dhSum2, &wcountSum2, &wcount_dhSum2,
+          &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2, v2_doj_mask2,
+#ifdef HAVE_AVX512_F
+          knl_mask);
+#else
+      0);
 #endif 
     } /* loop over the parts in cj. */
 
@@ -1763,6 +1832,15 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
     VEC_HADD(curlvySum, pj->density.rot_v[1]);
     VEC_HADD(curlvzSum, pj->density.rot_v[2]);
 
+    VEC_HADD(rhoSum2, pj2->rho);
+    VEC_HADD(rho_dhSum2, pj2->density.rho_dh);
+    VEC_HADD(wcountSum2, pj2->density.wcount);
+    VEC_HADD(wcount_dhSum2, pj2->density.wcount_dh);
+    VEC_HADD(div_vSum2, pj2->density.div_v);
+    VEC_HADD(curlvxSum2, pj2->density.rot_v[0]);
+    VEC_HADD(curlvySum2, pj2->density.rot_v[1]);
+    VEC_HADD(curlvzSum2, pj2->density.rot_v[2]);
+
   } /* loop over the parts in ci. */
 
   TIMER_TOC(timer_dopair_density);