diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 33cd2b181f18d3b749261dd6093836cffe7fc335..6daa814044338c9eaaa32dca1377fb8a06b29d82 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -1310,6 +1310,7 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell int num_vec_proc = 2; vector v_hi, v_vix, v_viy, v_viz, v_hig2; + vector v_hi_2, v_vix2, v_viy2, v_viz2, v_hig2_2; TIMER_TIC; @@ -1369,11 +1370,11 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell int max_ind_j = count_j - 1; /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) { + for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid-=2) { /* Get a hold of the ith part in ci. */ struct part *restrict pi = &parts_i[sort_i[pid].i]; - //struct part *restrict pi2 = &parts_i[sort_i[pid - 1].i]; + struct part *restrict pi2 = &parts_i[sort_i[pid - 1].i]; if (!part_is_active(pi, e)) continue; dj = sort_j[max_ind_j].d; @@ -1384,15 +1385,18 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell } int exit_iteration = max_ind_j; - int ci_cache_idx = pid; //sort_i[pid].i; + int ci_cache_idx = pid;//sort_i[pid].i; const float hi = ci_cache->h[ci_cache_idx]; + const float hi_2 = ci_cache->h[ci_cache_idx - 1]; const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; if (di < dj_min) continue; const float hig2 = hi * hi * kernel_gamma2; + const float hig2_2 = hi_2 * hi_2 * kernel_gamma2; vector pix, piy, piz; + vector pix2, piy2, piz2; /* Fill particle pi vectors. */ pix.v = vec_set1(ci_cache->x[ci_cache_idx]); @@ -1405,14 +1409,29 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell v_hig2.v = vec_set1(hig2); + pix2.v = vec_set1(ci_cache->x[ci_cache_idx - 1]); + piy2.v = vec_set1(ci_cache->y[ci_cache_idx - 1]); + piz2.v = vec_set1(ci_cache->z[ci_cache_idx - 1]); + v_hi_2.v = vec_set1(hi_2); + v_vix2.v = vec_set1(ci_cache->vx[ci_cache_idx - 1]); + v_viy2.v = vec_set1(ci_cache->vy[ci_cache_idx - 1]); + v_viz2.v = vec_set1(ci_cache->vz[ci_cache_idx - 1]); + + v_hig2_2.v = vec_set1(hig2_2); + /* Reset cumulative sums of update vectors. */ vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum; + vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2, curlvxSum2, + curlvySum2, curlvzSum2; + /* Get the inverse of hi. */ vector v_hi_inv; + vector v_hi_inv_2; v_hi_inv = vec_reciprocal(v_hi); + v_hi_inv_2 = vec_reciprocal(v_hi_2); rhoSum.v = vec_setzero(); rho_dhSum.v = vec_setzero(); @@ -1423,6 +1442,15 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell curlvySum.v = vec_setzero(); curlvzSum.v = vec_setzero(); + rhoSum2.v = vec_setzero(); + rho_dhSum2.v = vec_setzero(); + wcountSum2.v = vec_setzero(); + wcount_dhSum2.v = vec_setzero(); + div_vSum2.v = vec_setzero(); + curlvxSum2.v = vec_setzero(); + curlvySum2.v = vec_setzero(); + curlvzSum2.v = vec_setzero(); + /* Pad cache if there is a serial remainder. */ int exit_iteration_align = exit_iteration; int rem = exit_iteration % (num_vec_proc * VEC_SIZE); @@ -1439,10 +1467,12 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell for (int pjd = 0; pjd < exit_iteration_align; pjd += (num_vec_proc * VEC_SIZE)) { /* Get the cache index to the jth particle. */ - int cj_cache_idx = pjd; //sort_j[pjd].i; + int cj_cache_idx = pjd;//sort_j[pjd].i; vector v_dx, v_dy, v_dz, v_r2; vector v_dx2, v_dy2, v_dz2, v_r2_2; + vector v2_dx, v2_dy, v2_dz, v2_r2; + vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2; /* Load 2 sets of vectors from the particle cache. */ pjx.v = vec_load(&cj_cache.x[cj_cache_idx]); @@ -1456,31 +1486,61 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell //pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]); //mj.v = vec_load(&cj_cache.m[cj_cache_idx]); + pjx.v = vec_load(&cj_cache.x[cj_cache_idx]); + pjx2.v = vec_load(&cj_cache.x[cj_cache_idx + VEC_SIZE]); + pjy.v = vec_load(&cj_cache.y[cj_cache_idx]); + pjy2.v = vec_load(&cj_cache.y[cj_cache_idx + VEC_SIZE]); + pjz.v = vec_load(&cj_cache.z[cj_cache_idx]); + pjz2.v = vec_load(&cj_cache.z[cj_cache_idx + VEC_SIZE]); + /* Compute the pairwise distance. */ v_dx.v = vec_sub(pix.v, pjx.v); v_dx2.v = vec_sub(pix.v, pjx2.v); + v2_dx.v = vec_sub(pix2.v, pjx.v); + v2_dx2.v = vec_sub(pix2.v, pjx2.v); + v_dy.v = vec_sub(piy.v, pjy.v); v_dy2.v = vec_sub(piy.v, pjy2.v); + v2_dy.v = vec_sub(piy2.v, pjy.v); + v2_dy2.v = vec_sub(piy2.v, pjy2.v); + v_dz.v = vec_sub(piz.v, pjz.v); v_dz2.v = vec_sub(piz.v, pjz2.v); + v2_dz.v = vec_sub(piz2.v, pjz.v); + v2_dz2.v = vec_sub(piz2.v, pjz2.v); v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v); + v2_r2.v = vec_mul(v2_dx.v, v2_dx.v); + v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v); + v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v); + v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v); + v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v); + v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v); vector v_doi_mask, v_doi_mask2; int doi_mask, doi_mask2; + vector v2_doi_mask, v2_doi_mask2; + int doi2_mask, doi2_mask2; + /* Form r2 < hig2 mask. */ v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); + v2_doi_mask.v = vec_cmp_lt(v2_r2.v, v_hig2_2.v); + v2_doi_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v); /* Form integer mask. */ doi_mask = vec_cmp_result(v_doi_mask.v); doi_mask2 = vec_cmp_result(v_doi_mask2.v); + doi2_mask = vec_cmp_result(v2_doi_mask.v); + doi2_mask2 = vec_cmp_result(v2_doi_mask2.v); if(doi_mask) runner_iact_nonsym_intrinsic_vec_density( @@ -1503,7 +1563,30 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell knl_mask); #else 0); -#endif +#endif + if(doi2_mask) + runner_iact_nonsym_intrinsic_vec_density( + &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hi_inv_2, v_vix2, v_viy2, v_viz2, + &cj_cache.vx[cj_cache_idx], &cj_cache.vy[cj_cache_idx], &cj_cache.vz[cj_cache_idx], + &cj_cache.m[cj_cache_idx], &rhoSum2, &rho_dhSum2, &wcountSum2, &wcount_dhSum2, + &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2, v2_doi_mask, +#ifdef HAVE_AVX512_F + knl_mask); +#else + 0); +#endif + if(doi2_mask2) + runner_iact_nonsym_intrinsic_vec_density( + &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hi_inv_2, v_vix2, v_viy2, v_viz2, + &cj_cache.vx[cj_cache_idx + VEC_SIZE], &cj_cache.vy[cj_cache_idx + VEC_SIZE], &cj_cache.vz[cj_cache_idx + VEC_SIZE], + &cj_cache.m[cj_cache_idx + VEC_SIZE], &rhoSum2, &rho_dhSum2, &wcountSum2, &wcount_dhSum2, + &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2, v2_doi_mask2, +#ifdef HAVE_AVX512_F + knl_mask); +#else + 0); +#endif + } /* loop over the parts in cj. */ /* Perform horizontal adds on vector sums and store result in particle pi. @@ -1517,6 +1600,15 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell VEC_HADD(curlvySum, pi->density.rot_v[1]); VEC_HADD(curlvzSum, pi->density.rot_v[2]); + VEC_HADD(rhoSum2, pi2->rho); + VEC_HADD(rho_dhSum2, pi2->density.rho_dh); + VEC_HADD(wcountSum2, pi2->density.wcount); + VEC_HADD(wcount_dhSum2, pi2->density.wcount_dh); + VEC_HADD(div_vSum2, pi2->density.div_v); + VEC_HADD(curlvxSum2, pi2->density.rot_v[0]); + VEC_HADD(curlvySum2, pi2->density.rot_v[1]); + VEC_HADD(curlvzSum2, pi2->density.rot_v[2]); + } /* loop over the parts in ci. */ int max_ind_i = 0; @@ -1678,6 +1770,498 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell #endif /* WITH_VECTORIZATION */ } +void runner_dopair1_density_vec_3(struct runner *r, struct cell *ci, struct cell *cj) { + +#ifdef WITH_VECTORIZATION + const struct engine *restrict e = r->e; + + int num_vec_proc = 2; + + vector v_hi, v_vix, v_viy, v_viz, v_hig2; + //vector v_hi_2, v_vix2, v_viy2, v_viz2, v_hig2_2; + + TIMER_TIC; + + /* Anything to do here? */ + if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; + +#ifdef SWIFT_DEBUG_CHECKS + cell_is_drifted(ci, e); + cell_is_drifted(cj, e); +#endif + + /* Get the sort ID. */ + double shift[3] = {0.0, 0.0, 0.0}; + const int sid = space_getsid(e->s, &ci, &cj, shift); + + /* Have the cells been sorted? */ + if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) + error("Trying to interact unsorted cells."); + + /* Get the cutoff shift. */ + double rshift = 0.0; + for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; + + /* Pick-out the sorted lists. */ + const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; + const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; + + /* Get some other useful values. */ + const int count_i = ci->count; + const int count_j = cj->count; + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + const double di_max = sort_i[count_i - 1].d - rshift; + const double dj_min = sort_j[0].d; + const float dx_max = (ci->dx_max + cj->dx_max); + + /* Get the particle cache from the runner and re-allocate + * the cache if it is not big enough for the cell. */ + struct cache *restrict ci_cache = &r->par_cache; + + if (ci_cache->count < count_i) { + cache_init(ci_cache, count_i); + } + if (cj_cache.count < count_j) { + cache_init(&cj_cache, count_j); + } + + cache_read_two_cells(ci, cj, ci_cache, &cj_cache, shift); + //cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift); + + /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ + /* For particles in ci */ + populate_max_d(ci, cj, sort_i, sort_j, ci_cache, &cj_cache, dx_max, rshift, max_di, max_dj); + + float di, dj; + + int max_ind_j = count_j - 1; + + /* Loop over the parts in ci. */ + for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) { + //for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid-=2) { + + /* Get a hold of the ith part in ci. */ + struct part *restrict pi = &parts_i[sort_i[pid].i]; + //struct part *restrict pi2 = &parts_i[sort_i[pid - 1].i]; + if (!part_is_active(pi, e)) continue; + + dj = sort_j[max_ind_j].d; + while(max_ind_j > 0 && max_di[pid] < dj) { + max_ind_j--; + + dj = sort_j[max_ind_j].d; + } + int exit_iteration = max_ind_j; + + int ci_cache_idx = sort_i[pid].i; + + const float hi = ci_cache->h[ci_cache_idx]; + //const float hi_2 = ci_cache->h[ci_cache_idx - 1]; + const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; + if (di < dj_min) continue; + + const float hig2 = hi * hi * kernel_gamma2; + //const float hig2_2 = hi_2 * hi_2 * kernel_gamma2; + + vector pix, piy, piz; + //vector pix2, piy2, piz2; + + /* Fill particle pi vectors. */ + pix.v = vec_set1(ci_cache->x[ci_cache_idx]); + piy.v = vec_set1(ci_cache->y[ci_cache_idx]); + piz.v = vec_set1(ci_cache->z[ci_cache_idx]); + v_hi.v = vec_set1(hi); + v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]); + v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]); + v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]); + + v_hig2.v = vec_set1(hig2); + + //pix2.v = vec_set1(ci_cache->x[ci_cache_idx - 1]); + //piy2.v = vec_set1(ci_cache->y[ci_cache_idx - 1]); + //piz2.v = vec_set1(ci_cache->z[ci_cache_idx - 1]); + //v_hi_2.v = vec_set1(hi_2); + //v_vix2.v = vec_set1(ci_cache->vx[ci_cache_idx - 1]); + //v_viy2.v = vec_set1(ci_cache->vy[ci_cache_idx - 1]); + //v_viz2.v = vec_set1(ci_cache->vz[ci_cache_idx - 1]); + + //v_hig2_2.v = vec_set1(hig2_2); + + /* Reset cumulative sums of update vectors. */ + vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, + curlvySum, curlvzSum; + + //vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2, curlvxSum2, + // curlvySum2, curlvzSum2; + + /* Get the inverse of hi. */ + vector v_hi_inv; + //vector v_hi_inv_2; + + v_hi_inv = vec_reciprocal(v_hi); + //v_hi_inv_2 = vec_reciprocal(v_hi_2); + + rhoSum.v = vec_setzero(); + rho_dhSum.v = vec_setzero(); + wcountSum.v = vec_setzero(); + wcount_dhSum.v = vec_setzero(); + div_vSum.v = vec_setzero(); + curlvxSum.v = vec_setzero(); + curlvySum.v = vec_setzero(); + curlvzSum.v = vec_setzero(); + + //rhoSum2.v = vec_setzero(); + //rho_dhSum2.v = vec_setzero(); + //wcountSum2.v = vec_setzero(); + //wcount_dhSum2.v = vec_setzero(); + //div_vSum2.v = vec_setzero(); + //curlvxSum2.v = vec_setzero(); + //curlvySum2.v = vec_setzero(); + //curlvzSum2.v = vec_setzero(); + + /* Pad cache if there is a serial remainder. */ + int exit_iteration_align = exit_iteration; + int rem = exit_iteration % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; + + exit_iteration_align += pad; + } + + vector pjx, pjy, pjz; + vector pjx2, pjy2, pjz2; + + /* Loop over the parts in cj. */ + for (int pjd = 0; pjd < exit_iteration_align; pjd += (num_vec_proc * VEC_SIZE)) { + + /* Get the cache index to the jth particle. */ + //int cj_cache_idx = sort_j[pjd].i; + int indices[2 * VEC_SIZE]; + + for (int i=0; i<2 * VEC_SIZE; i++) + indices[i] = sort_j[pjd + i].i; + + vector v_dx, v_dy, v_dz, v_r2; + vector v_dx2, v_dy2, v_dz2, v_r2_2; + //vector v2_dx, v2_dy, v2_dz, v2_r2; + //vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2; + + /* Load 2 sets of vectors from the particle cache. */ + //pjx.v = vec_load(&cj_cache.x[cj_cache_idx]); + //pjx2.v = vec_load(&cj_cache.x[cj_cache_idx + VEC_SIZE]); + //pjy.v = vec_load(&cj_cache.y[cj_cache_idx]); + //pjy2.v = vec_load(&cj_cache.y[cj_cache_idx + VEC_SIZE]); + //pjz.v = vec_load(&cj_cache.z[cj_cache_idx]); + //pjz2.v = vec_load(&cj_cache.z[cj_cache_idx + VEC_SIZE]); + //pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]); + //pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]); + //pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]); + //mj.v = vec_load(&cj_cache.m[cj_cache_idx]); + + pjx.v = vec_set(cj_cache.x[indices[0]], cj_cache.x[indices[1]], cj_cache.x[indices[2]], cj_cache.x[indices[3]], + cj_cache.x[indices[4]], cj_cache.x[indices[5]], cj_cache.x[indices[6]], cj_cache.x[indices[7]]); + pjx2.v = vec_set(cj_cache.x[indices[8]], cj_cache.x[indices[9]], cj_cache.x[indices[10]], cj_cache.x[indices[11]], + cj_cache.x[indices[12]], cj_cache.x[indices[13]], cj_cache.x[indices[14]], cj_cache.x[indices[15]]); + pjy.v = vec_set(cj_cache.y[indices[0]], cj_cache.y[indices[1]], cj_cache.y[indices[2]], cj_cache.y[indices[3]], + cj_cache.y[indices[4]], cj_cache.y[indices[5]], cj_cache.y[indices[6]], cj_cache.y[indices[7]]); + pjy2.v = vec_set(cj_cache.y[indices[8]], cj_cache.y[indices[9]], cj_cache.y[indices[10]], cj_cache.y[indices[11]], + cj_cache.y[indices[12]], cj_cache.y[indices[13]], cj_cache.y[indices[14]], cj_cache.y[indices[15]]); + pjz.v = vec_set(cj_cache.z[indices[0]], cj_cache.z[indices[1]], cj_cache.z[indices[2]], cj_cache.z[indices[3]], + cj_cache.z[indices[4]], cj_cache.z[indices[5]], cj_cache.z[indices[6]], cj_cache.z[indices[7]]); + pjz2.v = vec_set(cj_cache.z[indices[8]], cj_cache.z[indices[9]], cj_cache.z[indices[10]], cj_cache.z[indices[11]], + cj_cache.z[indices[12]], cj_cache.z[indices[13]], cj_cache.z[indices[14]], cj_cache.z[indices[15]]); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(pix.v, pjx.v); + v_dx2.v = vec_sub(pix.v, pjx2.v); + //v2_dx.v = vec_sub(pix2.v, pjx.v); + //v2_dx2.v = vec_sub(pix2.v, pjx2.v); + + v_dy.v = vec_sub(piy.v, pjy.v); + v_dy2.v = vec_sub(piy.v, pjy2.v); + //v2_dy.v = vec_sub(piy2.v, pjy.v); + //v2_dy2.v = vec_sub(piy2.v, pjy2.v); + + v_dz.v = vec_sub(piz.v, pjz.v); + v_dz2.v = vec_sub(piz.v, pjz2.v); + //v2_dz.v = vec_sub(piz2.v, pjz.v); + //v2_dz2.v = vec_sub(piz2.v, pjz2.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v); + //v2_r2.v = vec_mul(v2_dx.v, v2_dx.v); + //v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v); + + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v); + //v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v); + //v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v); + + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v); + //v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v); + //v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v); + + vector v_doi_mask, v_doi_mask2; + int doi_mask, doi_mask2; + + //vector v2_doi_mask, v2_doi_mask2; + //int doi2_mask, doi2_mask2; + + /* Form r2 < hig2 mask. */ + v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); + v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); + //v2_doi_mask.v = vec_cmp_lt(v2_r2.v, v_hig2_2.v); + //v2_doi_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v); + + /* Form integer mask. */ + doi_mask = vec_cmp_result(v_doi_mask.v); + doi_mask2 = vec_cmp_result(v_doi_mask2.v); + //doi2_mask = vec_cmp_result(v2_doi_mask.v); + //doi2_mask2 = vec_cmp_result(v2_doi_mask2.v); + + if(doi_mask) + runner_iact_nonsym_intrinsic_vec_2_density( + &cj_cache, &indices[0], &v_r2, &v_dx, &v_dy,&v_dz, v_hi_inv, v_vix, v_viy, v_viz, + &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doi_mask, +#ifdef HAVE_AVX512_F + knl_mask); +#else + 0); +#endif + if(doi_mask2) + runner_iact_nonsym_intrinsic_vec_2_density( + &cj_cache, &indices[VEC_SIZE], &v_r2_2, &v_dx2, &v_dy2,&v_dz2, v_hi_inv, v_vix, v_viy, v_viz, + &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doi_mask2, +#ifdef HAVE_AVX512_F + knl_mask); +#else + 0); +#endif +// if(doi2_mask) +// runner_iact_nonsym_intrinsic_vec_density( +// &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hi_inv_2, v_vix2, v_viy2, v_viz2, +// &cj_cache.vx[cj_cache_idx], &cj_cache.vy[cj_cache_idx], &cj_cache.vz[cj_cache_idx], +// &cj_cache.m[cj_cache_idx], &rhoSum2, &rho_dhSum2, &wcountSum2, &wcount_dhSum2, +// &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2, v2_doi_mask, +//#ifdef HAVE_AVX512_F +// knl_mask); +//#else +// 0); +//#endif +// if(doi2_mask2) +// runner_iact_nonsym_intrinsic_vec_density( +// &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hi_inv_2, v_vix2, v_viy2, v_viz2, +// &cj_cache.vx[cj_cache_idx + VEC_SIZE], &cj_cache.vy[cj_cache_idx + VEC_SIZE], &cj_cache.vz[cj_cache_idx + VEC_SIZE], +// &cj_cache.m[cj_cache_idx + VEC_SIZE], &rhoSum2, &rho_dhSum2, &wcountSum2, &wcount_dhSum2, +// &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2, v2_doi_mask2, +//#ifdef HAVE_AVX512_F +// knl_mask); +//#else +// 0); +//#endif + + } /* loop over the parts in cj. */ + + /* Perform horizontal adds on vector sums and store result in particle pi. + */ + VEC_HADD(rhoSum, pi->rho); + VEC_HADD(rho_dhSum, pi->density.rho_dh); + VEC_HADD(wcountSum, pi->density.wcount); + VEC_HADD(wcount_dhSum, pi->density.wcount_dh); + VEC_HADD(div_vSum, pi->density.div_v); + VEC_HADD(curlvxSum, pi->density.rot_v[0]); + VEC_HADD(curlvySum, pi->density.rot_v[1]); + VEC_HADD(curlvzSum, pi->density.rot_v[2]); + + //VEC_HADD(rhoSum2, pi2->rho); + //VEC_HADD(rho_dhSum2, pi2->density.rho_dh); + //VEC_HADD(wcountSum2, pi2->density.wcount); + //VEC_HADD(wcount_dhSum2, pi2->density.wcount_dh); + //VEC_HADD(div_vSum2, pi2->density.div_v); + //VEC_HADD(curlvxSum2, pi2->density.rot_v[0]); + //VEC_HADD(curlvySum2, pi2->density.rot_v[1]); + //VEC_HADD(curlvzSum2, pi2->density.rot_v[2]); + + } /* loop over the parts in ci. */ + + int max_ind_i = 0; + /* Loop over the parts in cj. */ + for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) { + + /* Get a hold of the jth part in cj. */ + struct part *restrict pj = &parts_j[sort_j[pjd].i]; + if (!part_is_active(pj, e)) continue; + + di = sort_i[max_ind_i].d; + while(max_ind_i < count_i - 1 && max_dj[pjd] > di) { + max_ind_i++; + + di = sort_i[max_ind_i].d; + } + int exit_iteration = max_ind_i; + + int cj_cache_idx = pjd; + + const float hj = cj_cache.h[cj_cache_idx]; + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; + if (dj > di_max) continue; + + const float hjg2 = hj * hj * kernel_gamma2; + + vector pjx, pjy, pjz; + vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2; + + /* Fill particle pi vectors. */ + pjx.v = vec_set1(cj_cache.x[cj_cache_idx]); + pjy.v = vec_set1(cj_cache.y[cj_cache_idx]); + pjz.v = vec_set1(cj_cache.z[cj_cache_idx]); + v_hj.v = vec_set1(hj); + v_vjx.v = vec_set1(cj_cache.vx[cj_cache_idx]); + v_vjy.v = vec_set1(cj_cache.vy[cj_cache_idx]); + v_vjz.v = vec_set1(cj_cache.vz[cj_cache_idx]); + + v_hjg2.v = vec_set1(hjg2); + + /* Reset cumulative sums of update vectors. */ + vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, + curlvySum, curlvzSum; + + /* Get the inverse of hj. */ + vector v_hj_inv; + + v_hj_inv = vec_reciprocal(v_hj); + + rhoSum.v = vec_setzero(); + rho_dhSum.v = vec_setzero(); + wcountSum.v = vec_setzero(); + wcount_dhSum.v = vec_setzero(); + div_vSum.v = vec_setzero(); + curlvxSum.v = vec_setzero(); + curlvySum.v = vec_setzero(); + curlvzSum.v = vec_setzero(); + + /* Pad cache if there is a serial remainder. */ + int exit_iteration_align = exit_iteration; + int rem = exit_iteration % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; + + exit_iteration_align -= pad; + } + + vector pix, piy, piz; + vector pix2, piy2, piz2; + //vector pivx, pivy, pivz, mi; + + /* Loop over the parts in ci. */ + for (int pid = count_i - 1; pid >= 0; pid -= (num_vec_proc * VEC_SIZE)) { + + int indices[2 * VEC_SIZE]; + + for (int i=(2 * VEC_SIZE) - 1; i>=0; i--) + indices[i] = sort_j[pid - i].i; + + /* Get the cache index to the ith particle. */ + //int ci_cache_idx = sort_i[pid].i; + + vector v_dx, v_dy, v_dz, v_r2; + vector v_dx2, v_dy2, v_dz2, v_r2_2; + + /* Load 2 sets of vectors from the particle cache. */ + //pix.v = vec_load(&ci_cache->x[ci_cache_idx]); + //pix2.v = vec_load(&ci_cache->x[ci_cache_idx - VEC_SIZE]); + //piy.v = vec_load(&ci_cache->y[ci_cache_idx]); + //piy2.v = vec_load(&ci_cache->y[ci_cache_idx - VEC_SIZE]); + //piz.v = vec_load(&ci_cache->z[ci_cache_idx]); + //piz2.v = vec_load(&ci_cache->z[ci_cache_idx - VEC_SIZE]); + //pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]); + //pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]); + //pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]); + //mi.v = vec_load(&ci_cache->m[ci_cache_idx]); + + pix.v = vec_set(ci_cache->x[indices[0]], ci_cache->x[indices[1]], ci_cache->x[indices[2]], ci_cache->x[indices[3]], + ci_cache->x[indices[4]], ci_cache->x[indices[5]], ci_cache->x[indices[6]], ci_cache->x[indices[7]]); + pix2.v = vec_set(ci_cache->x[indices[8]], ci_cache->x[indices[9]], ci_cache->x[indices[10]], ci_cache->x[indices[11]], + ci_cache->x[indices[12]], ci_cache->x[indices[13]], ci_cache->x[indices[14]], ci_cache->x[indices[15]]); + piy.v = vec_set(ci_cache->y[indices[0]], ci_cache->y[indices[1]], ci_cache->y[indices[2]], ci_cache->y[indices[3]], + ci_cache->y[indices[4]], ci_cache->y[indices[5]], ci_cache->y[indices[6]], ci_cache->y[indices[7]]); + piy2.v = vec_set(ci_cache->y[indices[8]], ci_cache->y[indices[9]], ci_cache->y[indices[10]], ci_cache->y[indices[11]], + ci_cache->y[indices[12]], ci_cache->y[indices[13]], ci_cache->y[indices[14]], ci_cache->y[indices[15]]); + piz.v = vec_set(ci_cache->z[indices[0]], ci_cache->z[indices[1]], ci_cache->z[indices[2]], ci_cache->z[indices[3]], + ci_cache->z[indices[4]], ci_cache->z[indices[5]], ci_cache->z[indices[6]], ci_cache->z[indices[7]]); + piz2.v = vec_set(ci_cache->z[indices[8]], ci_cache->z[indices[9]], ci_cache->z[indices[10]], ci_cache->z[indices[11]], + ci_cache->z[indices[12]], ci_cache->z[indices[13]], ci_cache->z[indices[14]], ci_cache->z[indices[15]]); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(pjx.v, pix.v); + v_dx2.v = vec_sub(pjx.v, pix2.v); + v_dy.v = vec_sub(pjy.v, piy.v); + v_dy2.v = vec_sub(pjy.v, piy2.v); + v_dz.v = vec_sub(pjz.v, piz.v); + v_dz2.v = vec_sub(pjz.v, piz2.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v); + + vector v_doj_mask, v_doj_mask2; + int doj_mask, doj_mask2; + + /* Form r2 < hig2 mask. */ + v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v); + v_doj_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2.v); + + /* Form integer mask. */ + doj_mask = vec_cmp_result(v_doj_mask.v); + doj_mask2 = vec_cmp_result(v_doj_mask2.v); + + /* Perform interaction with 2 vectors. */ + if (doj_mask) + runner_iact_nonsym_intrinsic_vec_2_density( + ci_cache, &indices[0], &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz, + &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doj_mask, +#ifdef HAVE_AVX512_F + knl_mask); +#else + 0); +#endif + if (doj_mask2) + runner_iact_nonsym_intrinsic_vec_2_density( + ci_cache, &indices[VEC_SIZE], &v_r2_2, &v_dx2, &v_dy2, &v_dz2, v_hj_inv, v_vjx, v_vjy, v_vjz, + &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doj_mask2, +#ifdef HAVE_AVX512_F + knl_mask); +#else + 0); +#endif + } /* loop over the parts in cj. */ + + /* Perform horizontal adds on vector sums and store result in particle pi. + */ + VEC_HADD(rhoSum, pj->rho); + VEC_HADD(rho_dhSum, pj->density.rho_dh); + VEC_HADD(wcountSum, pj->density.wcount); + VEC_HADD(wcount_dhSum, pj->density.wcount_dh); + VEC_HADD(div_vSum, pj->density.div_v); + VEC_HADD(curlvxSum, pj->density.rot_v[0]); + VEC_HADD(curlvySum, pj->density.rot_v[1]); + VEC_HADD(curlvzSum, pj->density.rot_v[2]); + + } /* loop over the parts in ci. */ + + TIMER_TOC(timer_dopair_density); + +#endif /* WITH_VECTORIZATION */ +} /** * @brief Compute the interactions between a cell pair (non-symmetric). *