diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index b9d010f3aafcc9dc3833a677215c4c82f4da9369..1b1bfe538509a561444791bf8041e0d2399486fb 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -1303,6 +1303,335 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * #endif /* WITH_VECTORIZATION */ } +void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell *cj) { + +#ifdef WITH_VECTORIZATION + const struct engine *restrict e = r->e; + + int num_vec_proc = 1; + + vector v_hi, v_vix, v_viy, v_viz, v_hig2; + + TIMER_TIC; + + /* Anything to do here? */ + if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; + +#ifdef SWIFT_DEBUG_CHECKS + cell_is_drifted(ci, e); + cell_is_drifted(cj, e); +#endif + + /* Get the sort ID. */ + double shift[3] = {0.0, 0.0, 0.0}; + const int sid = space_getsid(e->s, &ci, &cj, shift); + + /* Have the cells been sorted? */ + if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) + error("Trying to interact unsorted cells."); + + /* Get the cutoff shift. */ + double rshift = 0.0; + for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; + + /* Pick-out the sorted lists. */ + const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; + const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; + + /* Get some other useful values. */ + const int count_i = ci->count; + const int count_j = cj->count; + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + const double di_max = sort_i[count_i - 1].d - rshift; + const double dj_min = sort_j[0].d; + const float dx_max = (ci->dx_max + cj->dx_max); + + /* Get the particle cache from the runner and re-allocate + * the cache if it is not big enough for the cell. */ + struct cache *restrict ci_cache = &r->par_cache; + + if (ci_cache->count < count_i) { + cache_init(ci_cache, count_i); + } + if (cj_cache.count < count_j) { + cache_init(&cj_cache, count_j); + } + + //cache_read_two_cells(ci, cj, ci_cache, &cj_cache, shift); + cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift); + + /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ + /* For particles in ci */ + populate_max_d(ci, cj, sort_i, sort_j, ci_cache, &cj_cache, dx_max, rshift, max_di, max_dj); + + float di, dj; + + int max_ind_j = count_j - 1; + + /* Loop over the parts in ci. */ + for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) { + + /* Get a hold of the ith part in ci. */ + struct part *restrict pi = &parts_i[sort_i[pid].i]; + if (!part_is_active(pi, e)) continue; + + dj = sort_j[max_ind_j].d; + while(max_ind_j > 0 && max_di[pid] < dj) { + max_ind_j--; + + dj = sort_j[max_ind_j].d; + } + int exit_iteration = max_ind_j; + + int ci_cache_idx = pid; //sort_i[pid].i; + + const float hi = ci_cache->h[ci_cache_idx]; + const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; + if (di < dj_min) continue; + + const float hig2 = hi * hi * kernel_gamma2; + + vector pix, piy, piz; + + /* Fill particle pi vectors. */ + pix.v = vec_set1(ci_cache->x[ci_cache_idx]); + piy.v = vec_set1(ci_cache->y[ci_cache_idx]); + piz.v = vec_set1(ci_cache->z[ci_cache_idx]); + v_hi.v = vec_set1(hi); + v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]); + v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]); + v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]); + + v_hig2.v = vec_set1(hig2); + + /* Reset cumulative sums of update vectors. */ + vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, + curlvySum, curlvzSum; + + /* Get the inverse of hi. */ + vector v_hi_inv; + + v_hi_inv = vec_reciprocal(v_hi); + + rhoSum.v = vec_setzero(); + rho_dhSum.v = vec_setzero(); + wcountSum.v = vec_setzero(); + wcount_dhSum.v = vec_setzero(); + div_vSum.v = vec_setzero(); + curlvxSum.v = vec_setzero(); + curlvySum.v = vec_setzero(); + curlvzSum.v = vec_setzero(); + + /* Pad cache if there is a serial remainder. */ + int exit_iteration_align = exit_iteration; + int rem = exit_iteration % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; + + exit_iteration_align += pad; + } + + vector pjx, pjy, pjz; + + /* Loop over the parts in cj. */ + for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) { + + /* Get the cache index to the jth particle. */ + int cj_cache_idx = pjd; //sort_j[pjd].i; + + vector v_dx, v_dy, v_dz, v_r2; + + /* Load 2 sets of vectors from the particle cache. */ + pjx.v = vec_load(&cj_cache.x[cj_cache_idx]); + pjy.v = vec_load(&cj_cache.y[cj_cache_idx]); + pjz.v = vec_load(&cj_cache.z[cj_cache_idx]); + //pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]); + //pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]); + //pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]); + //mj.v = vec_load(&cj_cache.m[cj_cache_idx]); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(pix.v, pjx.v); + v_dy.v = vec_sub(piy.v, pjy.v); + v_dz.v = vec_sub(piz.v, pjz.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + + vector v_doi_mask; + int doi_mask; + + /* Form r2 < hig2 mask. */ + v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); + + /* Form integer mask. */ + doi_mask = vec_cmp_result(v_doi_mask.v); + + if(doi_mask) + runner_iact_nonsym_intrinsic_vec_density( + &v_r2, &v_dx, &v_dy,&v_dz, v_hi_inv, v_vix, v_viy, v_viz, + &cj_cache.vx[cj_cache_idx], &cj_cache.vy[cj_cache_idx], &cj_cache.vz[cj_cache_idx], + &cj_cache.m[cj_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doi_mask, +#ifdef HAVE_AVX512_F + knl_mask); +#else + 0); +#endif + + } /* loop over the parts in cj. */ + + /* Perform horizontal adds on vector sums and store result in particle pi. + */ + VEC_HADD(rhoSum, pi->rho); + VEC_HADD(rho_dhSum, pi->density.rho_dh); + VEC_HADD(wcountSum, pi->density.wcount); + VEC_HADD(wcount_dhSum, pi->density.wcount_dh); + VEC_HADD(div_vSum, pi->density.div_v); + VEC_HADD(curlvxSum, pi->density.rot_v[0]); + VEC_HADD(curlvySum, pi->density.rot_v[1]); + VEC_HADD(curlvzSum, pi->density.rot_v[2]); + + } /* loop over the parts in ci. */ + + int max_ind_i = 0; + /* Loop over the parts in cj. */ + for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) { + + /* Get a hold of the jth part in cj. */ + struct part *restrict pj = &parts_j[sort_j[pjd].i]; + if (!part_is_active(pj, e)) continue; + + di = sort_i[max_ind_i].d; + while(max_ind_i < count_i - 1 && max_dj[pjd] > di) { + max_ind_i++; + + di = sort_i[max_ind_i].d; + } + int exit_iteration = max_ind_i; + + int cj_cache_idx = pjd; + + const float hj = cj_cache.h[cj_cache_idx]; + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; + if (dj > di_max) continue; + + const float hjg2 = hj * hj * kernel_gamma2; + + vector pjx, pjy, pjz; + vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2; + + /* Fill particle pi vectors. */ + pjx.v = vec_set1(cj_cache.x[cj_cache_idx]); + pjy.v = vec_set1(cj_cache.y[cj_cache_idx]); + pjz.v = vec_set1(cj_cache.z[cj_cache_idx]); + v_hj.v = vec_set1(hj); + v_vjx.v = vec_set1(cj_cache.vx[cj_cache_idx]); + v_vjy.v = vec_set1(cj_cache.vy[cj_cache_idx]); + v_vjz.v = vec_set1(cj_cache.vz[cj_cache_idx]); + + v_hjg2.v = vec_set1(hjg2); + + /* Reset cumulative sums of update vectors. */ + vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, + curlvySum, curlvzSum; + + /* Get the inverse of hj. */ + vector v_hj_inv; + + v_hj_inv = vec_reciprocal(v_hj); + + rhoSum.v = vec_setzero(); + rho_dhSum.v = vec_setzero(); + wcountSum.v = vec_setzero(); + wcount_dhSum.v = vec_setzero(); + div_vSum.v = vec_setzero(); + curlvxSum.v = vec_setzero(); + curlvySum.v = vec_setzero(); + curlvzSum.v = vec_setzero(); + + /* Pad cache if there is a serial remainder. */ + int exit_iteration_align = exit_iteration; + int rem = exit_iteration % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; + + exit_iteration_align -= pad; + } + + vector pix, piy, piz; + //vector pivx, pivy, pivz, mi; + + /* Loop over the parts in ci. */ + for (int pid = exit_iteration_align; pid < count_i; pid += VEC_SIZE) { + + /* Get the cache index to the ith particle. */ + int ci_cache_idx = pid; //sort_i[pid].i; + + vector v_dx, v_dy, v_dz, v_r2; + + /* Load 2 sets of vectors from the particle cache. */ + pix.v = vec_load(&ci_cache->x[ci_cache_idx]); + piy.v = vec_load(&ci_cache->y[ci_cache_idx]); + piz.v = vec_load(&ci_cache->z[ci_cache_idx]); + //pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]); + //pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]); + //pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]); + //mi.v = vec_load(&ci_cache->m[ci_cache_idx]); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(pjx.v, pix.v); + v_dy.v = vec_sub(pjy.v, piy.v); + v_dz.v = vec_sub(pjz.v, piz.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + + vector v_doj_mask; + int doj_mask; + + /* Form r2 < hig2 mask. */ + v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v); + + /* Form integer mask. */ + doj_mask = vec_cmp_result(v_doj_mask.v); + + /* Perform interaction with 2 vectors. */ + if (doj_mask) + runner_iact_nonsym_intrinsic_vec_density( + &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz, + &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], &ci_cache->vz[ci_cache_idx], + &ci_cache->m[ci_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doj_mask, +#ifdef HAVE_AVX512_F + knl_mask); +#else + 0); +#endif + + } /* loop over the parts in cj. */ + + /* Perform horizontal adds on vector sums and store result in particle pi. + */ + VEC_HADD(rhoSum, pj->rho); + VEC_HADD(rho_dhSum, pj->density.rho_dh); + VEC_HADD(wcountSum, pj->density.wcount); + VEC_HADD(wcount_dhSum, pj->density.wcount_dh); + VEC_HADD(div_vSum, pj->density.div_v); + VEC_HADD(curlvxSum, pj->density.rot_v[0]); + VEC_HADD(curlvySum, pj->density.rot_v[1]); + VEC_HADD(curlvzSum, pj->density.rot_v[2]); + + } /* loop over the parts in ci. */ + + TIMER_TOC(timer_dopair_density); + +#endif /* WITH_VECTORIZATION */ +} + /* Similar to AUTO-VEC but process 2 pi at a time and use two vectors in interaction loop. */ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell *cj) { diff --git a/tests/test27cells.c b/tests/test27cells.c index 89953c5bcac2e75ebe1af2b74a11fcc67a3d2c11..9b590a00cafcc8c9928aa04ee0dda03739cfc5dc 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -47,6 +47,11 @@ #define DOPAIR1_NAME "runner_dopair1_density_vec" #endif +#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC_1) +#define DOPAIR1 runner_dopair1_density_vec_1 +#define DOPAIR1_NAME "runner_dopair1_density_vec_1" +#endif + #if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC_2) #define DOPAIR1 runner_dopair1_density_vec_2 #define DOPAIR1_NAME "runner_dopair1_density_vec_2" @@ -315,6 +320,7 @@ int check_results(struct part *serial_parts, struct part *vec_parts, int count, /* Just a forward declaration... */ void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *cj); +void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell *cj); void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell *cj); void runner_dopair1_density_vec_3(struct runner *r, struct cell *ci, struct cell *cj); void runner_dopair1_density_auto_vec(struct runner *r, struct cell *ci, struct cell *cj);