diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 90b612684d7a8296f0d3802ec539b3b769a0e877..9f9db24d8e27e6ab64afc75fcd207b67aa34597c 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -604,370 +604,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( #endif /* WITH_VECTORIZATION */ } -/** - * @brief Compute the cell self-interaction (non-symmetric) using vector - * intrinsics with two particle pis at a time. - * - * CURRENTLY BROKEN DO NOT USE. - * - * @param r The #runner. - * @param c The #cell. - */ -__attribute__((always_inline)) INLINE void runner_doself1_density_vec_2( - struct runner *r, struct cell *restrict c) { - -#ifdef WITH_VECTORIZATION - const struct engine *e = r->e; - int doi_mask; - int doi2_mask; - struct part *restrict pi; - struct part *restrict pi2; - int count_align; - - vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2; - vector v_hi2, v_vix2, v_viy2, v_viz2, v_hig2_2, v2_r2; - - TIMER_TIC - - if (!cell_is_active(c, e)) return; - - if (!cell_is_drifted(c, e)) cell_drift_particles(c, e); - - /* TODO: Need to find two active particles, not just one. */ - - struct part *restrict parts = c->parts; - const int count = c->count; - - /* Get the particle cache from the runner and re-allocate - * the cache if it is not big enough for the cell. */ - struct cache *restrict cell_cache = &r->ci_cache; - - if (cell_cache->count < count) { - cache_init(cell_cache, count); - } - - /* Read the particles from the cell and store them locally in the cache. */ - cache_read_particles(c, &r->ci_cache); - - /* Create two secondary caches. */ - int icount = 0, icount_align = 0; - struct c2_cache int_cache; - - int icount2 = 0, icount_align2 = 0; - struct c2_cache int_cache2; - - /* Loop over the particles in the cell. */ - for (int pid = 0; pid < count; pid += 2) { - - /* Get a pointer to the ith particle and next i particle. */ - pi = &parts[pid]; - pi2 = &parts[pid + 1]; - - /* Is the ith particle active? */ - if (!part_is_active(pi, e)) continue; - - vector pix, piy, piz; - vector pix2, piy2, piz2; - - const float hi = cell_cache->h[pid]; - const float hi2 = cell_cache->h[pid + 1]; - - /* Fill pi position vector. */ - pix.v = vec_set1(cell_cache->x[pid]); - piy.v = vec_set1(cell_cache->y[pid]); - piz.v = vec_set1(cell_cache->z[pid]); - v_hi.v = vec_set1(hi); - v_vix.v = vec_set1(cell_cache->vx[pid]); - v_viy.v = vec_set1(cell_cache->vy[pid]); - v_viz.v = vec_set1(cell_cache->vz[pid]); - - pix2.v = vec_set1(cell_cache->x[pid + 1]); - piy2.v = vec_set1(cell_cache->y[pid + 1]); - piz2.v = vec_set1(cell_cache->z[pid + 1]); - v_hi2.v = vec_set1(hi2); - v_vix2.v = vec_set1(cell_cache->vx[pid + 1]); - v_viy2.v = vec_set1(cell_cache->vy[pid + 1]); - v_viz2.v = vec_set1(cell_cache->vz[pid + 1]); - - const float hig2 = hi * hi * kernel_gamma2; - const float hig2_2 = hi2 * hi2 * kernel_gamma2; - v_hig2.v = vec_set1(hig2); - v_hig2_2.v = vec_set1(hig2_2); - - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; - vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2, - curlvxSum2, curlvySum2, curlvzSum2; - - vector v_hi_inv, v_hi_inv2; - - v_hi_inv = vec_reciprocal(v_hi); - v_hi_inv2 = vec_reciprocal(v_hi2); - - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); - - rhoSum2.v = vec_setzero(); - rho_dhSum2.v = vec_setzero(); - wcountSum2.v = vec_setzero(); - wcount_dhSum2.v = vec_setzero(); - div_vSum2.v = vec_setzero(); - curlvxSum2.v = vec_setzero(); - curlvySum2.v = vec_setzero(); - curlvzSum2.v = vec_setzero(); - - /* Pad cache if there is a serial remainder. */ - count_align = count; - int rem = count % (NUM_VEC_PROC * VEC_SIZE); - if (rem != 0) { - int pad = (NUM_VEC_PROC * VEC_SIZE) - rem; - - count_align += pad; - /* Set positions to the same as particle pi so when the r2 > 0 mask is - * applied these extra contributions are masked out.*/ - for (int i = count; i < count_align; i++) { - cell_cache->x[i] = pix.f[0]; - cell_cache->y[i] = piy.f[0]; - cell_cache->z[i] = piz.f[0]; - } - } - - vector pjx, pjy, pjz; - vector pjvx, pjvy, pjvz, mj; - vector pjx2, pjy2, pjz2; - vector pjvx2, pjvy2, pjvz2, mj2; - - /* Find all of particle pi's interacions and store needed values in - * secondary cache.*/ - for (int pjd = 0; pjd < count_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { - - /* Load 2 sets of vectors from the particle cache. */ - pjx.v = vec_load(&cell_cache->x[pjd]); - pjy.v = vec_load(&cell_cache->y[pjd]); - pjz.v = vec_load(&cell_cache->z[pjd]); - pjvx.v = vec_load(&cell_cache->vx[pjd]); - pjvy.v = vec_load(&cell_cache->vy[pjd]); - pjvz.v = vec_load(&cell_cache->vz[pjd]); - mj.v = vec_load(&cell_cache->m[pjd]); - - pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); - pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); - pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); - pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]); - pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]); - pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]); - mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]); - - /* Compute the pairwise distance. */ - vector v_dx_tmp, v_dy_tmp, v_dz_tmp; - vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2; - vector v_dx2_tmp, v_dy2_tmp, v_dz2_tmp; - vector v_dx2_tmp2, v_dy2_tmp2, v_dz2_tmp2, v2_r2_2; - - v_dx_tmp.v = vec_sub(pix.v, pjx.v); - v_dy_tmp.v = vec_sub(piy.v, pjy.v); - v_dz_tmp.v = vec_sub(piz.v, pjz.v); - v_dx_tmp2.v = vec_sub(pix.v, pjx2.v); - v_dy_tmp2.v = vec_sub(piy.v, pjy2.v); - v_dz_tmp2.v = vec_sub(piz.v, pjz2.v); - - v_dx2_tmp.v = vec_sub(pix2.v, pjx.v); - v_dy2_tmp.v = vec_sub(piy2.v, pjy.v); - v_dz2_tmp.v = vec_sub(piz2.v, pjz.v); - v_dx2_tmp2.v = vec_sub(pix2.v, pjx2.v); - v_dy2_tmp2.v = vec_sub(piy2.v, pjy2.v); - v_dz2_tmp2.v = vec_sub(piz2.v, pjz2.v); - - v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); - v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); - v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); - v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v); - v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v); - v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v); - - v2_r2.v = vec_mul(v_dx2_tmp.v, v_dx2_tmp.v); - v2_r2.v = vec_fma(v_dy2_tmp.v, v_dy2_tmp.v, v2_r2.v); - v2_r2.v = vec_fma(v_dz2_tmp.v, v_dz2_tmp.v, v2_r2.v); - v2_r2_2.v = vec_mul(v_dx2_tmp2.v, v_dx2_tmp2.v); - v2_r2_2.v = vec_fma(v_dy2_tmp2.v, v_dy2_tmp2.v, v2_r2_2.v); - v2_r2_2.v = vec_fma(v_dz2_tmp2.v, v_dz2_tmp2.v, v2_r2_2.v); - -/* Form a mask from r2 < hig2 and r2 > 0.*/ -#ifdef HAVE_AVX512_F - // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check; - KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check; - KNL_MASK_16 doi2_mask_check, doi2_mask2, doi2_mask2_check; - - doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero()); - doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v); - - doi2_mask_check = vec_cmp_gt(v2_r2.v, vec_setzero()); - doi2_mask = vec_cmp_lt(v2_r2.v, v_hig2_2.v); - - doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero()); - doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v); - - doi2_mask2_check = vec_cmp_gt(v2_r2_2.v, vec_setzero()); - doi2_mask2 = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v); - - doi_mask = doi_mask & doi_mask_check; - doi_mask2 = doi_mask2 & doi_mask2_check; - - doi2_mask = doi2_mask & doi2_mask_check; - doi2_mask2 = doi2_mask2 & doi2_mask2_check; -#else - vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check; - int doi_mask2; - - vector v_doi2_mask, v_doi2_mask_check, v_doi2_mask2, v_doi2_mask2_check; - int doi2_mask2; - - v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero()); - v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); - - v_doi2_mask_check.v = vec_cmp_gt(v2_r2.v, vec_setzero()); - v_doi2_mask.v = vec_cmp_lt(v2_r2.v, v_hig2_2.v); - - v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero()); - v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); - - v_doi2_mask2_check.v = vec_cmp_gt(v2_r2_2.v, vec_setzero()); - v_doi2_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v); - - doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v)); - doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v)); - doi2_mask = vec_cmp_result(vec_and(v_doi2_mask.v, v_doi2_mask_check.v)); - doi2_mask2 = - vec_cmp_result(vec_and(v_doi2_mask2.v, v_doi2_mask2_check.v)); -#endif /* HAVE_AVX512_F */ - - /* Hit or miss? */ - // if (doi_mask) { - storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, - &mj, &pjvx, &pjvy, &pjvz, cell_cache, &int_cache, - &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, - v_vix, v_viy, v_viz); - //} - // if (doi2_mask) { - storeInteractions( - doi2_mask, pjd, &v2_r2, &v_dx2_tmp, &v_dy2_tmp, &v_dz2_tmp, &mj, - &pjvx, &pjvy, &pjvz, cell_cache, &int_cache2, &icount2, &rhoSum2, - &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, - &curlvySum2, &curlvzSum2, v_hi_inv2, v_vix2, v_viy2, v_viz2); - //} - /* Hit or miss? */ - // if (doi_mask2) { - storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, - &v_dy_tmp2, &v_dz_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2, - cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum, - &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); - //} - // if (doi2_mask2) { - storeInteractions(doi2_mask2, pjd + VEC_SIZE, &v2_r2_2, &v_dx2_tmp2, - &v_dy2_tmp2, &v_dz2_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2, - cell_cache, &int_cache2, &icount2, &rhoSum2, - &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, - &curlvxSum2, &curlvySum2, &curlvzSum2, v_hi_inv2, - v_vix2, v_viy2, v_viz2); - //} - } - - /* Perform padded vector remainder interactions if any are present. */ - calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, - &icount_align); - - calcRemInteractions(&int_cache2, icount2, &rhoSum2, &rho_dhSum2, - &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, - &curlvySum2, &curlvzSum2, v_hi_inv2, v_vix2, v_viy2, - v_viz2, &icount_align2); - - /* Initialise masks to true incase remainder interactions have been - * performed. */ - vector int_mask, int_mask2; - vector int2_mask, int2_mask2; -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask = 0xFFFF; - KNL_MASK_16 knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); - int2_mask.m = vec_setint1(0xFFFFFFFF); - int2_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); - - int2_mask.m = vec_setint1(0xFFFFFFFF); - int2_mask2.m = vec_setint1(0xFFFFFFFF); -#endif - - /* Perform interaction with 2 vectors. */ - for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { - runner_iact_nonsym_2_vec_density( - &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], - &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, - &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], - &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); -#endif - } - - for (int pjd = 0; pjd < icount_align2; pjd += (NUM_VEC_PROC * VEC_SIZE)) { - runner_iact_nonsym_2_vec_density( - &int_cache2.r2q[pjd], &int_cache2.dxq[pjd], &int_cache2.dyq[pjd], - &int_cache2.dzq[pjd], v_hi_inv2, v_vix2, v_viy2, v_viz2, - &int_cache2.vxq[pjd], &int_cache2.vyq[pjd], &int_cache2.vzq[pjd], - &int_cache2.mq[pjd], &rhoSum2, &rho_dhSum2, &wcountSum2, - &wcount_dhSum2, &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2, - int2_mask, int2_mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); -#endif - } - /* Perform horizontal adds on vector sums and store result in particle pi. - */ - VEC_HADD(rhoSum, pi->rho); - VEC_HADD(rho_dhSum, pi->density.rho_dh); - VEC_HADD(wcountSum, pi->density.wcount); - VEC_HADD(wcount_dhSum, pi->density.wcount_dh); - VEC_HADD(div_vSum, pi->density.div_v); - VEC_HADD(curlvxSum, pi->density.rot_v[0]); - VEC_HADD(curlvySum, pi->density.rot_v[1]); - VEC_HADD(curlvzSum, pi->density.rot_v[2]); - - VEC_HADD(rhoSum2, pi2->rho); - VEC_HADD(rho_dhSum2, pi2->density.rho_dh); - VEC_HADD(wcountSum2, pi2->density.wcount); - VEC_HADD(wcount_dhSum2, pi2->density.wcount_dh); - VEC_HADD(div_vSum2, pi2->density.div_v); - VEC_HADD(curlvxSum2, pi2->density.rot_v[0]); - VEC_HADD(curlvySum2, pi2->density.rot_v[1]); - VEC_HADD(curlvzSum2, pi2->density.rot_v[2]); - - /* Reset interaction count. */ - icount = 0; - icount2 = 0; - } /* loop over all particles. */ - - TIMER_TOC(timer_doself_density); -#endif /* WITH_VECTORIZATION */ -} - /** * @brief Compute the density interactions between a cell pair (non-symmetric) * using vector intrinsics.