diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 553843f5303446499474a8285a3fbe234fac02a5..3be71c78167fc67b6491ea2cc43a3b912ea5fdcf 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -225,261 +225,6 @@ __attribute__((always_inline)) INLINE static void storeInteractions( } } -/** - * @brief Compute the vector remainder force interactions from the secondary - * cache. - * - * @param int_cache (return) secondary #cache of interactions between two - * particles. - * @param icount Interaction count. - * @param a_hydro_xSum (return) #vector holding the cumulative sum of the x - * acceleration - * update on pi. - * @param a_hydro_ySum (return) #vector holding the cumulative sum of the y - * acceleration - * update on pi. - * @param a_hydro_zSum (return) #vector holding the cumulative sum of the z - * acceleration - * update on pi. - * @param h_dtSum (return) #vector holding the cumulative sum of the time - * derivative of the smoothing length update on pi. - * @param v_sigSum (return) #vector holding the maximum of the signal velocity - * update on pi. - * @param entropy_dtSum (return) #vector holding the cumulative sum of the time - * derivative of the entropy - * update on pi. - * @param v_hi_inv #vector of 1/h for pi. - * @param v_vix #vector of x velocity of pi. - * @param v_viy #vector of y velocity of pi. - * @param v_viz #vector of z velocity of pi. - * @param v_rhoi #vector of density of pi. - * @param v_grad_hi #vector of smoothing length gradient of pi. - * @param v_pOrhoi2 #vector of pressure over density squared of pi. - * @param v_balsara_i #vector of balsara switch of pi. - * @param v_ci #vector of sound speed of pi. - * @param icount_align (return) Interaction count after the remainder - * @param num_vec_proc #int of the number of vectors to use to perform - * interaction. - * interactions have been performed, should be a multiple of the vector length. - */ -__attribute__((always_inline)) INLINE static void calcRemForceInteractions( - struct c2_cache *const int_cache, const int icount, vector *a_hydro_xSum, - vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, - vector *v_sigSum, vector *entropy_dtSum, vector v_hi_inv, vector v_vix, - vector v_viy, vector v_viz, vector v_rhoi, vector v_grad_hi, - vector v_pOrhoi2, vector v_balsara_i, vector v_ci, int *icount_align, - int num_vec_proc) { - - mask_t int_mask, int_mask2; - - /* Work out the number of remainder interactions and pad secondary cache. */ - *icount_align = icount; - int rem = icount % (num_vec_proc * VEC_SIZE); - if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; - *icount_align += pad; - - /* Initialise masks to true. */ - vec_init_mask_true(int_mask); - vec_init_mask_true(int_mask2); - - /* Pad secondary cache so that there are no contributions in the interaction - * function. */ - for (int i = icount; i < *icount_align; i++) { - int_cache->mq[i] = 0.f; - int_cache->r2q[i] = 1.f; - int_cache->dxq[i] = 0.f; - int_cache->dyq[i] = 0.f; - int_cache->dzq[i] = 0.f; - int_cache->vxq[i] = 0.f; - int_cache->vyq[i] = 0.f; - int_cache->vzq[i] = 0.f; - int_cache->rhoq[i] = 1.f; - int_cache->grad_hq[i] = 1.f; - int_cache->pOrho2q[i] = 1.f; - int_cache->balsaraq[i] = 1.f; - int_cache->soundspeedq[i] = 1.f; - int_cache->h_invq[i] = 1.f; - } - - /* Zero parts of mask that represent the padded values.*/ - if (pad < VEC_SIZE) { - vec_pad_mask(int_mask2, pad); - } else { - vec_pad_mask(int_mask, VEC_SIZE - rem); - vec_zero_mask(int_mask2); - } - - /* Perform remainder interaction and remove remainder from aligned - * interaction count. */ - *icount_align = icount - rem; - runner_iact_nonsym_2_vec_force( - &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align], - &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], v_vix, - v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, - &int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align], - &int_cache->vzq[*icount_align], &int_cache->rhoq[*icount_align], - &int_cache->grad_hq[*icount_align], &int_cache->pOrho2q[*icount_align], - &int_cache->balsaraq[*icount_align], - &int_cache->soundspeedq[*icount_align], &int_cache->mq[*icount_align], - v_hi_inv, &int_cache->h_invq[*icount_align], a_hydro_xSum, a_hydro_ySum, - a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask, int_mask2, 1); - } -} - -/** - * @brief Left-packs the values needed by an interaction into the secondary - * cache (Supports AVX, AVX2 and AVX512 instruction sets). - * - * @param mask Contains which particles need to interact. - * @param pjd Index of the particle to store into. - * @param v_r2 #vector of the separation between two particles squared. - * @param v_dx #vector of the x separation between two particles. - * @param v_dy #vector of the y separation between two particles. - * @param v_dz #vector of the z separation between two particles. - * @param cell_cache #cache of all particles in the cell. - * @param int_cache (return) secondary #cache of interactions between two - * particles. - * @param icount Interaction count. - * @param a_hydro_xSum (return) #vector holding the cumulative sum of the x - * acceleration - * update on pi. - * @param a_hydro_ySum (return) #vector holding the cumulative sum of the y - * acceleration - * update on pi. - * @param a_hydro_zSum (return) #vector holding the cumulative sum of the z - * acceleration - * update on pi. - * @param h_dtSum (return) #vector holding the cumulative sum of the time - * derivative of the smoothing length update on pi. - * @param v_sigSum (return) #vector holding the maximum of the signal velocity - * update on pi. - * @param entropy_dtSum (return) #vector holding the cumulative sum of the time - * derivative of the entropy - * update on pi. - * @param v_hi_inv #vector of 1/h for pi. - * @param v_vix #vector of x velocity of pi. - * @param v_viy #vector of y velocity of pi. - * @param v_viz #vector of z velocity of pi. - * @param v_rhoi #vector of density of pi. - * @param v_grad_hi #vector of smoothing length gradient of pi. - * @param v_pOrhoi2 #vector of pressure over density squared of pi. - * @param v_balsara_i #vector of balsara switch of pi. - * @param v_ci #vector of sound speed of pi. - * @param num_vec_proc #int of the number of vectors to use to perform - * interaction. - * interactions have been performed, should be a multiple of the vector length. - */ -__attribute__((always_inline)) INLINE static void storeForceInteractions( - const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, - vector *v_dz, const struct cache *const cell_cache, - struct c2_cache *const int_cache, int *icount, vector *a_hydro_xSum, - vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, - vector *v_sigSum, vector *entropy_dtSum, vector v_hi_inv, vector v_vix, - vector v_viy, vector v_viz, vector v_rhoi, vector v_grad_hi, - vector v_pOrhoi2, vector v_balsara_i, vector v_ci, int num_vec_proc) { - -/* Left-pack values needed into the secondary cache using the interaction mask. - */ -#if defined(HAVE_AVX2) || defined(HAVE_AVX512_F) - /* Invert hj. */ - vector v_hj, v_hj_inv; - v_hj.v = vec_load(&cell_cache->h[pjd]); - v_hj_inv = vec_reciprocal(v_hj); - - mask_t packed_mask; - VEC_FORM_PACKED_MASK(mask, packed_mask); - - VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]); - VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]); - VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]); - VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask, - &int_cache->mq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask, - &int_cache->vxq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask, - &int_cache->vyq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask, - &int_cache->vzq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->rho[pjd]), packed_mask, - &int_cache->rhoq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->grad_h[pjd]), packed_mask, - &int_cache->grad_hq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->pOrho2[pjd]), packed_mask, - &int_cache->pOrho2q[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->balsara[pjd]), packed_mask, - &int_cache->balsaraq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->soundspeed[pjd]), packed_mask, - &int_cache->soundspeedq[*icount]); - VEC_LEFT_PACK(v_hj_inv.v, packed_mask, &int_cache->h_invq[*icount]); - - /* Increment interaction count by number of bits set in mask. */ - (*icount) += __builtin_popcount(mask); -#else - /* Quicker to do it serially in AVX rather than use intrinsics. */ - for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) { - if (mask & (1 << bit_index)) { - /* Add this interaction to the queue. */ - int_cache->r2q[*icount] = v_r2->f[bit_index]; - int_cache->dxq[*icount] = v_dx->f[bit_index]; - int_cache->dyq[*icount] = v_dy->f[bit_index]; - int_cache->dzq[*icount] = v_dz->f[bit_index]; - int_cache->mq[*icount] = cell_cache->m[pjd + bit_index]; - int_cache->vxq[*icount] = cell_cache->vx[pjd + bit_index]; - int_cache->vyq[*icount] = cell_cache->vy[pjd + bit_index]; - int_cache->vzq[*icount] = cell_cache->vz[pjd + bit_index]; - - int_cache->rhoq[*icount] = cell_cache->rho[pjd + bit_index]; - int_cache->grad_hq[*icount] = cell_cache->grad_h[pjd + bit_index]; - int_cache->pOrho2q[*icount] = cell_cache->pOrho2[pjd + bit_index]; - int_cache->balsaraq[*icount] = cell_cache->balsara[pjd + bit_index]; - int_cache->soundspeedq[*icount] = cell_cache->soundspeed[pjd + bit_index]; - int_cache->h_invq[*icount] = 1.f / cell_cache->h[pjd + bit_index]; - - (*icount)++; - } - } - -#endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */ - - /* Flush the c2 cache if it has reached capacity. */ - if (*icount >= (C2_CACHE_SIZE - (num_vec_proc * VEC_SIZE))) { - - int icount_align = *icount; - - /* Peform remainder interactions. */ - calcRemForceInteractions(int_cache, *icount, a_hydro_xSum, a_hydro_ySum, - a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, - v_hi_inv, v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, - v_pOrhoi2, v_balsara_i, v_ci, &icount_align, 2); - - /* Initialise masks to true in case remainder interactions have been - * performed. */ - mask_t int_mask, int_mask2; - vec_init_mask_true(int_mask); - vec_init_mask_true(int_mask2); - - /* Perform interactions. */ - for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) { - - runner_iact_nonsym_2_vec_force( - &int_cache->r2q[pjd], &int_cache->dxq[pjd], &int_cache->dyq[pjd], - &int_cache->dzq[pjd], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, - v_pOrhoi2, v_balsara_i, v_ci, &int_cache->vxq[pjd], - &int_cache->vyq[pjd], &int_cache->vzq[pjd], &int_cache->rhoq[pjd], - &int_cache->grad_hq[pjd], &int_cache->pOrho2q[pjd], - &int_cache->balsaraq[pjd], &int_cache->soundspeedq[pjd], - &int_cache->mq[pjd], v_hi_inv, &int_cache->h_invq[pjd], a_hydro_xSum, - a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, - int_mask, int_mask2, 0); - } - - /* Reset interaction count. */ - *icount = 0; - } -} - /** * @brief Populates the arrays max_index_i and max_index_j with the maximum * indices of @@ -738,22 +483,22 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); /* Compute the pairwise distance. */ - vector v_dx_tmp, v_dy_tmp, v_dz_tmp; - vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2; - - v_dx_tmp.v = vec_sub(pix.v, pjx.v); - v_dx_tmp2.v = vec_sub(pix.v, pjx2.v); - v_dy_tmp.v = vec_sub(piy.v, pjy.v); - v_dy_tmp2.v = vec_sub(piy.v, pjy2.v); - v_dz_tmp.v = vec_sub(piz.v, pjz.v); - v_dz_tmp2.v = vec_sub(piz.v, pjz2.v); - - v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); - v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v); - v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); - v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v); - v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); - v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v); + vector v_dx, v_dy, v_dz; + vector v_dx_2, v_dy_2, v_dz_2, v_r2_2; + + v_dx.v = vec_sub(pix.v, pjx.v); + v_dx_2.v = vec_sub(pix.v, pjx2.v); + v_dy.v = vec_sub(piy.v, pjy.v); + v_dy_2.v = vec_sub(piy.v, pjy2.v); + v_dz.v = vec_sub(piz.v, pjz.v); + v_dz_2.v = vec_sub(piz.v, pjz2.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2_2.v = vec_mul(v_dx_2.v, v_dx_2.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2_2.v = vec_fma(v_dy_2.v, v_dy_2.v, v_r2_2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + v_r2_2.v = vec_fma(v_dz_2.v, v_dz_2.v, v_r2_2.v); /* Form a mask from r2 < hig2 and r2 > 0.*/ mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2, @@ -783,15 +528,15 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( /* If there are any interactions left pack interaction values into c2 * cache. */ if (doi_mask) { - storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, + storeInteractions(doi_mask, pjd, &v_r2, &v_dx, &v_dy, &v_dz, cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); } if (doi_mask2) { - storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, - &v_dy_tmp2, &v_dz_tmp2, cell_cache, &int_cache, + storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_2, + &v_dy_2, &v_dz_2, cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); @@ -960,8 +705,7 @@ for (int pid = 0; pid < count; pid++) { * secondary cache.*/ for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { - int cj_cache_idx = pjd; - /* Load 2 sets of vectors from the particle cache. */ + /* Load 1 set of vectors from the particle cache. */ pjx.v = vec_load(&cell_cache->x[pjd]); pjy.v = vec_load(&cell_cache->y[pjd]); pjz.v = vec_load(&cell_cache->z[pjd]); @@ -969,15 +713,15 @@ for (int pid = 0; pid < count; pid++) { hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); /* Compute the pairwise distance. */ - vector v_dx_tmp, v_dy_tmp, v_dz_tmp; + vector v_dx, v_dy, v_dz; - v_dx_tmp.v = vec_sub(pix.v, pjx.v); - v_dy_tmp.v = vec_sub(piy.v, pjy.v); - v_dz_tmp.v = vec_sub(piz.v, pjz.v); + v_dx.v = vec_sub(pix.v, pjx.v); + v_dy.v = vec_sub(piy.v, pjy.v); + v_dz.v = vec_sub(piz.v, pjz.v); - v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); - v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); - v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */ mask_t v_doi_mask, v_doi_mask_self_check; @@ -999,15 +743,15 @@ for (int pid = 0; pid < count; pid++) { * cache. */ if (doi_mask) { vector v_hj, v_hj_inv; - v_hj.v = vec_load(&cell_cache->h[cj_cache_idx]); + v_hj.v = vec_load(&cell_cache->h[pjd]); v_hj_inv = vec_reciprocal(v_hj); runner_iact_nonsym_1_vec_force( - &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, v_vix, v_viy, v_viz, + &v_r2, &v_dx, &v_dy, &v_dz, v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, - &cell_cache->vx[cj_cache_idx], &cell_cache->vy[cj_cache_idx], - &cell_cache->vz[cj_cache_idx], &cell_cache->rho[cj_cache_idx], &cell_cache->grad_h[cj_cache_idx], - &cell_cache->pOrho2[cj_cache_idx], &cell_cache->balsara[cj_cache_idx], &cell_cache->soundspeed[cj_cache_idx], &cell_cache->m[cj_cache_idx], v_hi_inv, v_hj_inv, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, + &cell_cache->vx[pjd], &cell_cache->vy[pjd], + &cell_cache->vz[pjd], &cell_cache->rho[pjd], &cell_cache->grad_h[pjd], + &cell_cache->pOrho2[pjd], &cell_cache->balsara[pjd], &cell_cache->soundspeed[pjd], &cell_cache->m[pjd], v_hi_inv, v_hj_inv, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, v_doi_mask); } @@ -1023,242 +767,6 @@ for (int pid = 0; pid < count; pid++) { } /* loop over all particles. */ -TIMER_TOC(timer_doself_force); -#endif /* WITH_VECTORIZATION */ -} -__attribute__((always_inline)) INLINE void runner_doself2_force_vec_2( - struct runner *r, struct cell *restrict c) { -#ifdef WITH_VECTORIZATION - const struct engine *e = r->e; - struct part *restrict pi; - int count_align; - const int num_vec_proc = 1;//2; - - struct part *restrict parts = c->parts; - const int count = c->count; - - vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2; - vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci; - - TIMER_TIC - - if (!cell_is_active(c, e)) return; - - if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell."); - - /* Get the particle cache from the runner and re-allocate - * the cache if it is not big enough for the cell. */ - struct cache *restrict cell_cache = &r->ci_cache; - - if (cell_cache->count < count) { - cache_init(cell_cache, count); - } - - /* Read the particles from the cell and store them locally in the cache. */ - cache_read_force_particles(c, cell_cache); - -#ifdef SWIFT_DEBUG_CHECKS - for (int i = 0; i < count; i++) { - pi = &c->parts[i]; - /* Check that particles have been drifted to the current time */ - if (pi->ti_drift != e->ti_current) - error("Particle pi not drifted to current time"); - } -} -#endif - -/* Create secondary cache to store particle interactions. */ -struct c2_cache int_cache; -int icount = 0, icount_align = 0; - -/* Loop over the particles in the cell. */ -for (int pid = 0; pid < count; pid++) { - - /* Get a pointer to the ith particle. */ - pi = &parts[pid]; - - /* Is the ith particle active? */ - if (!part_is_active(pi, e)) continue; - - vector pix, piy, piz; - - const float hi = cell_cache->h[pid]; - - /* Fill particle pi vectors. */ - pix.v = vec_set1(cell_cache->x[pid]); - piy.v = vec_set1(cell_cache->y[pid]); - piz.v = vec_set1(cell_cache->z[pid]); - v_hi.v = vec_set1(hi); - v_vix.v = vec_set1(cell_cache->vx[pid]); - v_viy.v = vec_set1(cell_cache->vy[pid]); - v_viz.v = vec_set1(cell_cache->vz[pid]); - - v_rhoi.v = vec_set1(cell_cache->rho[pid]); - v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]); - v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]); - v_balsara_i.v = vec_set1(cell_cache->balsara[pid]); - v_ci.v = vec_set1(cell_cache->soundspeed[pid]); - - const float hig2 = hi * hi * kernel_gamma2; - v_hig2.v = vec_set1(hig2); - - /* Reset cumulative sums of update vectors. */ - vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, - entropy_dtSum; - - /* Get the inverse of hi. */ - vector v_hi_inv; - - v_hi_inv = vec_reciprocal(v_hi); - - a_hydro_xSum.v = vec_setzero(); - a_hydro_ySum.v = vec_setzero(); - a_hydro_zSum.v = vec_setzero(); - h_dtSum.v = vec_setzero(); - v_sigSum.v = vec_set1(pi->force.v_sig); - entropy_dtSum.v = vec_setzero(); - - /* Pad cache if there is a serial remainder. */ - count_align = count; - int rem = count % (num_vec_proc * VEC_SIZE); - if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; - - count_align += pad; - - /* Set positions to the same as particle pi so when the r2 > 0 mask is - * applied these extra contributions are masked out.*/ - for (int i = count; i < count_align; i++) { - cell_cache->x[i] = pix.f[0]; - cell_cache->y[i] = piy.f[0]; - cell_cache->z[i] = piz.f[0]; - cell_cache->h[i] = 1.f; - } - } - - vector pjx, pjy, pjz, hj, hjg2; - //vector pjx_2, pjy_2, pjz_2, hj_2, hjg2_2, v_r2_2; - - /* Find all of particle pi's interacions and store needed values in the - * secondary cache.*/ - for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { - - /* Load 2 sets of vectors from the particle cache. */ - pjx.v = vec_load(&cell_cache->x[pjd]); - //pjx_2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); - pjy.v = vec_load(&cell_cache->y[pjd]); - //pjy_2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); - pjz.v = vec_load(&cell_cache->z[pjd]); - //pjz_2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); - hj.v = vec_load(&cell_cache->h[pjd]); - //hj_2.v = vec_load(&cell_cache->h[pjd + VEC_SIZE]); - hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); - //hjg2_2.v = vec_mul(vec_mul(hj_2.v, hj_2.v), kernel_gamma2_vec.v); - - /* Compute the pairwise distance. */ - vector v_dx_tmp, v_dy_tmp, v_dz_tmp; - //vector v_dx_tmp_2, v_dy_tmp_2, v_dz_tmp_2; - - v_dx_tmp.v = vec_sub(pix.v, pjx.v); - //v_dx_tmp_2.v = vec_sub(pix.v, pjx_2.v); - v_dy_tmp.v = vec_sub(piy.v, pjy.v); - //v_dy_tmp_2.v = vec_sub(piy.v, pjy_2.v); - v_dz_tmp.v = vec_sub(piz.v, pjz.v); - //v_dz_tmp_2.v = vec_sub(piz.v, pjz_2.v); - - v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); - //v_r2_2.v = vec_mul(v_dx_tmp_2.v, v_dx_tmp_2.v); - v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); - //v_r2_2.v = vec_mul(v_dy_tmp_2.v, v_dy_tmp_2.v); - v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); - //v_r2_2.v = vec_mul(v_dz_tmp_2.v, v_dz_tmp_2.v); - - /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */ - mask_t v_doi_mask, v_doi_mask_self_check; - //mask_t v_doi_mask_2, v_doi_mask_self_check_2; - int doi_mask, doi_mask_self_check; - //int doi_mask_2, doi_mask_self_check_2; - - /* Form r2 > 0 mask.*/ - vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero())); - //vec_create_mask(v_doi_mask_self_check_2, vec_cmp_gt(v_r2_2.v, vec_setzero())); - - /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */ - vector v_h2; - //vector v_h2_2; - v_h2.v = vec_fmax(v_hig2.v, hjg2.v); - //v_h2_2.v = vec_fmax(v_hig2.v, hjg2_2.v); - vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v)); - //vec_create_mask(v_doi_mask_2, vec_cmp_lt(v_r2_2.v, v_h2_2.v)); - - /* Form integer masks. */ - doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check); - //doi_mask_self_check_2 = vec_form_int_mask(v_doi_mask_self_check_2); - doi_mask = vec_form_int_mask(v_doi_mask); - //doi_mask_2 = vec_form_int_mask(v_doi_mask_2); - - /* Combine all 3 masks. */ - doi_mask = doi_mask & doi_mask_self_check; - //doi_mask_2 = doi_mask_2 & doi_mask_self_check_2; - - /* If there are any interactions left pack interaction values into c2 - * cache. */ - if (doi_mask) { - - storeForceInteractions( - doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, cell_cache, - &int_cache, &icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, - &h_dtSum, &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy, v_viz, - v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, 2); - } - - //if (doi_mask_2) { - - // storeForceInteractions( - // doi_mask_2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp_2, &v_dy_tmp_2, &v_dz_tmp_2, cell_cache, - // &int_cache, &icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, - // &h_dtSum, &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy, v_viz, - // v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, 2); - //} - - } /* Loop over all other particles. */ - - /* Perform padded vector remainder interactions if any are present. */ - calcRemForceInteractions(&int_cache, icount, &a_hydro_xSum, &a_hydro_ySum, - &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, - v_hi_inv, v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, - v_pOrhoi2, v_balsara_i, v_ci, &icount_align, 2); - - /* Initialise masks to true in case remainder interactions have been - * performed. */ - mask_t int_mask, int_mask2; - vec_init_mask_true(int_mask); - vec_init_mask_true(int_mask2); - - /* Perform interaction with 2 vectors. */ - for (int pjd = 0; pjd < icount_align; pjd += (2 * VEC_SIZE)) { - runner_iact_nonsym_2_vec_force( - &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], - &int_cache.dzq[pjd], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, - v_balsara_i, v_ci, &int_cache.vxq[pjd], &int_cache.vyq[pjd], - &int_cache.vzq[pjd], &int_cache.rhoq[pjd], &int_cache.grad_hq[pjd], - &int_cache.pOrho2q[pjd], &int_cache.balsaraq[pjd], - &int_cache.soundspeedq[pjd], &int_cache.mq[pjd], v_hi_inv, - &int_cache.h_invq[pjd], &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, - &h_dtSum, &v_sigSum, &entropy_dtSum, int_mask, int_mask2, 0); - } - - VEC_HADD(a_hydro_xSum, pi->a_hydro[0]); - VEC_HADD(a_hydro_ySum, pi->a_hydro[1]); - VEC_HADD(a_hydro_zSum, pi->a_hydro[2]); - VEC_HADD(h_dtSum, pi->force.h_dt); - VEC_HMAX(v_sigSum, pi->force.v_sig); - VEC_HADD(entropy_dtSum, pi->entropy_dt); - - /* Reset interaction count. */ - icount = 0; -} /* loop over all particles. */ - TIMER_TOC(timer_doself_force); #endif /* WITH_VECTORIZATION */ }