diff --git a/src/cache.h b/src/cache.h index f3eaaf00d63d77b2dca793a7ce5806b2df0c5c78..ac1dbf8718f2b0c11152e380f56c5f195973bb5a 100644 --- a/src/cache.h +++ b/src/cache.h @@ -198,11 +198,8 @@ __attribute__((always_inline)) INLINE void cache_read_particles( swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); const struct part *restrict parts = ci->parts; - double loc[3]; - loc[0] = ci->loc[0]; - loc[1] = ci->loc[1]; - loc[2] = ci->loc[2]; - + const double loc[3] = {ci->loc[0], ci->loc[1],ci->loc[2]}; + /* Shift the particles positions to a local frame so single precision can be * used instead of double precision. */ for (int i = 0; i < ci->count; i++) { @@ -210,7 +207,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles( y[i] = (float)(parts[i].x[1] - loc[1]); z[i] = (float)(parts[i].x[2] - loc[2]); h[i] = parts[i].h; - m[i] = parts[i].mass; vx[i] = parts[i].v[0]; vy[i] = parts[i].v[1]; @@ -254,10 +250,7 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( SWIFT_CACHE_ALIGNMENT); const struct part *restrict parts = ci->parts; - double loc[3]; - loc[0] = ci->loc[0]; - loc[1] = ci->loc[1]; - loc[2] = ci->loc[2]; + const double loc[3] = {ci->loc[0], ci->loc[1],ci->loc[2]}; /* Shift the particles positions to a local frame so single precision can be * used instead of double precision. */ @@ -266,12 +259,10 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( y[i] = (float)(parts[i].x[1] - loc[1]); z[i] = (float)(parts[i].x[2] - loc[2]); h[i] = parts[i].h; - m[i] = parts[i].mass; vx[i] = parts[i].v[0]; vy[i] = parts[i].v[1]; vz[i] = parts[i].v[2]; - rho[i] = parts[i].rho; grad_h[i] = parts[i].force.f; pOrho2[i] = parts[i].force.P_over_rho2; @@ -325,17 +316,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( const int last_pj_align = *last_pj; const struct part *restrict parts_i = ci->parts; const struct part *restrict parts_j = cj->parts; - double loc[3]; - loc[0] = cj->loc[0]; - loc[1] = cj->loc[1]; - loc[2] = cj->loc[2]; - - /* Shift ci particles for boundary conditions and location of cell.*/ - double total_ci_shift[3]; - total_ci_shift[0] = loc[0] + shift[0]; - total_ci_shift[1] = loc[1] + shift[1]; - total_ci_shift[2] = loc[2] + shift[2]; + /* Shift particles to the local frame and account for boundary conditions.*/ + const double total_ci_shift[3] = {cj->loc[0] + shift[0], cj->loc[1] + shift[1], + cj->loc[2] + shift[2]}; + const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], + cj->loc[2]}; + /* Let the compiler know that the data is aligned and create pointers to the * arrays inside the cache. */ swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); @@ -348,6 +335,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); int ci_cache_count = ci->count - first_pi_align; + /* Shift the particles positions to a local frame (ci frame) so single * precision * can be @@ -355,12 +343,12 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( * positions * due to BCs but leave cell cj. */ for (int i = 0; i < ci_cache_count; i++) { + /* Make sure ci_cache is filled from the first element. */ idx = sort_i[i + first_pi_align].i; x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]); y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]); z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]); h[i] = parts_i[idx].h; - m[i] = parts_i[idx].mass; vx[i] = parts_i[idx].v[0]; vy[i] = parts_i[idx].v[1]; @@ -437,11 +425,10 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( for (int i = 0; i <= last_pj_align; i++) { idx = sort_j[i].i; - xj[i] = (float)(parts_j[idx].x[0] - loc[0]); - yj[i] = (float)(parts_j[idx].x[1] - loc[1]); - zj[i] = (float)(parts_j[idx].x[2] - loc[2]); + xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]); + yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]); + zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]); hj[i] = parts_j[idx].h; - mj[i] = parts_j[idx].mass; vxj[i] = parts_j[idx].v[0]; vyj[i] = parts_j[idx].v[1]; @@ -488,7 +475,6 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( yj[i] = pos_padded_j[1]; zj[i] = pos_padded_j[2]; hj[i] = 1.f; - mj[i] = 1.f; vxj[i] = 1.f; vyj[i] = 1.f; @@ -539,10 +525,12 @@ cache_read_two_partial_cells_sorted_force( const int last_pj_align = *last_pj; const struct part *restrict parts_i = ci->parts; const struct part *restrict parts_j = cj->parts; - double loc[3]; - loc[0] = ci->loc[0]; - loc[1] = ci->loc[1]; - loc[2] = ci->loc[2]; + + /* Shift particles to the local frame and account for boundary conditions.*/ + const double total_ci_shift[3] = {cj->loc[0] + shift[0], cj->loc[1] + shift[1], + cj->loc[2] + shift[2]}; + const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], + cj->loc[2]}; /* Let the compiler know that the data is aligned and create pointers to the * arrays inside the cache. */ @@ -573,18 +561,15 @@ cache_read_two_partial_cells_sorted_force( * due to BCs but leave cell cj. */ for (int i = 0; i < ci_cache_count; i++) { /* Make sure ci_cache is filled from the first element. */ - idx = sort_i[i + first_pi_align].i; - x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]); - y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]); - z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]); + x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]); + y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]); + z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]); h[i] = parts_i[idx].h; - m[i] = parts_i[idx].mass; vx[i] = parts_i[idx].v[0]; vy[i] = parts_i[idx].v[1]; vz[i] = parts_i[idx].v[2]; - rho[i] = parts_i[idx].rho; grad_h[i] = parts_i[idx].force.f; pOrho2[i] = parts_i[idx].force.P_over_rho2; @@ -606,12 +591,10 @@ cache_read_two_partial_cells_sorted_force( y[i] = pos_padded[1]; z[i] = pos_padded[2]; h[i] = h_padded; - m[i] = 1.f; vx[i] = 1.f; vy[i] = 1.f; vz[i] = 1.f; - rho[i] = 1.f; grad_h[i] = 1.f; pOrho2[i] = 1.f; @@ -641,16 +624,14 @@ cache_read_two_partial_cells_sorted_force( for (int i = 0; i <= last_pj_align; i++) { idx = sort_j[i].i; - xj[i] = (float)(parts_j[idx].x[0] - loc[0]); - yj[i] = (float)(parts_j[idx].x[1] - loc[1]); - zj[i] = (float)(parts_j[idx].x[2] - loc[2]); + xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]); + yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]); + zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]); hj[i] = parts_j[idx].h; - mj[i] = parts_j[idx].mass; vxj[i] = parts_j[idx].v[0]; vyj[i] = parts_j[idx].v[1]; vzj[i] = parts_j[idx].v[2]; - rhoj[i] = parts_j[idx].rho; grad_hj[i] = parts_j[idx].force.f; pOrho2j[i] = parts_j[idx].force.P_over_rho2; @@ -670,12 +651,10 @@ cache_read_two_partial_cells_sorted_force( yj[i] = pos_padded_j[1]; zj[i] = pos_padded_j[2]; hj[i] = h_padded_j; - mj[i] = 1.f; vxj[i] = 1.f; vyj[i] = 1.f; vzj[i] = 1.f; - rhoj[i] = 1.f; grad_hj[i] = 1.f; pOrho2j[i] = 1.f; @@ -699,6 +678,11 @@ static INLINE void cache_clean(struct cache *c) { free(c->vz); free(c->h); free(c->max_index); + free(c->rho); + free(c->grad_h); + free(c->pOrho2); + free(c->balsara); + free(c->soundspeed); } } diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 5ab3a9a1b43ce0814866c5abaff454f7b6d85690..baae55d799030a6af3a638478be3bcd883112f01 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -1072,37 +1072,6 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj, const int sid, struct entry *restrict sort_j = cj->sort[sid]; #ifdef SWIFT_DEBUG_CHECKS - /* Check that the dx_max_sort values in the cell are indeed an upper - bound on particle movement. */ - for (int pid = 0; pid < ci->count; pid++) { - const struct part *p = &ci->parts[sort_i[pid].i]; - const float d = p->x[0] * runner_shift[sid][0] + - p->x[1] * runner_shift[sid][1] + - p->x[2] * runner_shift[sid][2]; - if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > - 1.0e-4 * max(fabsf(d), ci->dx_max_sort_old)) - error( - "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d " - "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e " - "ci->dx_max_sort_old=%e", - ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort, - ci->dx_max_sort_old); - } - for (int pjd = 0; pjd < cj->count; pjd++) { - const struct part *p = &cj->parts[sort_j[pjd].i]; - const float d = p->x[0] * runner_shift[sid][0] + - p->x[1] * runner_shift[sid][1] + - p->x[2] * runner_shift[sid][2]; - if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > - 1.0e-4 * max(fabsf(d), cj->dx_max_sort_old)) - error( - "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d " - "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e " - "cj->dx_max_sort_old=%e", - cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort, - cj->dx_max_sort_old); - } - /* Some constants used to checks that the parts are in the right frame */ const float shift_threshold_x = 2. * ci->width[0] + 2. * max(ci->dx_max_part, cj->dx_max_part); @@ -1110,7 +1079,6 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj, const int sid, 2. * ci->width[1] + 2. * max(ci->dx_max_part, cj->dx_max_part); const float shift_threshold_z = 2. * ci->width[2] + 2. * max(ci->dx_max_part, cj->dx_max_part); - #endif /* SWIFT_DEBUG_CHECKS */ /* Get some other useful values. */ @@ -1514,7 +1482,6 @@ void DOPAIR2_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) { error("Interacting unsorted cells."); #ifdef SWIFT_DEBUG_CHECKS - /* Pick-out the sorted lists. */ const struct entry *restrict sort_i = ci->sort[sid]; const struct entry *restrict sort_j = cj->sort[sid]; diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index c4f9405b39504d0f693a4f528fb9bde53f9f1113..d9595b4eda47f39009509d1b5e35553246d5eb89 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -35,21 +35,21 @@ static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2); * @param int_cache (return) secondary #cache of interactions between two * particles. * @param icount Interaction count. - * @param rhoSum (return) #vector holding the cumulative sum of the density + * @param v_rhoSum (return) #vector holding the cumulative sum of the density * update on pi. - * @param rho_dhSum (return) #vector holding the cumulative sum of the density + * @param v_rho_dhSum (return) #vector holding the cumulative sum of the density * gradient update on pi. - * @param wcountSum (return) #vector holding the cumulative sum of the wcount + * @param v_wcountSum (return) #vector holding the cumulative sum of the wcount * update on pi. - * @param wcount_dhSum (return) #vector holding the cumulative sum of the wcount + * @param v_wcount_dhSum (return) #vector holding the cumulative sum of the wcount * gradient update on pi. - * @param div_vSum (return) #vector holding the cumulative sum of the divergence + * @param v_div_vSum (return) #vector holding the cumulative sum of the divergence * update on pi. - * @param curlvxSum (return) #vector holding the cumulative sum of the curl of + * @param v_curlvxSum (return) #vector holding the cumulative sum of the curl of * vx update on pi. - * @param curlvySum (return) #vector holding the cumulative sum of the curl of + * @param v_curlvySum (return) #vector holding the cumulative sum of the curl of * vy update on pi. - * @param curlvzSum (return) #vector holding the cumulative sum of the curl of + * @param v_curlvzSum (return) #vector holding the cumulative sum of the curl of * vz update on pi. * @param v_hi_inv #vector of 1/h for pi. * @param v_vix #vector of x velocity of pi. @@ -59,9 +59,9 @@ static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2); * interactions have been performed, should be a multiple of the vector length. */ __attribute__((always_inline)) INLINE static void calcRemInteractions( - struct c2_cache *const int_cache, const int icount, vector *rhoSum, - vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, - vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, + struct c2_cache *const int_cache, const int icount, vector *v_rhoSum, + vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum, + vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum, vector *v_curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz, int *icount_align) { @@ -107,8 +107,8 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align], - &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum, - wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, + &int_cache->mq[*icount_align], v_rhoSum, v_rho_dhSum, v_wcountSum, + v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, v_curlvzSum, int_mask, int_mask2, 1); } } @@ -127,20 +127,20 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( * @param int_cache (return) secondary #cache of interactions between two * particles. * @param icount Interaction count. - * @param rhoSum #vector holding the cumulative sum of the density update on pi. - * @param rho_dhSum #vector holding the cumulative sum of the density gradient + * @param v_rhoSum #vector holding the cumulative sum of the density update on pi. + * @param v_rho_dhSum #vector holding the cumulative sum of the density gradient * update on pi. - * @param wcountSum #vector holding the cumulative sum of the wcount update on + * @param v_wcountSum #vector holding the cumulative sum of the wcount update on * pi. - * @param wcount_dhSum #vector holding the cumulative sum of the wcount gradient + * @param v_wcount_dhSum #vector holding the cumulative sum of the wcount gradient * update on pi. - * @param div_vSum #vector holding the cumulative sum of the divergence update + * @param v_div_vSum #vector holding the cumulative sum of the divergence update * on pi. - * @param curlvxSum #vector holding the cumulative sum of the curl of vx update + * @param v_curlvxSum #vector holding the cumulative sum of the curl of vx update * on pi. - * @param curlvySum #vector holding the cumulative sum of the curl of vy update + * @param v_curlvySum #vector holding the cumulative sum of the curl of vy update * on pi. - * @param curlvzSum #vector holding the cumulative sum of the curl of vz update + * @param v_curlvzSum #vector holding the cumulative sum of the curl of vz update * on pi. * @param v_hi_inv #vector of 1/h for pi. * @param v_vix #vector of x velocity of pi. @@ -150,9 +150,9 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( __attribute__((always_inline)) INLINE static void storeInteractions( const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, vector *v_dz, const struct cache *const cell_cache, - struct c2_cache *const int_cache, int *icount, vector *rhoSum, - vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, - vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, + struct c2_cache *const int_cache, int *icount, vector *v_rhoSum, + vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum, + vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum, vector *v_curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) { /* Left-pack values needed into the secondary cache using the interaction mask. @@ -202,8 +202,8 @@ __attribute__((always_inline)) INLINE static void storeInteractions( int icount_align = *icount; /* Peform remainder interactions. */ - calcRemInteractions(int_cache, *icount, rhoSum, rho_dhSum, wcountSum, - wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, + calcRemInteractions(int_cache, *icount, v_rhoSum, v_rho_dhSum, v_wcountSum, + v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, &icount_align); mask_t int_mask, int_mask2; @@ -215,9 +215,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions( runner_iact_nonsym_2_vec_density( &int_cache->r2q[j], &int_cache->dxq[j], &int_cache->dyq[j], &int_cache->dzq[j], v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[j], - &int_cache->vyq[j], &int_cache->vzq[j], &int_cache->mq[j], rhoSum, - rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum, - curlvzSum, int_mask, int_mask2, 0); + &int_cache->vyq[j], &int_cache->vzq[j], &int_cache->mq[j], v_rhoSum, + v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, + v_curlvzSum, int_mask, int_mask2, 0); } /* Reset interaction count. */ @@ -358,6 +358,33 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( *init_pj = last_pj; } +/** + * @brief Populates the arrays max_index_i and max_index_j with the maximum + * indices of + * particles into their neighbouring cells. Also finds the first pi that + * interacts with any particle in cj and the last pj that interacts with any + * particle in ci. + * + * @param ci #cell pointer to ci + * @param cj #cell pointer to cj + * @param sort_i #entry array for particle distance in ci + * @param sort_j #entry array for particle distance in cj + * @param dx_max maximum particle movement allowed in cell + * @param rshift cutoff shift + * @param hi_max_raw Maximal smoothing length in cell ci + * @param hj_max_raw Maximal smoothing length in cell cj + * @param hi_max Maximal smoothing length in cell ci scaled by kernel_gamma + * @param hj_max Maximal smoothing length in cell cj scaled by kernel_gamma + * @param di_max Maximal position on the axis that can interact in cell ci + * @param dj_min Minimal position on the axis that can interact in cell ci + * @param max_index_i array to hold the maximum distances of pi particles into + * #cell cj + * @param max_index_j array to hold the maximum distances of pj particles into + * #cell cj + * @param init_pi first pi to interact with a pj particle + * @param init_pj last pj to interact with a pi particle + * @param max_active_bin The largest time-bin active during this step. + */ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj, const struct entry *restrict sort_i, @@ -530,14 +557,14 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( /* Is the ith particle active? */ if (!part_is_active_no_debug(pi, max_active_bin)) continue; - vector pix, piy, piz; + vector v_pix, v_piy, v_piz; const float hi = cell_cache->h[pid]; /* Fill particle pi vectors. */ - pix.v = vec_set1(cell_cache->x[pid]); - piy.v = vec_set1(cell_cache->y[pid]); - piz.v = vec_set1(cell_cache->z[pid]); + v_pix.v = vec_set1(cell_cache->x[pid]); + v_piy.v = vec_set1(cell_cache->y[pid]); + v_piz.v = vec_set1(cell_cache->z[pid]); v_hi.v = vec_set1(hi); v_vix.v = vec_set1(cell_cache->vx[pid]); v_viy.v = vec_set1(cell_cache->vy[pid]); @@ -547,22 +574,22 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( v_hig2.v = vec_set1(hig2); /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; + vector v_rhoSum, v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum, + v_curlvySum, v_curlvzSum; /* Get the inverse of hi. */ vector v_hi_inv; v_hi_inv = vec_reciprocal(v_hi); - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); + v_rhoSum.v = vec_setzero(); + v_rho_dhSum.v = vec_setzero(); + v_wcountSum.v = vec_setzero(); + v_wcount_dhSum.v = vec_setzero(); + v_div_vSum.v = vec_setzero(); + v_curlvxSum.v = vec_setzero(); + v_curlvySum.v = vec_setzero(); + v_curlvzSum.v = vec_setzero(); /* Pad cache if there is a serial remainder. */ count_align = count; @@ -575,38 +602,38 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( /* Set positions to the same as particle pi so when the r2 > 0 mask is * applied these extra contributions are masked out.*/ for (int i = count; i < count_align; i++) { - cell_cache->x[i] = pix.f[0]; - cell_cache->y[i] = piy.f[0]; - cell_cache->z[i] = piz.f[0]; + cell_cache->x[i] = v_pix.f[0]; + cell_cache->y[i] = v_piy.f[0]; + cell_cache->z[i] = v_piz.f[0]; } } - vector pjx, pjy, pjz; - vector pjx2, pjy2, pjz2; + vector v_pjx, v_pjy, v_pjz; + vector v_pjx2, v_pjy2, v_pjz2; /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { /* Load 2 sets of vectors from the particle cache. */ - pjx.v = vec_load(&cell_cache->x[pjd]); - pjy.v = vec_load(&cell_cache->y[pjd]); - pjz.v = vec_load(&cell_cache->z[pjd]); + v_pjx.v = vec_load(&cell_cache->x[pjd]); + v_pjy.v = vec_load(&cell_cache->y[pjd]); + v_pjz.v = vec_load(&cell_cache->z[pjd]); - pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); - pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); - pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); + v_pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); + v_pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); + v_pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); /* Compute the pairwise distance. */ vector v_dx, v_dy, v_dz; vector v_dx_2, v_dy_2, v_dz_2, v_r2_2; - v_dx.v = vec_sub(pix.v, pjx.v); - v_dx_2.v = vec_sub(pix.v, pjx2.v); - v_dy.v = vec_sub(piy.v, pjy.v); - v_dy_2.v = vec_sub(piy.v, pjy2.v); - v_dz.v = vec_sub(piz.v, pjz.v); - v_dz_2.v = vec_sub(piz.v, pjz2.v); + v_dx.v = vec_sub(v_pix.v, v_pjx.v); + v_dx_2.v = vec_sub(v_pix.v, v_pjx2.v); + v_dy.v = vec_sub(v_piy.v, v_pjy.v); + v_dy_2.v = vec_sub(v_piy.v, v_pjy2.v); + v_dz.v = vec_sub(v_piz.v, v_pjz.v); + v_dz_2.v = vec_sub(v_piz.v, v_pjz2.v); v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2_2.v = vec_mul(v_dx_2.v, v_dx_2.v); @@ -644,23 +671,23 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( * cache. */ if (doi_mask) { storeInteractions(doi_mask, pjd, &v_r2, &v_dx, &v_dy, &v_dz, cell_cache, - &int_cache, &icount, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); + &int_cache, &icount, &v_rhoSum, &v_rho_dhSum, &v_wcountSum, + &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, &v_curlvySum, + &v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); } if (doi_mask2) { storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_2, &v_dy_2, - &v_dz_2, cell_cache, &int_cache, &icount, &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, - &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, + &v_dz_2, cell_cache, &int_cache, &icount, &v_rhoSum, + &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, &v_div_vSum, + &v_curlvxSum, &v_curlvySum, &v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); } } /* Perform padded vector remainder interactions if any are present. */ - calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, + calcRemInteractions(&int_cache, icount, &v_rhoSum, &v_rho_dhSum, &v_wcountSum, + &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, &v_curlvySum, + &v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, &icount_align); /* Initialise masks to true in case remainder interactions have been @@ -675,21 +702,21 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], - &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, + &int_cache.mq[pjd], &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, + &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum, int_mask, int_mask2, 0); } /* Perform horizontal adds on vector sums and store result in particle pi. */ - VEC_HADD(rhoSum, pi->rho); - VEC_HADD(rho_dhSum, pi->density.rho_dh); - VEC_HADD(wcountSum, pi->density.wcount); - VEC_HADD(wcount_dhSum, pi->density.wcount_dh); - VEC_HADD(div_vSum, pi->density.div_v); - VEC_HADD(curlvxSum, pi->density.rot_v[0]); - VEC_HADD(curlvySum, pi->density.rot_v[1]); - VEC_HADD(curlvzSum, pi->density.rot_v[2]); + VEC_HADD(v_rhoSum, pi->rho); + VEC_HADD(v_rho_dhSum, pi->density.rho_dh); + VEC_HADD(v_wcountSum, pi->density.wcount); + VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh); + VEC_HADD(v_div_vSum, pi->density.div_v); + VEC_HADD(v_curlvxSum, pi->density.rot_v[0]); + VEC_HADD(v_curlvySum, pi->density.rot_v[1]); + VEC_HADD(v_curlvzSum, pi->density.rot_v[2]); /* Reset interaction count. */ icount = 0; @@ -758,14 +785,14 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( /* Is the ith particle active? */ if (!part_is_active_no_debug(pi, max_active_bin)) continue; - vector pix, piy, piz; + vector v_pix, v_piy, v_piz; const float hi = cell_cache->h[pid]; /* Fill particle pi vectors. */ - pix.v = vec_set1(cell_cache->x[pid]); - piy.v = vec_set1(cell_cache->y[pid]); - piz.v = vec_set1(cell_cache->z[pid]); + v_pix.v = vec_set1(cell_cache->x[pid]); + v_piy.v = vec_set1(cell_cache->y[pid]); + v_piz.v = vec_set1(cell_cache->z[pid]); v_hi.v = vec_set1(hi); v_vix.v = vec_set1(cell_cache->vx[pid]); v_viy.v = vec_set1(cell_cache->vy[pid]); @@ -781,20 +808,20 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( v_hig2.v = vec_set1(hig2); /* Reset cumulative sums of update vectors. */ - vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, - entropy_dtSum; + vector v_a_hydro_xSum, v_a_hydro_ySum, v_a_hydro_zSum, v_h_dtSum, v_sigSum, + v_entropy_dtSum; /* Get the inverse of hi. */ vector v_hi_inv; v_hi_inv = vec_reciprocal(v_hi); - a_hydro_xSum.v = vec_setzero(); - a_hydro_ySum.v = vec_setzero(); - a_hydro_zSum.v = vec_setzero(); - h_dtSum.v = vec_setzero(); + v_a_hydro_xSum.v = vec_setzero(); + v_a_hydro_ySum.v = vec_setzero(); + v_a_hydro_zSum.v = vec_setzero(); + v_h_dtSum.v = vec_setzero(); v_sigSum.v = vec_set1(pi->force.v_sig); - entropy_dtSum.v = vec_setzero(); + v_entropy_dtSum.v = vec_setzero(); /* Pad cache if there is a serial remainder. */ count_align = count; @@ -807,9 +834,9 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( /* Set positions to the same as particle pi so when the r2 > 0 mask is * applied these extra contributions are masked out.*/ for (int i = count; i < count_align; i++) { - cell_cache->x[i] = pix.f[0]; - cell_cache->y[i] = piy.f[0]; - cell_cache->z[i] = piz.f[0]; + cell_cache->x[i] = v_pix.f[0]; + cell_cache->y[i] = v_piy.f[0]; + cell_cache->z[i] = v_piz.f[0]; cell_cache->h[i] = 1.f; cell_cache->rho[i] = 1.f; cell_cache->grad_h[i] = 1.f; @@ -819,25 +846,25 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( } } - vector pjx, pjy, pjz, hj, hjg2; + vector v_pjx, v_pjy, v_pjz, hj, hjg2; /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { /* Load 1 set of vectors from the particle cache. */ - pjx.v = vec_load(&cell_cache->x[pjd]); - pjy.v = vec_load(&cell_cache->y[pjd]); - pjz.v = vec_load(&cell_cache->z[pjd]); + v_pjx.v = vec_load(&cell_cache->x[pjd]); + v_pjy.v = vec_load(&cell_cache->y[pjd]); + v_pjz.v = vec_load(&cell_cache->z[pjd]); hj.v = vec_load(&cell_cache->h[pjd]); hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); /* Compute the pairwise distance. */ vector v_dx, v_dy, v_dz; - v_dx.v = vec_sub(pix.v, pjx.v); - v_dy.v = vec_sub(piy.v, pjy.v); - v_dz.v = vec_sub(piz.v, pjz.v); + v_dx.v = vec_sub(v_pix.v, v_pjx.v); + v_dy.v = vec_sub(v_piy.v, v_pjy.v); + v_dz.v = vec_sub(v_piz.v, v_pjz.v); v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); @@ -875,19 +902,19 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( &cell_cache->vy[pjd], &cell_cache->vz[pjd], &cell_cache->rho[pjd], &cell_cache->grad_h[pjd], &cell_cache->pOrho2[pjd], &cell_cache->balsara[pjd], &cell_cache->soundspeed[pjd], - &cell_cache->m[pjd], v_hi_inv, v_hj_inv, &a_hydro_xSum, - &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, + &cell_cache->m[pjd], v_hi_inv, v_hj_inv, &v_a_hydro_xSum, + &v_a_hydro_ySum, &v_a_hydro_zSum, &v_h_dtSum, &v_sigSum, &v_entropy_dtSum, v_doi_mask); } } /* Loop over all other particles. */ - VEC_HADD(a_hydro_xSum, pi->a_hydro[0]); - VEC_HADD(a_hydro_ySum, pi->a_hydro[1]); - VEC_HADD(a_hydro_zSum, pi->a_hydro[2]); - VEC_HADD(h_dtSum, pi->force.h_dt); + VEC_HADD(v_a_hydro_xSum, pi->a_hydro[0]); + VEC_HADD(v_a_hydro_ySum, pi->a_hydro[1]); + VEC_HADD(v_a_hydro_zSum, pi->a_hydro[2]); + VEC_HADD(v_h_dtSum, pi->force.h_dt); VEC_HMAX(v_sigSum, pi->force.v_sig); - VEC_HADD(entropy_dtSum, pi->entropy_dt); + VEC_HADD(v_entropy_dtSum, pi->entropy_dt); } /* loop over all particles. */ @@ -1032,12 +1059,12 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, const float hig2 = hi * hi * kernel_gamma2; - vector pix, piy, piz; + vector v_pix, v_piy, v_piz; /* Fill particle pi vectors. */ - pix.v = vec_set1(ci_cache->x[ci_cache_idx]); - piy.v = vec_set1(ci_cache->y[ci_cache_idx]); - piz.v = vec_set1(ci_cache->z[ci_cache_idx]); + v_pix.v = vec_set1(ci_cache->x[ci_cache_idx]); + v_piy.v = vec_set1(ci_cache->y[ci_cache_idx]); + v_piz.v = vec_set1(ci_cache->z[ci_cache_idx]); v_hi.v = vec_set1(hi); v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]); v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]); @@ -1046,22 +1073,22 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, v_hig2.v = vec_set1(hig2); /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; + vector v_rhoSum, v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum, + v_curlvySum, v_curlvzSum; /* Get the inverse of hi. */ vector v_hi_inv; v_hi_inv = vec_reciprocal(v_hi); - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); + v_rhoSum.v = vec_setzero(); + v_rho_dhSum.v = vec_setzero(); + v_wcountSum.v = vec_setzero(); + v_wcount_dhSum.v = vec_setzero(); + v_div_vSum.v = vec_setzero(); + v_curlvxSum.v = vec_setzero(); + v_curlvySum.v = vec_setzero(); + v_curlvzSum.v = vec_setzero(); /* Pad the exit iteration if there is a serial remainder. */ int exit_iteration_align = exit_iteration; @@ -1073,7 +1100,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, exit_iteration_align += pad; } - vector pjx, pjy, pjz; + vector v_pjx, v_pjy, v_pjz; /* Loop over the parts in cj. */ for (int pjd = 0; pjd <= exit_iteration_align; pjd += VEC_SIZE) { @@ -1092,14 +1119,14 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #endif /* Load 2 sets of vectors from the particle cache. */ - pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); - pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); - pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); + v_pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); + v_pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); + v_pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pix.v, pjx.v); - v_dy.v = vec_sub(piy.v, pjy.v); - v_dz.v = vec_sub(piz.v, pjz.v); + v_dx.v = vec_sub(v_pix.v, v_pjx.v); + v_dy.v = vec_sub(v_piy.v, v_pjy.v); + v_dz.v = vec_sub(v_piz.v, v_pjz.v); v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); @@ -1119,22 +1146,22 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, runner_iact_nonsym_1_vec_density( &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz, &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], - &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doi_mask); + &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &v_rhoSum, + &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, + &v_curlvySum, &v_curlvzSum, v_doi_mask); } /* loop over the parts in cj. */ /* Perform horizontal adds on vector sums and store result in particle pi. */ - VEC_HADD(rhoSum, pi->rho); - VEC_HADD(rho_dhSum, pi->density.rho_dh); - VEC_HADD(wcountSum, pi->density.wcount); - VEC_HADD(wcount_dhSum, pi->density.wcount_dh); - VEC_HADD(div_vSum, pi->density.div_v); - VEC_HADD(curlvxSum, pi->density.rot_v[0]); - VEC_HADD(curlvySum, pi->density.rot_v[1]); - VEC_HADD(curlvzSum, pi->density.rot_v[2]); + VEC_HADD(v_rhoSum, pi->rho); + VEC_HADD(v_rho_dhSum, pi->density.rho_dh); + VEC_HADD(v_wcountSum, pi->density.wcount); + VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh); + VEC_HADD(v_div_vSum, pi->density.div_v); + VEC_HADD(v_curlvxSum, pi->density.rot_v[0]); + VEC_HADD(v_curlvySum, pi->density.rot_v[1]); + VEC_HADD(v_curlvzSum, pi->density.rot_v[2]); } /* loop over the parts in ci. */ } @@ -1161,13 +1188,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, const float hjg2 = hj * hj * kernel_gamma2; - vector pjx, pjy, pjz; + vector v_pjx, v_pjy, v_pjz; vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2; /* Fill particle pi vectors. */ - pjx.v = vec_set1(cj_cache->x[cj_cache_idx]); - pjy.v = vec_set1(cj_cache->y[cj_cache_idx]); - pjz.v = vec_set1(cj_cache->z[cj_cache_idx]); + v_pjx.v = vec_set1(cj_cache->x[cj_cache_idx]); + v_pjy.v = vec_set1(cj_cache->y[cj_cache_idx]); + v_pjz.v = vec_set1(cj_cache->z[cj_cache_idx]); v_hj.v = vec_set1(hj); v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]); v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]); @@ -1176,24 +1203,24 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, v_hjg2.v = vec_set1(hjg2); /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; + vector v_rhoSum, v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum, + v_curlvySum, v_curlvzSum; /* Get the inverse of hj. */ vector v_hj_inv; v_hj_inv = vec_reciprocal(v_hj); - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); + v_rhoSum.v = vec_setzero(); + v_rho_dhSum.v = vec_setzero(); + v_wcountSum.v = vec_setzero(); + v_wcount_dhSum.v = vec_setzero(); + v_div_vSum.v = vec_setzero(); + v_curlvxSum.v = vec_setzero(); + v_curlvySum.v = vec_setzero(); + v_curlvzSum.v = vec_setzero(); - vector pix, piy, piz; + vector v_pix, v_piy, v_piz; /* Convert exit iteration to cache indices. */ int exit_iteration_align = exit_iteration - first_pi; @@ -1223,14 +1250,14 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, vector v_dx, v_dy, v_dz, v_r2; /* Load 2 sets of vectors from the particle cache. */ - pix.v = vec_load(&ci_cache->x[ci_cache_idx]); - piy.v = vec_load(&ci_cache->y[ci_cache_idx]); - piz.v = vec_load(&ci_cache->z[ci_cache_idx]); + v_pix.v = vec_load(&ci_cache->x[ci_cache_idx]); + v_piy.v = vec_load(&ci_cache->y[ci_cache_idx]); + v_piz.v = vec_load(&ci_cache->z[ci_cache_idx]); /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pjx.v, pix.v); - v_dy.v = vec_sub(pjy.v, piy.v); - v_dz.v = vec_sub(pjz.v, piz.v); + v_dx.v = vec_sub(v_pjx.v, v_pix.v); + v_dy.v = vec_sub(v_pjy.v, v_piy.v); + v_dz.v = vec_sub(v_pjz.v, v_piz.v); v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); @@ -1250,22 +1277,22 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, runner_iact_nonsym_1_vec_density( &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz, &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], - &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doj_mask); + &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &v_rhoSum, + &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, + &v_curlvySum, &v_curlvzSum, v_doj_mask); } /* loop over the parts in ci. */ /* Perform horizontal adds on vector sums and store result in particle pj. */ - VEC_HADD(rhoSum, pj->rho); - VEC_HADD(rho_dhSum, pj->density.rho_dh); - VEC_HADD(wcountSum, pj->density.wcount); - VEC_HADD(wcount_dhSum, pj->density.wcount_dh); - VEC_HADD(div_vSum, pj->density.div_v); - VEC_HADD(curlvxSum, pj->density.rot_v[0]); - VEC_HADD(curlvySum, pj->density.rot_v[1]); - VEC_HADD(curlvzSum, pj->density.rot_v[2]); + VEC_HADD(v_rhoSum, pj->rho); + VEC_HADD(v_rho_dhSum, pj->density.rho_dh); + VEC_HADD(v_wcountSum, pj->density.wcount); + VEC_HADD(v_wcount_dhSum, pj->density.wcount_dh); + VEC_HADD(v_div_vSum, pj->density.div_v); + VEC_HADD(v_curlvxSum, pj->density.rot_v[0]); + VEC_HADD(v_curlvySum, pj->density.rot_v[1]); + VEC_HADD(v_curlvzSum, pj->density.rot_v[2]); } /* loop over the parts in cj. */ } @@ -1282,6 +1309,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, * @param r The #runner. * @param ci The first #cell. * @param cj The second #cell. + * @param sid The direction of the pair + * @param shift The shift vector to apply to the particles in ci. */ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, struct cell *cj, const int sid, @@ -1415,12 +1444,12 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, const float hig2 = hi * hi * kernel_gamma2; - vector pix, piy, piz; + vector v_pix, v_piy, v_piz; /* Fill particle pi vectors. */ - pix.v = vec_set1(ci_cache->x[ci_cache_idx]); - piy.v = vec_set1(ci_cache->y[ci_cache_idx]); - piz.v = vec_set1(ci_cache->z[ci_cache_idx]); + v_pix.v = vec_set1(ci_cache->x[ci_cache_idx]); + v_piy.v = vec_set1(ci_cache->y[ci_cache_idx]); + v_piz.v = vec_set1(ci_cache->z[ci_cache_idx]); v_hi.v = vec_set1(hi); v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]); v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]); @@ -1435,19 +1464,19 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, v_hig2.v = vec_set1(hig2); /* Reset cumulative sums of update vectors. */ - vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, - entropy_dtSum; + vector v_a_hydro_xSum, v_a_hydro_ySum, v_a_hydro_zSum, v_h_dtSum, v_sigSum, + v_entropy_dtSum; /* Get the inverse of hi. */ vector v_hi_inv; v_hi_inv = vec_reciprocal(v_hi); - a_hydro_xSum.v = vec_setzero(); - a_hydro_ySum.v = vec_setzero(); - a_hydro_zSum.v = vec_setzero(); - h_dtSum.v = vec_setzero(); + v_a_hydro_xSum.v = vec_setzero(); + v_a_hydro_ySum.v = vec_setzero(); + v_a_hydro_zSum.v = vec_setzero(); + v_h_dtSum.v = vec_setzero(); v_sigSum.v = vec_set1(pi->force.v_sig); - entropy_dtSum.v = vec_setzero(); + v_entropy_dtSum.v = vec_setzero(); /* Pad the exit iteration if there is a serial remainder. */ int exit_iteration_align = exit_iteration; @@ -1459,7 +1488,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, exit_iteration_align += pad; } - vector pjx, pjy, pjz, hj, hjg2; + vector v_pjx, v_pjy, v_pjz, hj, hjg2; /* Loop over the parts in cj. */ for (int pjd = 0; pjd <= exit_iteration_align; pjd += VEC_SIZE) { @@ -1478,16 +1507,16 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, #endif /* Load 2 sets of vectors from the particle cache. */ - pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); - pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); - pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); + v_pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); + v_pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); + v_pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); hj.v = vec_load(&cj_cache->h[cj_cache_idx]); hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pix.v, pjx.v); - v_dy.v = vec_sub(piy.v, pjy.v); - v_dz.v = vec_sub(piz.v, pjz.v); + v_dx.v = vec_sub(v_pix.v, v_pjx.v); + v_dy.v = vec_sub(v_piy.v, v_pjy.v); + v_dz.v = vec_sub(v_piz.v, v_pjz.v); v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); @@ -1517,20 +1546,20 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, &cj_cache->grad_h[cj_cache_idx], &cj_cache->pOrho2[cj_cache_idx], &cj_cache->balsara[cj_cache_idx], &cj_cache->soundspeed[cj_cache_idx], &cj_cache->m[cj_cache_idx], - v_hi_inv, v_hj_inv, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, - &h_dtSum, &v_sigSum, &entropy_dtSum, v_doi_mask); + v_hi_inv, v_hj_inv, &v_a_hydro_xSum, &v_a_hydro_ySum, &v_a_hydro_zSum, + &v_h_dtSum, &v_sigSum, &v_entropy_dtSum, v_doi_mask); } } /* loop over the parts in cj. */ /* Perform horizontal adds on vector sums and store result in particle pi. */ - VEC_HADD(a_hydro_xSum, pi->a_hydro[0]); - VEC_HADD(a_hydro_ySum, pi->a_hydro[1]); - VEC_HADD(a_hydro_zSum, pi->a_hydro[2]); - VEC_HADD(h_dtSum, pi->force.h_dt); + VEC_HADD(v_a_hydro_xSum, pi->a_hydro[0]); + VEC_HADD(v_a_hydro_ySum, pi->a_hydro[1]); + VEC_HADD(v_a_hydro_zSum, pi->a_hydro[2]); + VEC_HADD(v_h_dtSum, pi->force.h_dt); VEC_HMAX(v_sigSum, pi->force.v_sig); - VEC_HADD(entropy_dtSum, pi->entropy_dt); + VEC_HADD(v_entropy_dtSum, pi->entropy_dt); } /* loop over the parts in ci. */ } @@ -1558,14 +1587,14 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, const float hjg2 = hj * hj * kernel_gamma2; - vector pjx, pjy, pjz; + vector v_pjx, v_pjy, v_pjz; vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2; vector v_rhoj, v_grad_hj, v_pOrhoj2, v_balsara_j, v_cj; /* Fill particle pi vectors. */ - pjx.v = vec_set1(cj_cache->x[cj_cache_idx]); - pjy.v = vec_set1(cj_cache->y[cj_cache_idx]); - pjz.v = vec_set1(cj_cache->z[cj_cache_idx]); + v_pjx.v = vec_set1(cj_cache->x[cj_cache_idx]); + v_pjy.v = vec_set1(cj_cache->y[cj_cache_idx]); + v_pjz.v = vec_set1(cj_cache->z[cj_cache_idx]); v_hj.v = vec_set1(hj); v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]); v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]); @@ -1580,20 +1609,20 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, v_hjg2.v = vec_set1(hjg2); /* Reset cumulative sums of update vectors. */ - vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, - entropy_dtSum; + vector v_a_hydro_xSum, v_a_hydro_ySum, v_a_hydro_zSum, v_h_dtSum, v_sigSum, + v_entropy_dtSum; /* Get the inverse of hj. */ vector v_hj_inv; v_hj_inv = vec_reciprocal(v_hj); - a_hydro_xSum.v = vec_setzero(); - a_hydro_ySum.v = vec_setzero(); - a_hydro_zSum.v = vec_setzero(); - h_dtSum.v = vec_setzero(); + v_a_hydro_xSum.v = vec_setzero(); + v_a_hydro_ySum.v = vec_setzero(); + v_a_hydro_zSum.v = vec_setzero(); + v_h_dtSum.v = vec_setzero(); v_sigSum.v = vec_set1(pj->force.v_sig); - entropy_dtSum.v = vec_setzero(); + v_entropy_dtSum.v = vec_setzero(); /* Convert exit iteration to cache indices. */ int exit_iteration_align = exit_iteration - first_pi; @@ -1605,7 +1634,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, } else exit_iteration_align -= rem; - vector pix, piy, piz, hi, hig2; + vector v_pix, v_piy, v_piz, hi, hig2; /* Loop over the parts in ci. */ for (int ci_cache_idx = exit_iteration_align; @@ -1620,16 +1649,16 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, vector v_dx, v_dy, v_dz; /* Load 2 sets of vectors from the particle cache. */ - pix.v = vec_load(&ci_cache->x[ci_cache_idx]); - piy.v = vec_load(&ci_cache->y[ci_cache_idx]); - piz.v = vec_load(&ci_cache->z[ci_cache_idx]); + v_pix.v = vec_load(&ci_cache->x[ci_cache_idx]); + v_piy.v = vec_load(&ci_cache->y[ci_cache_idx]); + v_piz.v = vec_load(&ci_cache->z[ci_cache_idx]); hi.v = vec_load(&ci_cache->h[ci_cache_idx]); hig2.v = vec_mul(vec_mul(hi.v, hi.v), kernel_gamma2_vec.v); /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pjx.v, pix.v); - v_dy.v = vec_sub(pjy.v, piy.v); - v_dz.v = vec_sub(pjz.v, piz.v); + v_dx.v = vec_sub(v_pjx.v, v_pix.v); + v_dy.v = vec_sub(v_pjy.v, v_piy.v); + v_dz.v = vec_sub(v_pjz.v, v_piz.v); v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); @@ -1659,19 +1688,19 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, &ci_cache->grad_h[ci_cache_idx], &ci_cache->pOrho2[ci_cache_idx], &ci_cache->balsara[ci_cache_idx], &ci_cache->soundspeed[ci_cache_idx], &ci_cache->m[ci_cache_idx], - v_hj_inv, v_hi_inv, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, - &h_dtSum, &v_sigSum, &entropy_dtSum, v_doj_mask); + v_hj_inv, v_hi_inv, &v_a_hydro_xSum, &v_a_hydro_ySum, &v_a_hydro_zSum, + &v_h_dtSum, &v_sigSum, &v_entropy_dtSum, v_doj_mask); } } /* loop over the parts in ci. */ /* Perform horizontal adds on vector sums and store result in particle pj. */ - VEC_HADD(a_hydro_xSum, pj->a_hydro[0]); - VEC_HADD(a_hydro_ySum, pj->a_hydro[1]); - VEC_HADD(a_hydro_zSum, pj->a_hydro[2]); - VEC_HADD(h_dtSum, pj->force.h_dt); + VEC_HADD(v_a_hydro_xSum, pj->a_hydro[0]); + VEC_HADD(v_a_hydro_ySum, pj->a_hydro[1]); + VEC_HADD(v_a_hydro_zSum, pj->a_hydro[2]); + VEC_HADD(v_h_dtSum, pj->force.h_dt); VEC_HMAX(v_sigSum, pj->force.v_sig); - VEC_HADD(entropy_dtSum, pj->entropy_dt); + VEC_HADD(v_entropy_dtSum, pj->entropy_dt); } /* loop over the parts in cj. */