diff --git a/src/cache.h b/src/cache.h index cb0f1c62d3473630d8125a9a2c47b28b5a852501..1b675e1cc0da5daab53ef14fa06106eca186bd15 100644 --- a/src/cache.h +++ b/src/cache.h @@ -198,8 +198,8 @@ __attribute__((always_inline)) INLINE void cache_read_particles( swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); const struct part *restrict parts = ci->parts; - const double loc[3] = {ci->loc[0], ci->loc[1],ci->loc[2]}; - + const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]}; + /* Shift the particles positions to a local frame so single precision can be * used instead of double precision. */ for (int i = 0; i < ci->count; i++) { @@ -250,7 +250,7 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( SWIFT_CACHE_ALIGNMENT); const struct part *restrict parts = ci->parts; - const double loc[3] = {ci->loc[0], ci->loc[1],ci->loc[2]}; + const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]}; /* Shift the particles positions to a local frame so single precision can be * used instead of double precision. */ @@ -296,7 +296,6 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( const struct entry *restrict sort_j, const double *restrict const shift, int *first_pi, int *last_pj) { - int idx; /* Pad number of particles read to the vector size. */ int rem = (ci->count - *first_pi) % VEC_SIZE; if (rem != 0) { @@ -312,17 +311,17 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( if (*last_pj + pad < cj->count) *last_pj += pad; } + /* Get some local pointers */ const int first_pi_align = *first_pi; const int last_pj_align = *last_pj; const struct part *restrict parts_i = ci->parts; const struct part *restrict parts_j = cj->parts; /* Shift particles to the local frame and account for boundary conditions.*/ - const double total_ci_shift[3] = {cj->loc[0] + shift[0], cj->loc[1] + shift[1], - cj->loc[2] + shift[2]}; - const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], - cj->loc[2]}; - + const double total_ci_shift[3] = { + cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]}; + const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]}; + /* Let the compiler know that the data is aligned and create pointers to the * arrays inside the cache. */ swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); @@ -335,16 +334,11 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); int ci_cache_count = ci->count - first_pi_align; - + /* Shift the particles positions to a local frame (ci frame) so single - * precision - * can be - * used instead of double precision. Also shift the cell ci, particles - * positions - * due to BCs but leave cell cj. */ + * precision can be used instead of double precision. */ for (int i = 0; i < ci_cache_count; i++) { - /* Make sure ci_cache is filled from the first element. */ - idx = sort_i[i + first_pi_align].i; + const int idx = sort_i[i + first_pi_align].i; x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]); y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]); z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]); @@ -371,30 +365,31 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( "is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. x=%f, ci->width[0]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], cj->loc[2], i, x[i], - ci->width[0]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, x[i], ci->width[0]); if (y[i] > shift_threshold_y || y[i] < -shift_threshold_y) error( "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d y pos " "is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. y=%f, ci->width[1]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], cj->loc[2], i, y[i], - ci->width[1]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, y[i], ci->width[1]); if (z[i] > shift_threshold_z || z[i] < -shift_threshold_z) error( "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d z pos " "is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. z=%f, ci->width[2]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], cj->loc[2], i, z[i], - ci->width[2]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, z[i], ci->width[2]); } #endif /* Pad cache with fake particles that exist outside the cell so will not - * interact.*/ - const float max_dx = max(ci->dx_max_part, cj->dx_max_part); + * interact. We use values of the same magnitude (but negative!) as the real + * particles to avoid overflow problems. */ + const double max_dx = max(ci->dx_max_part, cj->dx_max_part); const float pos_padded[3] = {-(2. * ci->width[0] + max_dx), -(2. * ci->width[1] + max_dx), -(2. * ci->width[2] + max_dx)}; @@ -425,7 +420,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT); for (int i = 0; i <= last_pj_align; i++) { - idx = sort_j[i].i; + const int idx = sort_j[i].i; xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]); yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]); zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]); @@ -445,29 +440,30 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( "pos is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. xj=%f, ci->width[0]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], cj->loc[2], i, xj[i], - ci->width[0]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, xj[i], ci->width[0]); if (yj[i] > shift_threshold_y || yj[i] < -shift_threshold_y) error( "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d yj " "pos is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. yj=%f, ci->width[1]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], cj->loc[2], i, yj[i], - ci->width[1]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, yj[i], ci->width[1]); if (zj[i] > shift_threshold_z || zj[i] < -shift_threshold_z) error( "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d zj " "pos is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. zj=%f, ci->width[2]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], cj->loc[2], i, zj[i], - ci->width[2]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, zj[i], ci->width[2]); } #endif /* Pad cache with fake particles that exist outside the cell so will not - * interact.*/ + * interact. We use values of the same magnitude (but negative!) as the real + * particles to avoid overflow problems. */ const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx), -(2. * cj->width[1] + max_dx), -(2. * cj->width[2] + max_dx)}; @@ -508,7 +504,6 @@ cache_read_two_partial_cells_sorted_force( const struct entry *restrict sort_i, const struct entry *restrict sort_j, const double *const shift, int *first_pi, int *last_pj) { - int idx; /* Pad number of particles read to the vector size. */ int rem = (ci->count - *first_pi) % VEC_SIZE; if (rem != 0) { @@ -524,16 +519,16 @@ cache_read_two_partial_cells_sorted_force( if (*last_pj + pad < cj->count) *last_pj += pad; } + /* Get some local pointers */ const int first_pi_align = *first_pi; const int last_pj_align = *last_pj; const struct part *restrict parts_i = ci->parts; const struct part *restrict parts_j = cj->parts; - + /* Shift particles to the local frame and account for boundary conditions.*/ - const double total_ci_shift[3] = {cj->loc[0] + shift[0], cj->loc[1] + shift[1], - cj->loc[2] + shift[2]}; - const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], - cj->loc[2]}; + const double total_ci_shift[3] = { + cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]}; + const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]}; /* Let the compiler know that the data is aligned and create pointers to the * arrays inside the cache. */ @@ -557,14 +552,10 @@ cache_read_two_partial_cells_sorted_force( int ci_cache_count = ci->count - first_pi_align; /* Shift the particles positions to a local frame (ci frame) so single - * precision - * can be - * used instead of double precision. Also shift the cell ci, particles - * positions - * due to BCs but leave cell cj. */ + * precision can be used instead of double precision. */ for (int i = 0; i < ci_cache_count; i++) { - /* Make sure ci_cache is filled from the first element. */ - idx = sort_i[i + first_pi_align].i; + + const int idx = sort_i[i + first_pi_align].i; x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]); y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]); z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]); @@ -581,8 +572,9 @@ cache_read_two_partial_cells_sorted_force( } /* Pad cache with fake particles that exist outside the cell so will not - * interact.*/ - const float max_dx = max(ci->dx_max_part, cj->dx_max_part); + * interact. We use values of the same magnitude (but negative!) as the real + * particles to avoid overflow problems. */ + const double max_dx = max(ci->dx_max_part, cj->dx_max_part); const float pos_padded[3] = {-(2. * ci->width[0] + max_dx), -(2. * ci->width[1] + max_dx), -(2. * ci->width[2] + max_dx)}; @@ -626,7 +618,7 @@ cache_read_two_partial_cells_sorted_force( SWIFT_CACHE_ALIGNMENT); for (int i = 0; i <= last_pj_align; i++) { - idx = sort_j[i].i; + const int idx = sort_j[i].i; xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]); yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]); zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]); @@ -643,7 +635,8 @@ cache_read_two_partial_cells_sorted_force( } /* Pad cache with fake particles that exist outside the cell so will not - * interact.*/ + * interact. We use values of the same magnitude (but negative!) as the real + * particles to avoid overflow problems. */ const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx), -(2. * cj->width[1] + max_dx), -(2. * cj->width[2] + max_dx)}; diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 6ecf72c1e7444f4a73cf99045978f748bed67bfc..c9b76db8e863e763437cf53aa621e8b636d2ddad 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -41,9 +41,11 @@ static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2); * gradient update on pi. * @param v_wcountSum (return) #vector holding the cumulative sum of the wcount * update on pi. - * @param v_wcount_dhSum (return) #vector holding the cumulative sum of the wcount + * @param v_wcount_dhSum (return) #vector holding the cumulative sum of the + * wcount * gradient update on pi. - * @param v_div_vSum (return) #vector holding the cumulative sum of the divergence + * @param v_div_vSum (return) #vector holding the cumulative sum of the + * divergence * update on pi. * @param v_curlvxSum (return) #vector holding the cumulative sum of the curl of * vx update on pi. @@ -61,9 +63,9 @@ static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2); __attribute__((always_inline)) INLINE static void calcRemInteractions( struct c2_cache *const int_cache, const int icount, vector *v_rhoSum, vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum, - vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum, vector *v_curlvzSum, - vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz, - int *icount_align) { + vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum, + vector *v_curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy, + vector v_viz, int *icount_align) { mask_t int_mask, int_mask2; @@ -108,8 +110,8 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align], &int_cache->mq[*icount_align], v_rhoSum, v_rho_dhSum, v_wcountSum, - v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, v_curlvzSum, int_mask, - int_mask2, 1); + v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, v_curlvzSum, + int_mask, int_mask2, 1); } } @@ -127,20 +129,25 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( * @param int_cache (return) secondary #cache of interactions between two * particles. * @param icount Interaction count. - * @param v_rhoSum #vector holding the cumulative sum of the density update on pi. + * @param v_rhoSum #vector holding the cumulative sum of the density update on + * pi. * @param v_rho_dhSum #vector holding the cumulative sum of the density gradient * update on pi. * @param v_wcountSum #vector holding the cumulative sum of the wcount update on * pi. - * @param v_wcount_dhSum #vector holding the cumulative sum of the wcount gradient + * @param v_wcount_dhSum #vector holding the cumulative sum of the wcount + * gradient * update on pi. * @param v_div_vSum #vector holding the cumulative sum of the divergence update * on pi. - * @param v_curlvxSum #vector holding the cumulative sum of the curl of vx update + * @param v_curlvxSum #vector holding the cumulative sum of the curl of vx + * update * on pi. - * @param v_curlvySum #vector holding the cumulative sum of the curl of vy update + * @param v_curlvySum #vector holding the cumulative sum of the curl of vy + * update * on pi. - * @param v_curlvzSum #vector holding the cumulative sum of the curl of vz update + * @param v_curlvzSum #vector holding the cumulative sum of the curl of vz + * update * on pi. * @param v_hi_inv #vector of 1/h for pi. * @param v_vix #vector of x velocity of pi. @@ -152,8 +159,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions( vector *v_dz, const struct cache *const cell_cache, struct c2_cache *const int_cache, int *icount, vector *v_rhoSum, vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum, - vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum, vector *v_curlvzSum, - vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) { + vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum, + vector *v_curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy, + vector v_viz) { /* Left-pack values needed into the secondary cache using the interaction mask. */ @@ -203,8 +211,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions( /* Peform remainder interactions. */ calcRemInteractions(int_cache, *icount, v_rhoSum, v_rho_dhSum, v_wcountSum, - v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, v_curlvzSum, - v_hi_inv, v_vix, v_viy, v_viz, &icount_align); + v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, + v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, + &icount_align); mask_t int_mask, int_mask2; vec_init_mask_true(int_mask); @@ -216,8 +225,8 @@ __attribute__((always_inline)) INLINE static void storeInteractions( &int_cache->r2q[j], &int_cache->dxq[j], &int_cache->dyq[j], &int_cache->dzq[j], v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[j], &int_cache->vyq[j], &int_cache->vzq[j], &int_cache->mq[j], v_rhoSum, - v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, - v_curlvzSum, int_mask, int_mask2, 0); + v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum, + v_curlvySum, v_curlvzSum, int_mask, int_mask2, 0); } /* Reset interaction count. */ @@ -574,8 +583,8 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( v_hig2.v = vec_set1(hig2); /* Reset cumulative sums of update vectors. */ - vector v_rhoSum, v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum, - v_curlvySum, v_curlvzSum; + vector v_rhoSum, v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, + v_curlvxSum, v_curlvySum, v_curlvzSum; /* Get the inverse of hi. */ vector v_hi_inv; @@ -671,24 +680,25 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( * cache. */ if (doi_mask) { storeInteractions(doi_mask, pjd, &v_r2, &v_dx, &v_dy, &v_dz, cell_cache, - &int_cache, &icount, &v_rhoSum, &v_rho_dhSum, &v_wcountSum, - &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, &v_curlvySum, - &v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); + &int_cache, &icount, &v_rhoSum, &v_rho_dhSum, + &v_wcountSum, &v_wcount_dhSum, &v_div_vSum, + &v_curlvxSum, &v_curlvySum, &v_curlvzSum, v_hi_inv, + v_vix, v_viy, v_viz); } if (doi_mask2) { storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_2, &v_dy_2, &v_dz_2, cell_cache, &int_cache, &icount, &v_rhoSum, - &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, &v_div_vSum, - &v_curlvxSum, &v_curlvySum, &v_curlvzSum, v_hi_inv, v_vix, - v_viy, v_viz); + &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, + &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum, + v_hi_inv, v_vix, v_viy, v_viz); } } /* Perform padded vector remainder interactions if any are present. */ - calcRemInteractions(&int_cache, icount, &v_rhoSum, &v_rho_dhSum, &v_wcountSum, - &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, &v_curlvySum, - &v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, - &icount_align); + calcRemInteractions(&int_cache, icount, &v_rhoSum, &v_rho_dhSum, + &v_wcountSum, &v_wcount_dhSum, &v_div_vSum, + &v_curlvxSum, &v_curlvySum, &v_curlvzSum, v_hi_inv, + v_vix, v_viy, v_viz, &icount_align); /* Initialise masks to true in case remainder interactions have been * performed. */ @@ -702,9 +712,9 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], - &int_cache.mq[pjd], &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, - &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum, int_mask, int_mask2, - 0); + &int_cache.mq[pjd], &v_rhoSum, &v_rho_dhSum, &v_wcountSum, + &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, &v_curlvySum, + &v_curlvzSum, int_mask, int_mask2, 0); } /* Perform horizontal adds on vector sums and store result in particle pi. @@ -903,8 +913,8 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( &cell_cache->grad_h[pjd], &cell_cache->pOrho2[pjd], &cell_cache->balsara[pjd], &cell_cache->soundspeed[pjd], &cell_cache->m[pjd], v_hi_inv, v_hj_inv, &v_a_hydro_xSum, - &v_a_hydro_ySum, &v_a_hydro_zSum, &v_h_dtSum, &v_sigSum, &v_entropy_dtSum, - v_doi_mask); + &v_a_hydro_ySum, &v_a_hydro_zSum, &v_h_dtSum, &v_sigSum, + &v_entropy_dtSum, v_doi_mask); } } /* Loop over all other particles. */ @@ -1030,8 +1040,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Read the needed particles into the two caches. */ cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i, - sort_j, shift, &first_pi, - &last_pj); + sort_j, shift, &first_pi, &last_pj); /* Get the number of particles read into the ci cache. */ int ci_cache_count = count_i - first_pi; @@ -1073,8 +1082,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, v_hig2.v = vec_set1(hig2); /* Reset cumulative sums of update vectors. */ - vector v_rhoSum, v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum, - v_curlvySum, v_curlvzSum; + vector v_rhoSum, v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, + v_curlvxSum, v_curlvySum, v_curlvzSum; /* Get the inverse of hi. */ vector v_hi_inv; @@ -1113,8 +1122,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #ifdef SWIFT_DEBUG_CHECKS if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 || cj_cache_idx + (VEC_SIZE - 1) > (last_pj + 1 + VEC_SIZE)) { - error("Unaligned read!!! cj_cache_idx=%d, last_pj=%d", - cj_cache_idx, last_pj); + error("Unaligned read!!! cj_cache_idx=%d, last_pj=%d", cj_cache_idx, + last_pj); } #endif @@ -1146,9 +1155,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, runner_iact_nonsym_1_vec_density( &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz, &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], - &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &v_rhoSum, - &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, - &v_curlvySum, &v_curlvzSum, v_doi_mask); + &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], + &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, + &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum, + v_doi_mask); } /* loop over the parts in cj. */ @@ -1203,8 +1213,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, v_hjg2.v = vec_set1(hjg2); /* Reset cumulative sums of update vectors. */ - vector v_rhoSum, v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum, - v_curlvySum, v_curlvzSum; + vector v_rhoSum, v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, + v_curlvxSum, v_curlvySum, v_curlvzSum; /* Get the inverse of hj. */ vector v_hj_inv; @@ -1238,8 +1248,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #ifdef SWIFT_DEBUG_CHECKS if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0 || - ci_cache_idx + (VEC_SIZE - 1) > - (count_i - first_pi + VEC_SIZE)) { + ci_cache_idx + (VEC_SIZE - 1) > (count_i - first_pi + VEC_SIZE)) { error( "Unaligned read!!! ci_cache_idx=%d, first_pi=%d, " "count_i=%d", @@ -1277,9 +1286,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, runner_iact_nonsym_1_vec_density( &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz, &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], - &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &v_rhoSum, - &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, - &v_curlvySum, &v_curlvzSum, v_doj_mask); + &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], + &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, + &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum, + v_doj_mask); } /* loop over the parts in ci. */ @@ -1351,7 +1361,8 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, /* Check if any particles are active and return if there are none. */ int numActive = 0; - /* Use the largest smoothing length to make sure that no interactions are missed. */ + /* Use the largest smoothing length to make sure that no interactions are + * missed. */ const double h_max = max(hi_max, hj_max); if (active_ci) { @@ -1416,8 +1427,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, /* Read the needed particles into the two caches. */ cache_read_two_partial_cells_sorted_force(ci, cj, ci_cache, cj_cache, sort_i, - sort_j, shift, &first_pi, - &last_pj); + sort_j, shift, &first_pi, &last_pj); /* Get the number of particles read into the ci cache. */ int ci_cache_count = count_i - first_pi; @@ -1465,8 +1475,8 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, v_hig2.v = vec_set1(hig2); /* Reset cumulative sums of update vectors. */ - vector v_a_hydro_xSum, v_a_hydro_ySum, v_a_hydro_zSum, v_h_dtSum, v_sigSum, - v_entropy_dtSum; + vector v_a_hydro_xSum, v_a_hydro_ySum, v_a_hydro_zSum, v_h_dtSum, + v_sigSum, v_entropy_dtSum; /* Get the inverse of hi. */ vector v_hi_inv; @@ -1502,8 +1512,8 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, #ifdef SWIFT_DEBUG_CHECKS if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 || cj_cache_idx + (VEC_SIZE - 1) > (last_pj + 1 + VEC_SIZE)) { - error("Unaligned read!!! cj_cache_idx=%d, last_pj=%d", - cj_cache_idx, last_pj); + error("Unaligned read!!! cj_cache_idx=%d, last_pj=%d", cj_cache_idx, + last_pj); } #endif @@ -1547,8 +1557,9 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, &cj_cache->grad_h[cj_cache_idx], &cj_cache->pOrho2[cj_cache_idx], &cj_cache->balsara[cj_cache_idx], &cj_cache->soundspeed[cj_cache_idx], &cj_cache->m[cj_cache_idx], - v_hi_inv, v_hj_inv, &v_a_hydro_xSum, &v_a_hydro_ySum, &v_a_hydro_zSum, - &v_h_dtSum, &v_sigSum, &v_entropy_dtSum, v_doi_mask); + v_hi_inv, v_hj_inv, &v_a_hydro_xSum, &v_a_hydro_ySum, + &v_a_hydro_zSum, &v_h_dtSum, &v_sigSum, &v_entropy_dtSum, + v_doi_mask); } } /* loop over the parts in cj. */ @@ -1610,8 +1621,8 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, v_hjg2.v = vec_set1(hjg2); /* Reset cumulative sums of update vectors. */ - vector v_a_hydro_xSum, v_a_hydro_ySum, v_a_hydro_zSum, v_h_dtSum, v_sigSum, - v_entropy_dtSum; + vector v_a_hydro_xSum, v_a_hydro_ySum, v_a_hydro_zSum, v_h_dtSum, + v_sigSum, v_entropy_dtSum; /* Get the inverse of hj. */ vector v_hj_inv; @@ -1689,8 +1700,9 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, &ci_cache->grad_h[ci_cache_idx], &ci_cache->pOrho2[ci_cache_idx], &ci_cache->balsara[ci_cache_idx], &ci_cache->soundspeed[ci_cache_idx], &ci_cache->m[ci_cache_idx], - v_hj_inv, v_hi_inv, &v_a_hydro_xSum, &v_a_hydro_ySum, &v_a_hydro_zSum, - &v_h_dtSum, &v_sigSum, &v_entropy_dtSum, v_doj_mask); + v_hj_inv, v_hi_inv, &v_a_hydro_xSum, &v_a_hydro_ySum, + &v_a_hydro_zSum, &v_h_dtSum, &v_sigSum, &v_entropy_dtSum, + v_doj_mask); } } /* loop over the parts in ci. */