diff --git a/src/cache.h b/src/cache.h index 1b675e1cc0da5daab53ef14fa06106eca186bd15..3eb1e194dd4232319ac1d4a4323ca8099f044063 100644 --- a/src/cache.h +++ b/src/cache.h @@ -296,11 +296,16 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( const struct entry *restrict sort_j, const double *restrict const shift, int *first_pi, int *last_pj) { - /* Pad number of particles read to the vector size. */ + /* Make the number of particles to be read a multiple of the vector size. + * This eliminates serial remainder loops where possible when populating the + * cache. */ + + /* Is the number of particles to read a multiple of the vector size? */ int rem = (ci->count - *first_pi) % VEC_SIZE; if (rem != 0) { int pad = VEC_SIZE - rem; + /* Decrease first_pi if there are particles in the cell left to read. */ if (*first_pi - pad >= 0) *first_pi -= pad; } @@ -308,6 +313,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( if (rem != 0) { int pad = VEC_SIZE - rem; + /* Increase last_pj if there are particles in the cell left to read. */ if (*last_pj + pad < cj->count) *last_pj += pad; } @@ -504,11 +510,16 @@ cache_read_two_partial_cells_sorted_force( const struct entry *restrict sort_i, const struct entry *restrict sort_j, const double *const shift, int *first_pi, int *last_pj) { - /* Pad number of particles read to the vector size. */ + /* Make the number of particles to be read a multiple of the vector size. + * This eliminates serial remainder loops where possible when populating the + * cache. */ + + /* Is the number of particles to read a multiple of the vector size? */ int rem = (ci->count - *first_pi) % VEC_SIZE; if (rem != 0) { int pad = VEC_SIZE - rem; + /* Decrease first_pi if there are particles in the cell left to read. 
*/ if (*first_pi - pad >= 0) *first_pi -= pad; } @@ -516,6 +527,7 @@ cache_read_two_partial_cells_sorted_force( if (rem != 0) { int pad = VEC_SIZE - rem; + /* Increase last_pj if there are particles in the cell left to read. */ if (*last_pj + pad < cj->count) *last_pj += pad; } diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 96bc4d049cbb773751409c6e91ba7196e88e1162..4403513177a6ce59a765bef4e277a8c8457ae356 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -297,7 +297,8 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( const float first_di = sort_i[first_pi].d + pi->h * kernel_gamma + dx_max - rshift; - /* Loop through particles in cell j until they are not in range of pi. */ + /* Loop through particles in cell j until they are not in range of pi. + * Make sure that temp stays between 0 and cj->count - 1. */ while (temp < cj->count - 1 && first_di > sort_j[temp].d) temp++; max_index_i[first_pi] = temp; @@ -309,6 +310,7 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( const float di = sort_i[i].d + pi->h * kernel_gamma + dx_max - rshift; + /* Make sure that temp stays between 0 and cj->count - 1. */ while (temp < cj->count - 1 && di > sort_j[temp].d) temp++; max_index_i[i] = temp; @@ -438,7 +440,8 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj, max(pi->h, hj_max_raw) * kernel_gamma + dx_max - rshift; - /* Loop through particles in cell j until they are not in range of pi. */ + /* Loop through particles in cell j until they are not in range of pi. 
+ * Make sure that temp stays between 0 and cj->count - 1. */ while (temp < cj->count - 1 && first_di > sort_j[temp].d) temp++; max_index_i[first_pi] = temp; @@ -451,6 +454,7 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj, const float di = sort_i[i].d + max(pi->h, hj_max_raw) * kernel_gamma + dx_max - rshift; + /* Make sure that temp stays between 0 and cj->count - 1. */ while (temp < cj->count - 1 && di > sort_j[temp].d) temp++; max_index_i[i] = temp; @@ -1092,7 +1096,9 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, exit_iteration_align += pad; } - /* Loop over the parts in cj. */ + /* Loop over the parts in cj. Make sure to perform an iteration of the + * loop even if exit_iteration_align is zero and there is only one + * particle to interact with. */ for (int pjd = 0; pjd <= exit_iteration_align; pjd += VEC_SIZE) { /* Get the cache index to the jth particle. */ @@ -1473,7 +1479,9 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, exit_iteration_align += pad; } - /* Loop over the parts in cj. */ + /* Loop over the parts in cj. Make sure to perform an iteration of the + * loop even if exit_iteration_align is zero and there is only one + * particle to interact with. */ for (int pjd = 0; pjd <= exit_iteration_align; pjd += VEC_SIZE) { /* Get the cache index to the jth particle. */