diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 6888affef697238567f4fd60706392eba2724627..5c29f1d6feb4c2bc542397ef2f0d1b9ae2b49a65 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -553,16 +553,13 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( * the cache if it is not big enough for the cell. */ struct cache *restrict cell_cache = &r->ci_cache; - if (cell_cache->count < count) { - cache_init(cell_cache, count); - } + if (cell_cache->count < count) cache_init(cell_cache, count); /* Read the particles from the cell and store them locally in the cache. */ cache_read_particles(c, cell_cache); /* Create secondary cache to store particle interactions. */ struct c2_cache int_cache; - int icount = 0, icount_align = 0; /* Loop over the particles in the cell. */ for (int pid = 0; pid < count; pid++) { @@ -615,6 +612,10 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( } } + /* The number of interactions for pi and the padded version of it to + * make it a multiple of VEC_SIZE. */ + int icount = 0, icount_align = 0; + /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ for (int pjd = 0; pjd < count_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { @@ -748,16 +749,13 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec( * the cache if it is not big enough for the cell. */ struct cache *restrict cell_cache = &r->ci_cache; - if (cell_cache->count < count) { - cache_init(cell_cache, count); - } + if (cell_cache->count < count) cache_init(cell_cache, count); /* Read the particles from the cell and store them locally in the cache. */ cache_read_particles(c, cell_cache); /* Create secondary cache to store particle interactions. */ struct c2_cache int_cache; - int icount = 0, icount_align = 0; /* Loop over the subset of particles in the parts that need updating. */ for (int pid = 0; pid < pi_count; pid++) { @@ -814,6 +812,10 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec( } } + /* The number of interactions for pi and the padded version of it to + * make it a multiple of VEC_SIZE. */ + int icount = 0, icount_align = 0; + /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ for (int pjd = 0; pjd < count_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { @@ -938,11 +940,7 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( #ifdef WITH_VECTORIZATION const struct engine *e = r->e; - struct part *restrict pi; - int count_align; - const timebin_t max_active_bin = e->max_active_bin; - struct part *restrict parts = c->parts; const int count = c->count; @@ -964,9 +962,7 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( * the cache if it is not big enough for the cell. */ struct cache *restrict cell_cache = &r->ci_cache; - if (cell_cache->count < count) { - cache_init(cell_cache, count); - } + if (cell_cache->count < count) cache_init(cell_cache, count); /* Read the particles from the cell and store them locally in the cache. */ cache_read_force_particles(c, cell_cache); @@ -975,7 +971,7 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( for (int pid = 0; pid < count; pid++) { /* Get a pointer to the ith particle. */ - pi = &parts[pid]; + struct part *restrict pi = &parts[pid]; /* Is the ith particle active? */ if (!part_is_active_no_debug(pi, max_active_bin)) continue; @@ -1012,7 +1008,7 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( vector v_entropy_dtSum = vector_setzero(); /* Pad cache if there is a serial remainder. */ - count_align = count; + int count_align = count; int rem = count % VEC_SIZE; if (rem != 0) { int pad = VEC_SIZE - rem; @@ -1188,16 +1184,12 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cache *restrict ci_cache = &r->ci_cache; struct cache *restrict cj_cache = &r->cj_cache; - if (ci_cache->count < count_i) { - cache_init(ci_cache, count_i); - } - if (cj_cache->count < count_j) { - cache_init(cj_cache, count_j); - } + if (ci_cache->count < count_i) cache_init(ci_cache, count_i); + if (cj_cache->count < count_j) cache_init(cj_cache, count_j); int first_pi, last_pj; - int *max_index_i __attribute__((aligned(sizeof(int) * VEC_SIZE))); - int *max_index_j __attribute__((aligned(sizeof(int) * VEC_SIZE))); + int *restrict max_index_i SWIFT_CACHE_ALIGN; + int *restrict max_index_j SWIFT_CACHE_ALIGN; max_index_i = r->ci_cache.max_index; max_index_j = r->cj_cache.max_index; @@ -1210,8 +1202,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, &first_pi, &last_pj, max_active_bin); /* Limits of the outer loops. */ - int first_pi_loop = first_pi; - int last_pj_loop = last_pj; + const int first_pi_loop = first_pi; + const int last_pj_loop_end = last_pj + 1; /* Take the max/min of both values calculated to work out how many particles * to read into the cache. */ @@ -1244,7 +1236,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (di_test < dj_min) continue; /* Determine the exit iteration of the interaction loop. */ - const int exit_iteration = max_index_i[pid]; + const int exit_iteration_end = max_index_i[pid] + 1; /* Fill particle pi vectors. */ const vector v_pix = vector_set1(ci_cache->x[ci_cache_idx]); @@ -1274,7 +1266,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Loop over the parts in cj. Making sure to perform an iteration of the * loop even if exit_iteration_align is zero and there is only one * particle to interact with.*/ - for (int pjd = 0; pjd <= exit_iteration; pjd += VEC_SIZE) { + for (int pjd = 0; pjd < exit_iteration_end; pjd += VEC_SIZE) { /* Get the cache index to the jth particle. */ const int cj_cache_idx = pjd; @@ -1336,7 +1328,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (active_cj) { /* Loop over the parts in cj until nothing is within range in ci. */ - for (int pjd = 0; pjd <= last_pj_loop; pjd++) { + for (int pjd = 0; pjd < last_pj_loop_end; pjd++) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i]; @@ -1543,16 +1535,12 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, struct cache *restrict ci_cache = &r->ci_cache; struct cache *restrict cj_cache = &r->cj_cache; - if (ci_cache->count < count_i) { - cache_init(ci_cache, count_i); - } - if (cj_cache->count < count_j) { - cache_init(cj_cache, count_j); - } + if (ci_cache->count < count_i) cache_init(ci_cache, count_i); + if (cj_cache->count < count_j) cache_init(cj_cache, count_j); int first_pi, last_pj; - int *max_index_i SWIFT_CACHE_ALIGN; - int *max_index_j SWIFT_CACHE_ALIGN; + int *restrict max_index_i SWIFT_CACHE_ALIGN; + int *restrict max_index_j SWIFT_CACHE_ALIGN; max_index_i = r->ci_cache.max_index; max_index_j = r->cj_cache.max_index; @@ -1567,7 +1555,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, /* Limits of the outer loops. */ const int first_pi_loop = first_pi; - const int last_pj_loop = last_pj; + const int last_pj_loop_end = last_pj + 1; /* Take the max/min of both values calculated to work out how many particles * to read into the cache. */ @@ -1600,7 +1588,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, if (di_test < dj_min) continue; /* Determine the exit iteration of the interaction loop. */ - const int exit_iteration = max_index_i[pid]; + const int exit_iteration_end = max_index_i[pid] + 1; /* Fill particle pi vectors. */ const vector v_pix = vector_set1(ci_cache->x[ci_cache_idx]); @@ -1633,7 +1621,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, /* Loop over the parts in cj. Making sure to perform an iteration of the * loop even if exit_iteration_align is zero and there is only one * particle to interact with.*/ - for (int pjd = 0; pjd <= exit_iteration; pjd += VEC_SIZE) { + for (int pjd = 0; pjd < exit_iteration_end; pjd += VEC_SIZE) { /* Get the cache index to the jth particle. */ const int cj_cache_idx = pjd; @@ -1705,7 +1693,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, if (active_cj) { /* Loop over the parts in cj until nothing is within range in ci. */ - for (int pjd = 0; pjd <= last_pj_loop; pjd++) { + for (int pjd = 0; pjd < last_pj_loop_end; pjd++) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i];