Skip to content
Snippets Groups Projects
Commit 7299e047 authored by James Willis's avatar James Willis
Browse files

Merge branch 'AVX512-Fixes' of gitlab.cosma.dur.ac.uk:swift/swiftsim into AVX512-Fixes

Conflicts:
	src/runner_doiact_vec.c
	tests/benchmarkInteractions.c
parents 814df924 f4c1af2e
No related branches found
No related tags found
1 merge request!396Avx512 fixes
......@@ -64,7 +64,7 @@ struct cache {
float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
/* Maximum distance of particles into neighbouring cell. */
float *restrict max_d __attribute__((aligned(CACHE_ALIGN)));
int *restrict max_d __attribute__((aligned(CACHE_ALIGN)));
/* Cache size. */
int count;
......@@ -114,6 +114,7 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
if (rem > 0) pad += VEC_SIZE - rem;
unsigned int sizeBytes = (count + pad) * sizeof(float);
unsigned int sizeIntBytes = (count + pad) * sizeof(int);
int error = 0;
/* Free memory if cache has already been allocated. */
......@@ -137,7 +138,7 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeIntBytes);
if (error != 0)
error("Couldn't allocate cache, no. of particles: %d", (int)count);
......
......@@ -256,63 +256,78 @@ __attribute__((always_inline)) INLINE static void populate_max_d_no_cache(
const struct entry *restrict sort_i, const struct entry *restrict sort_j,
const float dx_max, const float rshift, const double hi_max,
const double hj_max, const double di_max, const double dj_min,
float *max_di, float *max_dj, int *init_pi, int *init_pj,
int *max_index_i, int *max_index_j, int *init_pi, int *init_pj,
const struct engine *e) {
const struct part *restrict parts_i = ci->parts;
const struct part *restrict parts_j = cj->parts;
int first_pi = 0, last_pj = cj->count - 1;
int temp;
/* Find the first active particle in ci to interact with any particle in cj.
*/
/* Populate max_di with distances. */
int active_id = ci->count - 1;
for (int k = ci->count - 1; k >= 0; k--) {
const struct part *pi = &parts_i[sort_i[k].i];
const float d = sort_i[k].d + dx_max;
max_di[k] = d + hi_max;
/* If the particle is out of range set the index to
* the last active particle within range. */
if (d + hi_max < dj_min) {
first_pi = active_id;
break;
} else {
if (part_is_active(pi, e)) active_id = k;
}
}
/* Find the leftmost particle in cell i that interacts with any particle in cell j. */
first_pi = ci->count;
while(first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min)
first_pi--;
/* Find the maximum distance of pi particles into cj.*/
for (int k = first_pi + 1; k < ci->count; k++) {
max_di[k] = fmaxf(max_di[k - 1], max_di[k]);
}
/* Find the maximum index into cell j for each particle in range in cell i. */
if(first_pi < ci->count) {
/* Find the last particle in cj to interact with any particle in ci. */
/* Populate max_dj with distances. */
active_id = 0;
for (int k = 0; k < cj->count; k++) {
const struct part *pj = &parts_j[sort_j[k].i];
const float d = sort_j[k].d - dx_max;
/* Start from the first particle in cell j. */
temp = 0;
/*TODO: don't think rshift should be taken off here, waiting on Pedro. */
// max_dj[k] = d - h * kernel_gamma - rshift;
max_dj[k] = d - hj_max;
const struct part *pi = &parts_i[sort_i[first_pi].i];
/* Loop through particles in cell j until they are not in range of pi. */
while(temp <= cj->count && sort_i[first_pi].d + (pi->h * kernel_gamma + dx_max - rshift) > sort_j[temp].d)
temp++;
max_index_i[first_pi] = temp;
/* Populate max_index_i for remaining particles that are within range. */
for(int i = first_pi + 1; i<ci->count; i++) {
temp = max_index_i[i - 1];
while(temp <= cj->count && sort_i[i].d + (pi->h * kernel_gamma + dx_max - rshift) > sort_j[temp].d)
temp++;
max_index_i[i] = temp;
/* If the particle is out of range set the index to
* the last active particle within range. */
if (d - hj_max > di_max) {
last_pj = active_id;
break;
} else {
if (part_is_active(pj, e)) active_id = k;
}
}
/* Find the maximum distance of pj particles into ci.*/
for (int k = 1; k <= last_pj; k++) {
max_dj[k] = fmaxf(max_dj[k - 1], max_dj[k]);
/* Find the rightmost particle in cell j that interacts with any particle in cell i. */
last_pj = 0;
while(last_pj < cj->count && sort_j[last_pj].d - hj_max - dx_max < di_max)
last_pj++;
/* Find the maximum index into cell i for each particle in range in cell j. */
if(last_pj > 0 ) {
/* Decrement to make sure that we checking that correct particle. */
last_pj--;
/* Start from the last particle in cell i. */
temp = ci->count - 1;
const struct part *pj = &parts_j[sort_j[last_pj].i];
/* Loop through particles in cell i until they are not in range of pj. */
while(temp >= 0 && sort_j[last_pj].d - dx_max - (pj->h * kernel_gamma) < sort_i[temp].d - rshift)
temp--;
max_index_j[last_pj] = temp;
/* Populate max_index_j for remaining particles that are within range. */
for(int i = last_pj - 1; i>=0; i--) {
temp = max_index_j[i + 1];
while(temp >= 0 && sort_j[i].d - dx_max - (pj->h * kernel_gamma) < sort_i[temp].d - rshift)
temp--;
max_index_j[i] = temp;
}
}
*init_pi = first_pi;
......@@ -562,6 +577,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
TIMER_TIC;
//static int intCount = 0;
/* Get the cutoff shift. */
double rshift = 0.0;
for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
......@@ -651,47 +668,27 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
}
int first_pi, last_pj;
float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE)));
float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE)));
int *max_index_i __attribute__((aligned(sizeof(int) * VEC_SIZE)));
int *max_index_j __attribute__((aligned(sizeof(int) * VEC_SIZE)));
max_di = r->ci_cache.max_d;
max_dj = r->cj_cache.max_d;
max_index_i = r->ci_cache.max_d;
max_index_j = r->cj_cache.max_d;
/* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
/* Also find the first pi that interacts with any particle in cj and the last
* pj that interacts with any particle in ci. */
populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max,
hj_max, di_max, dj_min, max_di, max_dj, &first_pi,
hj_max, di_max, dj_min, max_index_i, max_index_j, &first_pi,
&last_pj, e);
/* Find the maximum index into cj that is required by a particle in ci. */
/* Find the maximum index into ci that is required by a particle in cj. */
float di, dj;
int max_ind_j = count_j - 1;
int max_ind_i = 0;
dj = sort_j[max_ind_j].d;
while (max_ind_j > 0 && max_di[count_i - 1] < dj) {
max_ind_j--;
dj = sort_j[max_ind_j].d;
}
di = sort_i[max_ind_i].d;
while (max_ind_i < count_i - 1 && max_dj[0] > di) {
max_ind_i++;
di = sort_i[max_ind_i].d;
}
/* Limits of the outer loops. */
int first_pi_loop = first_pi;
int last_pj_loop = last_pj;
/* Take the max/min of both values calculated to work out how many particles
* to read into the cache. */
last_pj = max(last_pj, max_ind_j);
first_pi = min(first_pi, max_ind_i);
last_pj = max(last_pj, max_index_i[count_i - 1]);
first_pi = min(first_pi, max_index_j[0]);
/* Read the needed particles into the two caches. */
int first_pi_align = first_pi;
......@@ -706,7 +703,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
if (cell_is_active(ci, e)) {
/* Loop over the parts in ci until nothing is within range in cj. */
for (int pid = count_i - 1; pid >= first_pi_loop && max_ind_j >= 0; pid--) {
//for (int pid = count_i - 1; pid >= first_pi_loop && max_index_i[pid] >= 0; pid--) {
for (int pid = count_i - 1; pid >= first_pi_loop; pid--) {
/* Get a hold of the ith part in ci. */
struct part *restrict pi = &parts_i[sort_i[pid].i];
......@@ -722,13 +720,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
if (di_test < dj_min) continue;
/* Determine the exit iteration of the interaction loop. */
dj = sort_j[max_ind_j].d;
while (max_ind_j > 0 && max_di[pid] < dj) {
max_ind_j--;
dj = sort_j[max_ind_j].d;
}
int exit_iteration = max_ind_j + 1;
int exit_iteration = max_index_i[pid];
const float hig2 = hi * hi * kernel_gamma2;
......@@ -812,6 +804,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
/* Form integer mask. */
doi_mask = vec_form_int_mask(v_doi_mask);
//intCount += __builtin_popcount(doi_mask);
/* If there are any interactions perform them. */
if (doi_mask)
runner_iact_nonsym_1_vec_density(
......@@ -840,7 +834,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
if (cell_is_active(cj, e)) {
/* Loop over the parts in cj until nothing is within range in ci. */
for (int pjd = 0; pjd <= last_pj_loop && max_ind_i < count_i; pjd++) {
//for (int pjd = 0; pjd <= last_pj_loop && max_index_j[pjd] < count_i; pjd++) {
for (int pjd = 0; pjd <= last_pj_loop; pjd++) {
/* Get a hold of the jth part in cj. */
struct part *restrict pj = &parts_j[sort_j[pjd].i];
......@@ -857,13 +852,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
if (dj_test > di_max) continue;
/* Determine the exit iteration of the interaction loop. */
di = sort_i[max_ind_i].d;
while (max_ind_i < count_i - 1 && max_dj[pjd] > di) {
max_ind_i++;
di = sort_i[max_ind_i].d;
}
int exit_iteration = max_ind_i;
int exit_iteration = max_index_j[pjd];
const float hjg2 = hj * hj * kernel_gamma2;
......@@ -946,6 +935,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
/* Form integer mask. */
doj_mask = vec_form_int_mask(v_doj_mask);
//intCount += __builtin_popcount(doj_mask);
/* If there are any interactions perform them. */
if (doj_mask)
runner_iact_nonsym_1_vec_density(
......
......@@ -517,7 +517,7 @@ int main(int argc, char *argv[]) {
const int half_dim = (dim - 1) / 2;
/* Test the periodic boundary conditions for each of the 8 corners. */
/* Test the periodic boundary conditions for each of the 8 corners. Interact each corner with all of its 26 neighbours.*/
test_boundary_conditions(cells, runner, 0, 0, 0, dim, swiftOutputFileName,
bruteForceOutputFileName);
test_boundary_conditions(cells, runner, dim - 1, 0, 0, dim,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment