Commit 8a4a38b0 authored by Matthieu Schaller's avatar Matthieu Schaller
Browse files

Merge branch 'fix_pairs' into 'master'

Patch vectorised pair interactions

Implements small optimisations to `runner_dopair1_density_vec` suggested in #354:
 - Break out of the loop checking for active particles if the cell itself is inactive
 - Do the calculation in the frame of `cj` and not `ci` since this is where the shift vector brings us.
 - Correctly apply the rshift value on the axis linking cells.

See merge request !407
parents bbca0e51 9b69b779
......@@ -327,9 +327,15 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
const struct part *restrict parts_i = ci->parts;
const struct part *restrict parts_j = cj->parts;
double loc[3];
loc[0] = ci->loc[0];
loc[1] = ci->loc[1];
loc[2] = ci->loc[2];
loc[0] = cj->loc[0];
loc[1] = cj->loc[1];
loc[2] = cj->loc[2];
/* Shift ci particles for boundary conditions and location of cell.*/
double total_ci_shift[3];
total_ci_shift[0] = loc[0] + shift[0];
total_ci_shift[1] = loc[1] + shift[1];
total_ci_shift[2] = loc[2] + shift[2];
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
......@@ -351,9 +357,9 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
* due to BCs but leave cell cj. */
for (int i = 0; i < ci_cache_count; i++) {
idx = sort_i[i + first_pi_align].i;
x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]);
y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]);
z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]);
x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
h[i] = parts_i[idx].h;
m[i] = parts_i[idx].mass;
......@@ -362,6 +368,31 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
vz[i] = parts_i[idx].v[2];
}
#ifdef SWIFT_DEBUG_CHECKS
const float shift_threshold_x = 4. * ci->width[0] * (1. + 2.*space_maxreldx);
const float shift_threshold_y = 4. * ci->width[1] * (1. + 2.*space_maxreldx);
const float shift_threshold_z = 4. * ci->width[2] * (1. + 2.*space_maxreldx);
/* Make sure that particle positions have been shifted correctly. */
for (int i = 0; i < ci_cache_count; i++) {
if (x[i] > shift_threshold_x || x[i] < -shift_threshold_x)
error(
"Error: ci->loc[%lf,%lf,%lf],cj->loc[%lf,%lf,%lf] Particle %d x pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + 2*space_maxreldx)]. x=%f, ci->width[0]=%f", ci->loc[0], ci->loc[1], ci->loc[2],
loc[0], loc[1], loc[2], i, x[i], ci->width[0]);
if (y[i] > shift_threshold_y || y[i] < -shift_threshold_y)
error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d y pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + 2*space_maxreldx)]. y=%f, ci->width[1]=%f", ci->loc[0], ci->loc[1], ci->loc[2],
loc[0], loc[1], loc[2], i, y[i], ci->width[1]);
if (z[i] > shift_threshold_z || z[i] < -shift_threshold_z)
error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d z pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + 2*space_maxreldx)]. z=%f, ci->width[2]=%f", ci->loc[0], ci->loc[1], ci->loc[2],
loc[0], loc[1], loc[2], i, z[i], ci->width[2]);
}
#endif
/* Pad cache with fake particles that exist outside the cell so will not
* interact.*/
float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0];
......@@ -402,6 +433,27 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
vzj[i] = parts_j[idx].v[2];
}
#ifdef SWIFT_DEBUG_CHECKS
/* Make sure that particle positions have been shifted correctly. */
for (int i = 0; i <= last_pj_align; i++) {
if (xj[i] > shift_threshold_x || xj[i] < -shift_threshold_x)
error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d xj pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + 2*space_maxreldx)]. xj=%f, ci->width[0]=%f", ci->loc[0], ci->loc[1], ci->loc[2],
loc[0], loc[1], loc[2], i, xj[i], ci->width[0]);
if (yj[i] > shift_threshold_y || yj[i] < -shift_threshold_y)
error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d yj pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + 2*space_maxreldx)]. yj=%f, ci->width[1]=%f", ci->loc[0], ci->loc[1], ci->loc[2],
loc[0], loc[1], loc[2], i, yj[i], ci->width[1]);
if (zj[i] > shift_threshold_z || zj[i] < -shift_threshold_z)
error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d zj pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + 2*space_maxreldx)]. zj=%f, ci->width[2]=%f", ci->loc[0], ci->loc[1], ci->loc[2],
loc[0], loc[1], loc[2], i, zj[i], ci->width[2]);
}
#endif
/* Pad cache with fake particles that exist outside the cell so will not
* interact.*/
float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
......
......@@ -285,11 +285,10 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
temp = 0;
const struct part *pi = &parts_i[sort_i[first_pi].i];
const float first_di = sort_i[first_pi].d + pi->h * kernel_gamma + dx_max - rshift;
/* Loop through particles in cell j until they are not in range of pi. */
while (temp <= cj->count &&
(sort_i[first_pi].d + (pi->h * kernel_gamma + dx_max - rshift) >
sort_j[temp].d))
while (temp < cj->count && first_di > sort_j[temp].d)
temp++;
max_index_i[first_pi] = temp;
......@@ -298,10 +297,10 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
for (int i = first_pi + 1; i < ci->count; i++) {
temp = max_index_i[i - 1];
pi = &parts_i[sort_i[i].i];
const float di = sort_i[i].d + pi->h * kernel_gamma + dx_max - rshift;
while (temp <= cj->count &&
(sort_i[i].d + (pi->h * kernel_gamma + dx_max - rshift) >
sort_j[temp].d))
while (temp < cj->count && di > sort_j[temp].d)
temp++;
max_index_i[i] = temp;
......@@ -333,11 +332,10 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
temp = ci->count - 1;
const struct part *pj = &parts_j[sort_j[last_pj].i];
const float last_dj = sort_j[last_pj].d - dx_max - pj->h * kernel_gamma + rshift;
/* Loop through particles in cell i until they are not in range of pj. */
while (temp > 0 &&
sort_j[last_pj].d - dx_max - (pj->h * kernel_gamma) <
sort_i[temp].d - rshift)
while (temp > 0 && last_dj < sort_i[temp].d)
temp--;
max_index_j[last_pj] = temp;
......@@ -346,10 +344,9 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
for (int i = last_pj - 1; i >= 0; i--) {
temp = max_index_j[i + 1];
pj = &parts_j[sort_j[i].i];
const float dj = sort_j[i].d - dx_max - (pj->h * kernel_gamma) + rshift;
while (temp > 0 &&
sort_j[i].d - dx_max - (pj->h * kernel_gamma) <
sort_i[temp].d - rshift)
while (temp > 0 && dj < sort_i[temp].d)
temp--;
max_index_j[i] = temp;
......@@ -855,19 +852,24 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
const double di_max = sort_i[count_i - 1].d - rshift;
const double dj_min = sort_j[0].d;
const float dx_max = (ci->dx_max_sort + cj->dx_max_sort);
const int active_ci = cell_is_active(ci, e);
const int active_cj = cell_is_active(cj, e);
/* Check if any particles are active and return if there are not. */
int numActive = 0;
for (int pid = count_i - 1;
pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
struct part *restrict pi = &parts_i[sort_i[pid].i];
if (part_is_active_no_debug(pi, max_active_bin)) {
numActive++;
break;
if (active_ci) {
for (int pid = count_i - 1;
pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
struct part *restrict pi = &parts_i[sort_i[pid].i];
if (part_is_active(pi, e)) {
numActive++;
break;
}
}
}
if (!numActive) {
if (!numActive && active_cj) {
for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
pjd++) {
struct part *restrict pj = &parts_j[sort_j[pjd].i];
......@@ -926,7 +928,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
/* Get the number of particles read into the ci cache. */
int ci_cache_count = count_i - first_pi_align;
if (cell_is_active(ci, e)) {
if (active_ci) {
/* Loop over the parts in ci until nothing is within range in cj. */
for (int pid = count_i - 1; pid >= first_pi_loop; pid--) {
......@@ -1056,7 +1058,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
} /* loop over the parts in ci. */
}
if (cell_is_active(cj, e)) {
if (active_cj) {
/* Loop over the parts in cj until nothing is within range in ci. */
for (int pjd = 0; pjd <= last_pj_loop; pjd++) {
......@@ -1068,11 +1070,9 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
/* Set the cache index. */
int cj_cache_idx = pjd;
/*TODO: rshift term. */
/* Skip this particle if no particle in ci is within range of it. */
const float hj = cj_cache->h[cj_cache_idx];
const double dj_test =
sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
const double dj_test = sort_j[pjd].d - hj * kernel_gamma - dx_max;
if (dj_test > di_max) continue;
/* Determine the exit iteration of the interaction loop. */
......@@ -1188,8 +1188,9 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
} /* loop over the parts in cj. */
TIMER_TOC(timer_dopair_density);
}
TIMER_TOC(timer_dopair_density);
#endif /* WITH_VECTORIZATION */
}
......@@ -272,6 +272,8 @@ struct cell *make_cell(size_t n, const double offset[3], double size, double h,
bzero(cell->parts, count * sizeof(struct part));
bzero(cell->xparts, count * sizeof(struct xpart));
float h_max = 0.f;
/* Construct the parts */
struct part *part = cell->parts;
struct xpart *xpart = cell->xparts;
......@@ -288,6 +290,7 @@ struct cell *make_cell(size_t n, const double offset[3], double size, double h,
offset[2] +
size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n;
part->h = size * h / (float)n;
h_max = fmax(h_max, part->h);
#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH)
part->conserved.mass = density * volume / count;
......@@ -333,7 +336,7 @@ struct cell *make_cell(size_t n, const double offset[3], double size, double h,
/* Cell properties */
cell->split = 0;
cell->h_max = h;
cell->h_max = h_max;
cell->count = count;
cell->gcount = 0;
cell->dx_max_part = 0.;
......
......@@ -290,7 +290,7 @@ void test_boundary_conditions(struct cell **cells, struct runner runner,
struct cell *main_cell = cells[loc_i * (dim * dim) + loc_j * dim + loc_k];
/* Zero the fields */
for (int j = 0; j < 512; ++j) zero_particle_fields(cells[j]);
for (int j = 0; j < dim * dim * dim; ++j) zero_particle_fields(cells[j]);
/* Run all the pairs */
#if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION))
......@@ -337,7 +337,7 @@ void test_boundary_conditions(struct cell **cells, struct runner runner,
/* Now perform a brute-force version for accuracy tests */
/* Zero the fields */
for (int i = 0; i < 512; ++i) zero_particle_fields(cells[i]);
for (int i = 0; i < dim * dim * dim; ++i) zero_particle_fields(cells[i]);
#if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION))
......@@ -467,11 +467,12 @@ int main(int argc, char *argv[]) {
printf("\n");
/* Build the infrastructure */
const int dim = 8;
struct space space;
space.periodic = 1;
space.dim[0] = 8.;
space.dim[1] = 8.;
space.dim[2] = 8.;
space.dim[0] = dim;
space.dim[1] = dim;
space.dim[2] = dim;
struct hydro_props hp;
hp.h_max = FLT_MAX;
......@@ -487,8 +488,7 @@ int main(int argc, char *argv[]) {
runner.e = &engine;
/* Construct some cells */
struct cell *cells[512];
const int dim = 8;
struct cell *cells[dim * dim * dim];
static long long partId = 0;
for (int i = 0; i < dim; ++i) {
for (int j = 0; j < dim; ++j) {
......@@ -581,7 +581,7 @@ int main(int argc, char *argv[]) {
swiftOutputFileName, bruteForceOutputFileName);
/* Clean things to make the sanitizer happy ... */
for (int i = 0; i < 512; ++i) clean_up(cells[i]);
for (int i = 0; i < dim * dim * dim; ++i) clean_up(cells[i]);
return 0;
}
# ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz
0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2.4e-6 1e-4 5e-4 1.2e-2 1e-5 3e-6 3e-6 8e-6
0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2.4e-6 1e-4 5e-4 1.2e-2 1.1e-5 3e-6 3e-6 8e-6
0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-6 1.4e-2 1e-5 2e-3 2.5e-4 3e-3 3e-3 3e-3
0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6
# ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz
0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 3e-6 1e-4 5e-4 1.5e-2 1.4e-5 3e-6 3e-6 9e-6
0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.5e-6 1.57e-2 1e-5 4.74e-3 3.89e-4 3e-3 3e-3 3e-3
0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 3e-6 1e-4 5e-4 1.5e-2 1.4e-5 3e-6 3e-6 1e-5
0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.5e-6 1.57e-2 1e-5 5.86e-3 4.96e-4 3e-3 3e-3 3e-3
0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment