diff --git a/examples/EAGLE_25/eagle_25.yml b/examples/EAGLE_25/eagle_25.yml index 5dee9dad0b5d7f694c61fa4c983ead0f1cd6e5e2..f575254301fdd315e7576d7ecbbfbf8f94d7f0c3 100644 --- a/examples/EAGLE_25/eagle_25.yml +++ b/examples/EAGLE_25/eagle_25.yml @@ -13,6 +13,9 @@ TimeIntegration: dt_min: 1e-10 # The minimal time-step size of the simulation (in internal units). dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). +Scheduler: + cell_split_size: 64 + # Parameters governing the snapshots Snapshots: basename: eagle # Common part of the name of output files diff --git a/src/align.h b/src/align.h index 915af33e6e2ba59be1a0849c4de0e2f1bd5b0d96..e1ad706d598eb69fd191ffbccd7be356411ce062 100644 --- a/src/align.h +++ b/src/align.h @@ -23,9 +23,52 @@ * @brief The default struct alignment in SWIFT. */ #define SWIFT_STRUCT_ALIGNMENT 32 + /** * @brief Defines alignment of structures */ #define SWIFT_STRUCT_ALIGN __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT))) +/** + * @brief The default cache alignment in SWIFT. + */ +#define SWIFT_CACHE_ALIGNMENT 64 + +/** + * @brief Defines alignment of caches + */ +#define SWIFT_CACHE_ALIGN __attribute__((aligned(SWIFT_CACHE_ALIGNMENT))) + +/** + * @brief Macro to tell the compiler that a given array has the specified alignment. + * + * Note that this turns into a no-op but gives information to the compiler. + * + * @param array The array. + * @param alignment The alignment in bytes of the array. + */ +#if defined(__ICC) +#define swift_align_information(array, alignment) \ + __assume_aligned(array, alignment); +#elif defined(__GNUC__) +#define swift_align_information(array, alignment) \ + array = __builtin_assume_aligned(array, alignment); +#endif + +/** + * @brief Macro to tell the compiler that a given number is 0 modulo a given size. + * + * Note that this turns into a no-op but gives information to the compiler. + * GCC does not have the equivalent built-in so defaults to nothing. + * + * @param var The variable + * @param size The modulo of interest. + */ +#if defined(__ICC) +#define swift_assume_size(var, size) \ + __assume(var % size == 0); +#else +#define swift_assume_size(var, size) ; +#endif + #endif /* SWIFT_ALIGN_H */ diff --git a/src/engine.c b/src/engine.c index 2adddee54efcb21fc4b9b5db116149e519e63b8f..c158e36a81f630c5c1244409f260ce9a68307e3d 100644 --- a/src/engine.c +++ b/src/engine.c @@ -4543,6 +4543,10 @@ void engine_init(struct engine *e, struct space *s, e->runners[k].cj_cache.count = 0; cache_init(&e->runners[k].ci_cache, CACHE_SIZE); cache_init(&e->runners[k].cj_cache, CACHE_SIZE); + e->runners[k].ci_gravity_cache.count = 0; + e->runners[k].cj_gravity_cache.count = 0; + gravity_cache_init(&e->runners[k].ci_gravity_cache, space_splitsize); + gravity_cache_init(&e->runners[k].cj_gravity_cache, space_splitsize); #endif if (verbose) { @@ -4629,8 +4633,12 @@ void engine_compute_next_snapshot_time(struct engine *e) { void engine_clean(struct engine *e) { #ifdef WITH_VECTORIZATION - for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].ci_cache); - for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].cj_cache); + for (int i = 0; i < e->nr_threads; ++i) { + cache_clean(&e->runners[i].ci_cache); + cache_clean(&e->runners[i].cj_cache); + gravity_cache_clean(&e->runners[i].ci_gravity_cache); + gravity_cache_clean(&e->runners[i].cj_gravity_cache); + } #endif free(e->runners); free(e->snapshotUnits); diff --git a/src/gravity/Default/gravity_iact.h b/src/gravity/Default/gravity_iact.h index d4a95540de17631ad445075d672d03a1236e34e3..4d0c3f4c4d58bc2183f6244428bfaa5fb3ea2c2c 100644 --- a/src/gravity/Default/gravity_iact.h +++ b/src/gravity/Default/gravity_iact.h @@ -249,4 +249,5 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pp_nonsym( gpi->a_grav[2] -= fdx[2]; } + #endif /* SWIFT_DEFAULT_GRAVITY_IACT_H */ diff --git a/src/runner.h b/src/runner.h index facadf1608fb7e06af952eedbf1151fa68530bef..aae7f91d8e4bacd0e97a5c70b1c297a156166c03 100644 --- a/src/runner.h +++ b/src/runner.h @@ -27,6 +27,7 @@ #include "../config.h" /* Includes. */ +#include "gravity_cache.h" #include "cache.h" struct cell; @@ -50,6 +51,12 @@ struct runner { struct engine *e; #ifdef WITH_VECTORIZATION + /*! The particle gravity_cache of cell ci. */ + struct gravity_cache ci_gravity_cache; + + /*! The particle gravity_cache of cell cj. */ + struct gravity_cache cj_gravity_cache; + /*! The particle cache of cell ci. */ struct cache ci_cache; diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h index 0724f24c2745ab015bd79fdc0a3d7830bd8d76e0..c5795d5a8e5dea42ebfbae3bf91d2e8ec6768f4a 100644 --- a/src/runner_doiact_grav.h +++ b/src/runner_doiact_grav.h @@ -162,6 +162,224 @@ void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci, void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci, struct cell *cj, double shift[3]) { + /* Some constants */ + const struct engine *const e = r->e; + struct gravity_cache *const ci_cache = &r->ci_gravity_cache; + struct gravity_cache *const cj_cache = &r->cj_gravity_cache; + + /* Cell properties */ + const int gcount_i = ci->gcount; + const int gcount_j = cj->gcount; + struct gpart *restrict gparts_i = ci->gparts; + struct gpart *restrict gparts_j = cj->gparts; + const int ci_active = cell_is_active(ci, e); + const int cj_active = cell_is_active(cj, e); + + /* Anything to do here ?*/ + if (!ci_active && !cj_active) return; + + /* Check that we fit in cache */ + if(gcount_i > ci_cache->count || gcount_j > cj_cache->count) + error("Not enough space in the caches! gcount_i=%d gcount_j=%d", + gcount_i, gcount_j); + + /* Computed the padded counts */ + const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE; + const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE; + + /* Fill the caches */ + gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i, ci_active, shift); + gravity_cache_populate_no_shift(cj_cache, gparts_j, gcount_j, gcount_padded_j, cj_active); + + /* Ok... Here we go ! */ + + if(ci_active) { + + /* Loop over all particles in ci... */ + for (int pid = 0; pid < gcount_i; pid++) { + + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + + /* Some powers of the softening length */ + const float h_i = ci_cache->epsilon[pid]; + const float h2_i = h_i * h_i; + const float h_inv_i = 1.f / h_i; + const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_j, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pjd = 0; pjd < gcount_padded_j; pjd++) { + + /* Get info about j */ + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + const float mass_j = cj_cache->m[pjd]; + + /* Compute the pairwise (square) distance. */ + const float dx = x_i - x_j; + const float dy = y_i - y_j; + const float dz = z_i - z_j; + const float r2 = dx*dx + dy*dy + dz*dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + + float f_ij, W_ij; + + if (r2 >= h2_i) { + + /* Get Newtonian gravity */ + f_ij = mass_j * r_inv * r_inv * r_inv; + + } else { + + const float r = r2 * r_inv; + const float ui = r * h_inv_i; + + kernel_grav_eval(ui, &W_ij); + + /* Get softened gravity */ + f_ij = mass_j * h_inv3_i * W_ij; + } + + /* Store it back */ + a_x -= f_ij * dx; + a_y -= f_ij * dy; + a_z -= f_ij * dz; + +#ifdef SWIFT_DEBUG_CHECKS + gparts_i[pid].num_interacted++; +#endif + + } + + /* Store everything back in cache */ + ci_cache->a_x[pid] += a_x; + ci_cache->a_y[pid] += a_y; + ci_cache->a_z[pid] += a_z; + } + } + + /* Now do the opposite loop */ + if(cj_active) { + + /* Loop over all particles in ci... */ + for (int pjd = 0; pjd < gcount_j; pjd++) { + + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + + /* Some powers of the softening length */ + const float h_j = cj_cache->epsilon[pjd]; + const float h2_j = h_j * h_j; + const float h_inv_j = 1.f / h_j; + const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_i, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pid = 0; pid < gcount_padded_i; pid++) { + + /* Get info about j */ + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + const float mass_i = ci_cache->m[pid]; + + /* Compute the pairwise (square) distance. */ + const float dx = x_j - x_i; + const float dy = y_j - y_i; + const float dz = z_j - z_i; + const float r2 = dx*dx + dy*dy + dz*dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); + if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + + float f_ji, W_ji; + + if (r2 >= h2_j) { + + /* Get Newtonian gravity */ + f_ji = mass_i * r_inv * r_inv * r_inv; + + } else { + + const float r = r2 * r_inv; + const float uj = r * h_inv_j; + + kernel_grav_eval(uj, &W_ji); + + /* Get softened gravity */ + f_ji = mass_i * h_inv3_j * W_ji; + } + + /* Store it back */ + a_x -= f_ji * dx; + a_y -= f_ji * dy; + a_z -= f_ji * dz; + +#ifdef SWIFT_DEBUG_CHECKS + gparts_j[pjd].num_interacted++; +#endif + + } + + /* Store everything back in cache */ + cj_cache->a_x[pjd] += a_x; + cj_cache->a_y[pjd] += a_y; + cj_cache->a_z[pjd] += a_z; + } + } + + /* Write back to the particles */ + if(ci_active) + gravity_cache_write_back(ci_cache, gparts_i, gcount_i); + if(cj_active) + gravity_cache_write_back(cj_cache, gparts_j, gcount_j); + +#ifdef MATTHIEU_OLD_STUFF + /* Some constants */ const struct engine *const e = r->e; @@ -258,6 +476,7 @@ void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci, } } } +#endif } /** @@ -273,6 +492,241 @@ void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci, void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci, struct cell *cj, double shift[3]) { + + /* Some constants */ + const struct engine *const e = r->e; + const struct space *s = e->s; + const double cell_width = s->width[0]; + const double a_smooth = e->gravity_properties->a_smooth; + const double rlr = cell_width * a_smooth; + const float rlr_inv = 1. / rlr; + + /* Caches to play with */ + struct gravity_cache *const ci_cache = &r->ci_gravity_cache; + struct gravity_cache *const cj_cache = &r->cj_gravity_cache; + + /* Cell properties */ + const int gcount_i = ci->gcount; + const int gcount_j = cj->gcount; + struct gpart *restrict gparts_i = ci->gparts; + struct gpart *restrict gparts_j = cj->gparts; + const int ci_active = cell_is_active(ci, e); + const int cj_active = cell_is_active(cj, e); + + /* Anything to do here ?*/ + if (!ci_active && !cj_active) return; + + /* Check that we fit in cache */ + if(gcount_i > ci_cache->count || gcount_j > cj_cache->count) + error("Not enough space in the caches! gcount_i=%d gcount_j=%d", + gcount_i, gcount_j); + + /* Computed the padded counts */ + const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE; + const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE; + + /* Fill the caches */ + gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i, ci_active, shift); + gravity_cache_populate_no_shift(cj_cache, gparts_j, gcount_j, gcount_padded_j, cj_active); + + /* Ok... Here we go ! */ + + if(ci_active) { + + /* Loop over all particles in ci... */ + for (int pid = 0; pid < gcount_i; pid++) { + + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + + /* Some powers of the softening length */ + const float h_i = ci_cache->epsilon[pid]; + const float h2_i = h_i * h_i; + const float h_inv_i = 1.f / h_i; + const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_j, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pjd = 0; pjd < gcount_padded_j; pjd++) { + + /* Get info about j */ + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + const float mass_j = cj_cache->m[pjd]; + + /* Compute the pairwise (square) distance. */ + const float dx = x_i - x_j; + const float dy = y_i - y_j; + const float dz = z_i - z_j; + const float r2 = dx*dx + dy*dy + dz*dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + const float r = r2 * r_inv; + + float f_ij, W_ij, corr_lr; + + if (r2 >= h2_i) { + + /* Get Newtonian gravity */ + f_ij = mass_j * r_inv * r_inv * r_inv; + + } else { + + const float ui = r * h_inv_i; + + kernel_grav_eval(ui, &W_ij); + + /* Get softened gravity */ + f_ij = mass_j * h_inv3_i * W_ij; + } + + /* Get long-range correction */ + const float u_lr = r * rlr_inv; + kernel_long_grav_eval(u_lr, &corr_lr); + f_ij *= corr_lr; + + /* Store it back */ + a_x -= f_ij * dx; + a_y -= f_ij * dy; + a_z -= f_ij * dz; + +#ifdef SWIFT_DEBUG_CHECKS + gparts_i[pid].num_interacted++; +#endif + + } + + /* Store everything back in cache */ + ci_cache->a_x[pid] += a_x; + ci_cache->a_y[pid] += a_y; + ci_cache->a_z[pid] += a_z; + } + } + + /* Now do the opposite loop */ + if(cj_active) { + + /* Loop over all particles in ci... */ + for (int pjd = 0; pjd < gcount_j; pjd++) { + + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + + /* Some powers of the softening length */ + const float h_j = cj_cache->epsilon[pjd]; + const float h2_j = h_j * h_j; + const float h_inv_j = 1.f / h_j; + const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_i, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pid = 0; pid < gcount_padded_i; pid++) { + + /* Get info about j */ + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + const float mass_i = ci_cache->m[pid]; + + /* Compute the pairwise (square) distance. */ + const float dx = x_j - x_i; + const float dy = y_j - y_i; + const float dz = z_j - z_i; + const float r2 = dx*dx + dy*dy + dz*dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); + if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + const float r = r2 * r_inv; + + float f_ji, W_ji, corr_lr; + + if (r2 >= h2_j) { + + /* Get Newtonian gravity */ + f_ji = mass_i * r_inv * r_inv * r_inv; + + } else { + + const float uj = r * h_inv_j; + + kernel_grav_eval(uj, &W_ji); + + /* Get softened gravity */ + f_ji = mass_i * h_inv3_j * W_ji; + } + + /* Get long-range correction */ + const float u_lr = r * rlr_inv; + kernel_long_grav_eval(u_lr, &corr_lr); + f_ji *= corr_lr; + + /* Store it back */ + a_x -= f_ji * dx; + a_y -= f_ji * dy; + a_z -= f_ji * dz; + +#ifdef SWIFT_DEBUG_CHECKS + gparts_j[pjd].num_interacted++; +#endif + + } + + /* Store everything back in cache */ + cj_cache->a_x[pjd] += a_x; + cj_cache->a_y[pjd] += a_y; + cj_cache->a_z[pjd] += a_z; + } + } + + /* Write back to the particles */ + if(ci_active) + gravity_cache_write_back(ci_cache, gparts_i, gcount_i); + if(cj_active) + gravity_cache_write_back(cj_cache, gparts_j, gcount_j); + +#ifdef MATTHIEU_OLD_STUFF /* Some constants */ const struct engine *const e = r->e; const struct space *s = e->s; @@ -374,6 +828,8 @@ void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci, } } } + +#endif } /** diff --git a/src/vector.h b/src/vector.h index 7d82b9f5c9e2bc6e1fea9426ab3870eeec180408..6d5389039c028e4307d4e559dd6567cce6c25392 100644 --- a/src/vector.h +++ b/src/vector.h @@ -23,8 +23,12 @@ /* Have I already read this file? */ #ifndef VEC_MACRO +/* Config parameters. */ #include "../config.h" +/* Local headers */ +#include "inline.h" + #ifdef WITH_VECTORIZATION /* Need to check whether compiler supports this (IBM does not)