diff --git a/.gitignore b/.gitignore index 0c4d8bddadb17afcc29aa3dc1d4dba6cb6fd2f5b..e3e17bb82a01d9af0ace6ed72d196cf2dba242f6 100644 --- a/.gitignore +++ b/.gitignore @@ -34,7 +34,7 @@ examples/*/*/*.txt examples/*/*/used_parameters.yml examples/*/gravity_checks_*.dat -tests/testPair +tests/testActivePair tests/brute_force_periodic_BC_standard.dat tests/swift_periodic_BC_standard.dat tests/brute_force_periodic_BC_pertrubed.dat @@ -54,6 +54,11 @@ tests/brute_force_125_standard.dat tests/swift_dopair_125_standard.dat tests/brute_force_125_perturbed.dat tests/swift_dopair_125_perturbed.dat +tests/brute_force_active.dat +tests/brute_force_periodic_BC_perturbed.dat +tests/swift_dopair_active.dat +tests/test_nonsym_density_serial.dat +tests/test_nonsym_density_vec.dat tests/testGreetings tests/testReading tests/input.hdf5 @@ -75,8 +80,6 @@ tests/test27cells.sh tests/test27cellsPerturbed.sh tests/test125cells.sh tests/test125cellsPerturbed.sh -tests/testPair.sh -tests/testPairPerturbed.sh tests/testParser.sh tests/testReading.sh tests/testAdiabaticIndex diff --git a/configure.ac b/configure.ac index 95519863f6e303cde51bbef7e2046faa314f46ee..74fede99f4fbf578af4e703cedaa42f2c278b037 100644 --- a/configure.ac +++ b/configure.ac @@ -861,14 +861,14 @@ AM_CONDITIONAL([HAVE_DOXYGEN], [test "$ac_cv_path_ac_pt_DX_DOXYGEN" != ""]) # Handle .in files. AC_CONFIG_FILES([Makefile src/Makefile examples/Makefile doc/Makefile doc/Doxyfile tests/Makefile]) AC_CONFIG_FILES([tests/testReading.sh], [chmod +x tests/testReading.sh]) -AC_CONFIG_FILES([tests/testPair.sh], [chmod +x tests/testPair.sh]) -AC_CONFIG_FILES([tests/testPairPerturbed.sh], [chmod +x tests/testPairPerturbed.sh]) +AC_CONFIG_FILES([tests/testActivePair.sh], [chmod +x tests/testActivePair.sh]) AC_CONFIG_FILES([tests/test27cells.sh], [chmod +x tests/test27cells.sh]) AC_CONFIG_FILES([tests/test27cellsPerturbed.sh], [chmod +x tests/test27cellsPerturbed.sh]) AC_CONFIG_FILES([tests/test125cells.sh], [chmod +x tests/test125cells.sh]) AC_CONFIG_FILES([tests/test125cellsPerturbed.sh], [chmod +x tests/test125cellsPerturbed.sh]) AC_CONFIG_FILES([tests/testPeriodicBC.sh], [chmod +x tests/testPeriodicBC.sh]) AC_CONFIG_FILES([tests/testPeriodicBCPerturbed.sh], [chmod +x tests/testPeriodicBCPerturbed.sh]) +AC_CONFIG_FILES([tests/testInteractions.sh], [chmod +x tests/testInteractions.sh]) AC_CONFIG_FILES([tests/testParser.sh], [chmod +x tests/testParser.sh]) # Save the compilation options diff --git a/examples/EAGLE_12/eagle_12.yml b/examples/EAGLE_12/eagle_12.yml index fd9569c50f4ad0d87eccd54e34636180c0337596..f56c330590ac25cc5b3fe8f68ed68aa1e94d6490 100644 --- a/examples/EAGLE_12/eagle_12.yml +++ b/examples/EAGLE_12/eagle_12.yml @@ -26,7 +26,7 @@ Statistics: # Parameters for the self-gravity scheme Gravity: eta: 0.025 # Constant dimensionless multiplier for time integration. - epsilon: 0.0001 # Softening length (in internal units). + epsilon: 0.001 # Softening length (in internal units). theta: 0.7 # Opening angle (Multipole acceptance criterion) # Parameters for the hydrodynamics scheme diff --git a/examples/EAGLE_6/eagle_6.yml b/examples/EAGLE_6/eagle_6.yml index a8ba818a8bb278e783454e7c3cd2890f4e1faf89..f55ecc856953d4cb60a86e3461625318a1757693 100644 --- a/examples/EAGLE_6/eagle_6.yml +++ b/examples/EAGLE_6/eagle_6.yml @@ -12,6 +12,9 @@ TimeIntegration: time_end: 1e-2 # The end time of the simulation (in internal units). dt_min: 1e-10 # The minimal time-step size of the simulation (in internal units). dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). + +Scheduler: + cell_split_size: 64 # Parameters governing the snapshots Snapshots: diff --git a/examples/PerturbedBox_2D/perturbedPlane.yml b/examples/PerturbedBox_2D/perturbedPlane.yml index b92e29f620edc6f72399111fbe73ba6bd1485e92..a0c6b6d9dbc7a677002dbce5abc6e5d268b56e97 100644 --- a/examples/PerturbedBox_2D/perturbedPlane.yml +++ b/examples/PerturbedBox_2D/perturbedPlane.yml @@ -9,7 +9,7 @@ InternalUnitSystem: # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). - time_end: 10. # The end time of the simulation (in internal units). + time_end: 1000. # The end time of the simulation (in internal units). dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). @@ -21,12 +21,11 @@ Snapshots: # Parameters governing the conserved quantities statistics Statistics: - delta_time: 1e-3 # Time between statistics output + delta_time: 1. # Time between statistics output # Parameters for the hydrodynamics scheme SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). - delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. # Parameters related to the initial conditions diff --git a/examples/PerturbedBox_3D/perturbedBox.yml b/examples/PerturbedBox_3D/perturbedBox.yml index 71c8dece4df5505eb44511ee92291feedd7ffab1..3148510979d0e349c0d6242bf11e1a0db94f9e1f 100644 --- a/examples/PerturbedBox_3D/perturbedBox.yml +++ b/examples/PerturbedBox_3D/perturbedBox.yml @@ -9,9 +9,9 @@ InternalUnitSystem: # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). - time_end: 1. # The end time of the simulation (in internal units). + time_end: 1000 # The end time of the simulation (in internal units). dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). - dt_max: 1e-3 # The maximal time-step size of the simulation (in internal units). + dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). # Parameters governing the snapshots Snapshots: @@ -21,12 +21,11 @@ Snapshots: # Parameters governing the conserved quantities statistics Statistics: - delta_time: 1e-3 # Time between statistics output + delta_time: 1. # Time between statistics output # Parameters for the hydrodynamics scheme SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). - delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. # Parameters related to the initial conditions diff --git a/examples/main.c b/examples/main.c index c8a1780c5ba5432ff9e713853c578666d1cc3a6f..ee1253062409ec2e787e064a5fb50da2c830d35d 100644 --- a/examples/main.c +++ b/examples/main.c @@ -190,7 +190,11 @@ int main(int argc, char *argv[]) { while ((c = getopt(argc, argv, "acCdDef:FgGhMn:P:sSt:Tv:y:Y:")) != -1) switch (c) { case 'a': +#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA) with_aff = 1; +#else + error("Need NUMA support for thread affinity"); +#endif break; case 'c': with_cosmology = 1; @@ -392,8 +396,12 @@ int main(int argc, char *argv[]) { parser_read_file(paramFileName, params); /* Handle any command-line overrides. */ - if (nparams > 0) + if (nparams > 0) { + message( + "Overwriting values read from the YAML file with command-line " + "values."); for (int k = 0; k < nparams; k++) parser_set_param(params, cmdparams[k]); + } /* And dump the parameters as used. */ // parser_print_params(¶ms); @@ -565,6 +573,11 @@ int main(int argc, char *argv[]) { message("nr of cells at depth %i is %i.", data[0], data[1]); } +/* Initialise the table of Ewald corrections for the gravity checks */ +#ifdef SWIFT_GRAVITY_FORCE_CHECKS + if (periodic) gravity_exact_force_ewald_init(dim[0]); +#endif + /* Initialise the external potential properties */ struct external_potential potential; if (with_external_gravity) diff --git a/src/Makefile.am b/src/Makefile.am index 3784886278544a1c6b606689bfbae0d6fa52cf19..ec01184928faf3d58b2d0890965a745d05718354 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -64,7 +64,7 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h kernel_long_gravity.h vector.h cache.h runner_doiact.h runner_doiact_vec.h runner_doiact_grav.h runner_doiact_fft.h \ runner_doiact_nosort.h units.h intrinsics.h minmax.h kick.h timestep.h drift.h adiabatic_index.h io_properties.h \ dimension.h equation_of_state.h part_type.h periodic.h \ - gravity.h gravity_io.h \ + gravity.h gravity_io.h gravity_cache.h \ gravity/Default/gravity.h gravity/Default/gravity_iact.h gravity/Default/gravity_io.h \ gravity/Default/gravity_debug.h gravity/Default/gravity_part.h \ sourceterms.h \ diff --git a/src/align.h b/src/align.h index 915af33e6e2ba59be1a0849c4de0e2f1bd5b0d96..54435c4c9baa1ce9dc511e2903b7e2be2d6655de 100644 --- a/src/align.h +++ b/src/align.h @@ -23,9 +23,71 @@ * @brief The default struct alignment in SWIFT. */ #define SWIFT_STRUCT_ALIGNMENT 32 + /** * @brief Defines alignment of structures */ #define SWIFT_STRUCT_ALIGN __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT))) +/** + * @brief The default cache alignment in SWIFT. + */ +#define SWIFT_CACHE_ALIGNMENT 64 + +/** + * @brief Defines alignment of caches + */ +#define SWIFT_CACHE_ALIGN __attribute__((aligned(SWIFT_CACHE_ALIGNMENT))) + +/** + * @brief Macro to tell the compiler that a given array has the specified + * alignment. + * + * Note that this turns into a no-op but gives information to the compiler. + * + * @param array The array. + * @param alignment The alignment in bytes of the array. + */ +#if defined(__ICC) +#define swift_align_information(array, alignment) \ + __assume_aligned(array, alignment); +#elif defined(__GNUC__) +#define swift_align_information(array, alignment) \ + array = __builtin_assume_aligned(array, alignment); +#else +#define swift_align_information(array, alignment) ; +#endif + +/** + * @brief Macro to create a restrict pointer to an array and tell the compiler + * that the given array has the specified + * alignment. + * + * Note that this turns into a no-op but gives information to the compiler. + * + * @param array The array. + * @param ptr Pointer to array + * @param type Type of array + * @param alignment The alignment in bytes of the array. + */ +#define swift_declare_aligned_ptr(type, array, ptr, alignment) \ + type *restrict array = ptr; \ + swift_align_information(array, alignment); + +/** + * @brief Macro to tell the compiler that a given number is 0 modulo a given + * size. + * + * Note that this turns into a no-op but gives information to the compiler. + * GCC does not have the equivalent built-in so defaults to nothing. + * + * @param var The variable + * @param size The modulo of interest. + */ +#if defined(__ICC) +#define swift_assume_size(var, size) __assume(var % size == 0); +#else +#define swift_assume_size(var, size) ; +#endif + #endif /* SWIFT_ALIGN_H */ diff --git a/src/cache.h b/src/cache.h index 1b6d89f03fafa1595cb9bdf857c9f2bc1cba316e..170eb8f5cb0939e85647bbb4622d66cc78ef3dbe 100644 --- a/src/cache.h +++ b/src/cache.h @@ -23,6 +23,7 @@ #include "../config.h" /* Local headers */ +#include "align.h" #include "cell.h" #include "error.h" #include "part.h" @@ -30,9 +31,7 @@ #include "vector.h" #define NUM_VEC_PROC 2 -#define CACHE_ALIGN 64 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE) -#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE #ifdef WITH_VECTORIZATION /* Cache struct to hold a local copy of a cells' particle @@ -40,31 +39,31 @@ struct cache { /* Particle x position. */ - float *restrict x __attribute__((aligned(CACHE_ALIGN))); + float *restrict x SWIFT_CACHE_ALIGN; /* Particle y position. */ - float *restrict y __attribute__((aligned(CACHE_ALIGN))); + float *restrict y SWIFT_CACHE_ALIGN; /* Particle z position. */ - float *restrict z __attribute__((aligned(CACHE_ALIGN))); + float *restrict z SWIFT_CACHE_ALIGN; /* Particle smoothing length. */ - float *restrict h __attribute__((aligned(CACHE_ALIGN))); + float *restrict h SWIFT_CACHE_ALIGN; /* Particle mass. */ - float *restrict m __attribute__((aligned(CACHE_ALIGN))); + float *restrict m SWIFT_CACHE_ALIGN; /* Particle x velocity. */ - float *restrict vx __attribute__((aligned(CACHE_ALIGN))); + float *restrict vx SWIFT_CACHE_ALIGN; /* Particle y velocity. */ - float *restrict vy __attribute__((aligned(CACHE_ALIGN))); + float *restrict vy SWIFT_CACHE_ALIGN; /* Particle z velocity. */ - float *restrict vz __attribute__((aligned(CACHE_ALIGN))); + float *restrict vz SWIFT_CACHE_ALIGN; - /* Maximum distance of particles into neighbouring cell. */ - float *restrict max_d __attribute__((aligned(CACHE_ALIGN))); + /* Maximum index into neighbouring cell for particles that are in range. */ + int *restrict max_index SWIFT_CACHE_ALIGN; /* Cache size. */ int count; @@ -75,28 +74,28 @@ struct cache { struct c2_cache { /* Separation between two particles squared. */ - float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* x separation between two particles. */ - float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* y separation between two particles. */ - float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* z separation between two particles. */ - float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* Mass of particle pj. */ - float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* x velocity of particle pj. */ - float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* y velocity of particle pj. */ - float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* z velocity of particle pj. */ - float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; }; /** @@ -111,9 +110,10 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, /* Align cache on correct byte boundary and pad cache size to be a multiple of * the vector size * and include 2 vector lengths for remainder operations. */ - unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE; + size_t pad = 2 * VEC_SIZE, rem = count % VEC_SIZE; if (rem > 0) pad += VEC_SIZE - rem; - unsigned int sizeBytes = (count + pad) * sizeof(float); + size_t sizeBytes = (count + pad) * sizeof(float); + size_t sizeIntBytes = (count + pad) * sizeof(int); int error = 0; /* Free memory if cache has already been allocated. */ @@ -126,18 +126,19 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, free(c->vy); free(c->vz); free(c->h); - free(c->max_d); + free(c->max_index); } - error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes); + error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT, + sizeIntBytes); if (error != 0) error("Couldn't allocate cache, no. of particles: %d", (int)count); @@ -151,156 +152,43 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, * @param ci_cache The cache. */ __attribute__((always_inline)) INLINE void cache_read_particles( - const struct cell *const ci, struct cache *const ci_cache) { + const struct cell *restrict const ci, + struct cache *restrict const ci_cache) { #if defined(GADGET2_SPH) -/* Shift the particles positions to a local frame so single precision can be - * used instead of double precision. */ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma vector aligned -#endif - for (int i = 0; i < ci->count; i++) { - ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0]; - ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1]; - ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2]; - ci_cache->h[i] = ci->parts[i].h; - - ci_cache->m[i] = ci->parts[i].mass; - ci_cache->vx[i] = ci->parts[i].v[0]; - ci_cache->vy[i] = ci->parts[i].v[1]; - ci_cache->vz[i] = ci->parts[i].v[2]; - } - -#endif -} - -/** - * @brief Populate cache by reading in the particles from two cells in unsorted - * order. - * - * @param ci The i #cell. - * @param cj The j #cell. - * @param ci_cache The cache for cell ci. - * @param cj_cache The cache for cell cj. - * @param shift The amount to shift the particle positions to account for BCs - */ -__attribute__((always_inline)) INLINE void cache_read_two_cells( - const struct cell *const ci, const struct cell *const cj, - struct cache *const ci_cache, struct cache *const cj_cache, - const double *const shift) { - - /* Shift the particles positions to a local frame (ci frame) so single - * precision can be - * used instead of double precision. Also shift the cell ci, particles - * positions due to BCs but leave cell cj. */ - for (int i = 0; i < ci->count; i++) { - ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0] - shift[0]; - ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1] - shift[1]; - ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2] - shift[2]; - ci_cache->h[i] = ci->parts[i].h; - - ci_cache->m[i] = ci->parts[i].mass; - ci_cache->vx[i] = ci->parts[i].v[0]; - ci_cache->vy[i] = ci->parts[i].v[1]; - ci_cache->vz[i] = ci->parts[i].v[2]; - } - - for (int i = 0; i < cj->count; i++) { - cj_cache->x[i] = cj->parts[i].x[0] - ci->loc[0]; - cj_cache->y[i] = cj->parts[i].x[1] - ci->loc[1]; - cj_cache->z[i] = cj->parts[i].x[2] - ci->loc[2]; - cj_cache->h[i] = cj->parts[i].h; - - cj_cache->m[i] = cj->parts[i].mass; - cj_cache->vx[i] = cj->parts[i].v[0]; - cj_cache->vy[i] = cj->parts[i].v[1]; - cj_cache->vz[i] = cj->parts[i].v[2]; - } -} - -__attribute__((always_inline)) INLINE void cache_read_cell_sorted( - const struct cell *const ci, struct cache *const ci_cache, - const struct entry *restrict sort_i, double *const loc, - double *const shift) { - - int idx; -/* Shift the particles positions to a local frame (ci frame) so single precision - * can be - * used instead of double precision. Also shift the cell ci, particles positions - * due to BCs but leave cell cj. */ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma simd -#endif - for (int i = 0; i < ci->count; i++) { - idx = sort_i[i].i; - - ci_cache->x[i] = ci->parts[idx].x[0] - loc[0] - shift[0]; - ci_cache->y[i] = ci->parts[idx].x[1] - loc[1] - shift[1]; - ci_cache->z[i] = ci->parts[idx].x[2] - loc[2] - shift[2]; - ci_cache->h[i] = ci->parts[idx].h; - - ci_cache->m[i] = ci->parts[idx].mass; - ci_cache->vx[i] = ci->parts[idx].v[0]; - ci_cache->vy[i] = ci->parts[idx].v[1]; - ci_cache->vz[i] = ci->parts[idx].v[2]; - } -} - -/** - * @brief Populate cache by reading in the particles from two cells in sorted - * order. - * - * @param ci The i #cell. - * @param cj The j #cell. - * @param ci_cache The #cache for cell ci. - * @param cj_cache The #cache for cell cj. - * @param sort_i The array of sorted particle indices for cell ci. - * @param sort_j The array of sorted particle indices for cell ci. - * @param shift The amount to shift the particle positions to account for BCs - */ -__attribute__((always_inline)) INLINE void cache_read_two_cells_sorted( - const struct cell *const ci, const struct cell *const cj, - struct cache *const ci_cache, struct cache *const cj_cache, - const struct entry *restrict sort_i, const struct entry *restrict sort_j, - const double *const shift) { - - int idx; -/* Shift the particles positions to a local frame (ci frame) so single precision - * can be - * used instead of double precision. Also shift the cell ci, particles positions - * due to BCs but leave cell cj. */ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma simd -#endif + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. */ + swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); + + const struct part *restrict parts = ci->parts; + double loc[3]; + loc[0] = ci->loc[0]; + loc[1] = ci->loc[1]; + loc[2] = ci->loc[2]; + + /* Shift the particles positions to a local frame so single precision can be + * used instead of double precision. */ for (int i = 0; i < ci->count; i++) { - idx = sort_i[i].i; - ci_cache->x[i] = ci->parts[idx].x[0] - ci->loc[0] - shift[0]; - ci_cache->y[i] = ci->parts[idx].x[1] - ci->loc[1] - shift[1]; - ci_cache->z[i] = ci->parts[idx].x[2] - ci->loc[2] - shift[2]; - ci_cache->h[i] = ci->parts[idx].h; - - ci_cache->m[i] = ci->parts[idx].mass; - ci_cache->vx[i] = ci->parts[idx].v[0]; - ci_cache->vy[i] = ci->parts[idx].v[1]; - ci_cache->vz[i] = ci->parts[idx].v[2]; + x[i] = (float)(parts[i].x[0] - loc[0]); + y[i] = (float)(parts[i].x[1] - loc[1]); + z[i] = (float)(parts[i].x[2] - loc[2]); + h[i] = parts[i].h; + + m[i] = parts[i].mass; + vx[i] = parts[i].v[0]; + vy[i] = parts[i].v[1]; + vz[i] = parts[i].v[2]; } -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma simd #endif - for (int i = 0; i < cj->count; i++) { - idx = sort_j[i].i; - cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0]; - cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1]; - cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2]; - cj_cache->h[i] = cj->parts[idx].h; - - cj_cache->m[i] = cj->parts[idx].mass; - cj_cache->vx[i] = cj->parts[idx].v[0]; - cj_cache->vy[i] = cj->parts[idx].v[1]; - cj_cache->vz[i] = cj->parts[idx].v[2]; - } } /** @@ -321,13 +209,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted( * interaction. */ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( - const struct cell *const ci, const struct cell *const cj, - struct cache *const ci_cache, struct cache *const cj_cache, - const struct entry *restrict sort_i, const struct entry *restrict sort_j, - const double *const shift, int *first_pi, int *last_pj, - const int num_vec_proc) { + const struct cell *restrict const ci, const struct cell *restrict const cj, + struct cache *restrict const ci_cache, + struct cache *restrict const cj_cache, const struct entry *restrict sort_i, + const struct entry *restrict sort_j, const double *restrict const shift, + int *first_pi, int *last_pj, const int num_vec_proc) { - int idx, ci_cache_idx; + int idx; /* Pad number of particles read to the vector size. */ int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE); if (rem != 0) { @@ -345,74 +233,97 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( int first_pi_align = *first_pi; int last_pj_align = *last_pj; - -/* Shift the particles positions to a local frame (ci frame) so single precision - * can be - * used instead of double precision. Also shift the cell ci, particles positions - * due to BCs but leave cell cj. */ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma vector aligned -#endif - for (int i = first_pi_align; i < ci->count; i++) { - /* Make sure ci_cache is filled from the first element. */ - ci_cache_idx = i - first_pi_align; - idx = sort_i[i].i; - ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0]; - ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1]; - ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2]; - ci_cache->h[ci_cache_idx] = ci->parts[idx].h; - - ci_cache->m[ci_cache_idx] = ci->parts[idx].mass; - ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0]; - ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1]; - ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2]; + const struct part *restrict parts_i = ci->parts; + const struct part *restrict parts_j = cj->parts; + double loc[3]; + loc[0] = ci->loc[0]; + loc[1] = ci->loc[1]; + loc[2] = ci->loc[2]; + + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. */ + swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); + + int ci_cache_count = ci->count - first_pi_align; + /* Shift the particles positions to a local frame (ci frame) so single + * precision + * can be + * used instead of double precision. Also shift the cell ci, particles + * positions + * due to BCs but leave cell cj. */ + for (int i = 0; i < ci_cache_count; i++) { + idx = sort_i[i + first_pi_align].i; + x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]); + y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]); + z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]); + h[i] = parts_i[idx].h; + + m[i] = parts_i[idx].mass; + vx[i] = parts_i[idx].v[0]; + vy[i] = parts_i[idx].v[1]; + vz[i] = parts_i[idx].v[2]; } /* Pad cache with fake particles that exist outside the cell so will not * interact.*/ - float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0]; + float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0]; for (int i = ci->count - first_pi_align; i < ci->count - first_pi_align + VEC_SIZE; i++) { - ci_cache->x[i] = fake_pix; - ci_cache->y[i] = 1.f; - ci_cache->z[i] = 1.f; - ci_cache->h[i] = 1.f; - - ci_cache->m[i] = 1.f; - ci_cache->vx[i] = 1.f; - ci_cache->vy[i] = 1.f; - ci_cache->vz[i] = 1.f; + x[i] = fake_pix; + y[i] = 1.f; + z[i] = 1.f; + h[i] = 1.f; + + m[i] = 1.f; + vx[i] = 1.f; + vy[i] = 1.f; + vz[i] = 1.f; } -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma vector aligned -#endif + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. */ + swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT); + for (int i = 0; i <= last_pj_align; i++) { idx = sort_j[i].i; - cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0]; - cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1]; - cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2]; - cj_cache->h[i] = cj->parts[idx].h; - - cj_cache->m[i] = cj->parts[idx].mass; - cj_cache->vx[i] = cj->parts[idx].v[0]; - cj_cache->vy[i] = cj->parts[idx].v[1]; - cj_cache->vz[i] = cj->parts[idx].v[2]; + xj[i] = (float)(parts_j[idx].x[0] - loc[0]); + yj[i] = (float)(parts_j[idx].x[1] - loc[1]); + zj[i] = (float)(parts_j[idx].x[2] - loc[2]); + hj[i] = parts_j[idx].h; + + mj[i] = parts_j[idx].mass; + vxj[i] = parts_j[idx].v[0]; + vyj[i] = parts_j[idx].v[1]; + vzj[i] = parts_j[idx].v[2]; } /* Pad cache with fake particles that exist outside the cell so will not * interact.*/ float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0]; for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) { - cj_cache->x[i] = fake_pjx; - cj_cache->y[i] = 1.f; - cj_cache->z[i] = 1.f; - cj_cache->h[i] = 1.f; - - cj_cache->m[i] = 1.f; - cj_cache->vx[i] = 1.f; - cj_cache->vy[i] = 1.f; - cj_cache->vz[i] = 1.f; + xj[i] = fake_pjx; + yj[i] = 1.f; + zj[i] = 1.f; + hj[i] = 1.f; + + mj[i] = 1.f; + vxj[i] = 1.f; + vyj[i] = 1.f; + vzj[i] = 1.f; } } @@ -430,7 +341,7 @@ static INLINE void cache_clean(struct cache *c) { free(c->vy); free(c->vz); free(c->h); - free(c->max_d); + free(c->max_index); } } diff --git a/src/debug.c b/src/debug.c index a66aa93b85d586ba62c9ef78d7d7d38b94112836..dca66ee45bed4530bcc6e4765ecd5f96848f8caa 100644 --- a/src/debug.c +++ b/src/debug.c @@ -26,6 +26,7 @@ /* Some standard headers. */ #include <float.h> #include <stdio.h> +#include <unistd.h> /* This object's header. */ #include "debug.h" @@ -524,3 +525,69 @@ void dumpCellRanks(const char *prefix, struct cell *cells_top, int nr_cells) { } #endif /* HAVE_MPI */ + +/** + * @brief parse the process /proc/self/statm file to get the process + * memory use (in KB). Top field in (). + * + * @param size total virtual memory (VIRT) + * @param resident resident non-swapped memory (RES) + * @param share shared (mmap'd) memory (SHR) + * @param trs text (exe) resident set (CODE) + * @param lrs library resident set + * @param drs data+stack resident set (DATA) + * @param dt dirty pages (nDRT) + */ +void getProcMemUse(long *size, long *resident, long *share, long *trs, + long *lrs, long *drs, long *dt) { + + /* Open the file. */ + FILE *file = fopen("/proc/self/statm", "r"); + if (file != NULL) { + int nscan = fscanf(file, "%ld %ld %ld %ld %ld %ld %ld", size, resident, + share, trs, lrs, drs, dt); + + if (nscan == 7) { + /* Convert pages into bytes. Usually 4096, but could be 512 on some + * systems so take care in conversion to KB. */ + long sz = sysconf(_SC_PAGESIZE); + *size *= sz; + *resident *= sz; + *share *= sz; + *trs *= sz; + *lrs *= sz; + *drs *= sz; + *dt *= sz; + + *size /= 1024; + *resident /= 1024; + *share /= 1024; + *trs /= 1024; + *lrs /= 1024; + *drs /= 1024; + *dt /= 1024; + } else { + error("Failed to read sufficient fields from /proc/self/statm"); + } + fclose(file); + } else { + error("Failed to open /proc/self/statm"); + } +} + +/** + * @brief Print the current memory use of the process. A la "top". + */ +void printProcMemUse() { + long size; + long resident; + long share; + long trs; + long lrs; + long drs; + long dt; + getProcMemUse(&size, &resident, &share, &trs, &lrs, &drs, &dt); + printf("## VIRT = %ld , RES = %ld , SHR = %ld , CODE = %ld, DATA = %ld\n", + size, resident, share, trs, drs); + fflush(stdout); +} diff --git a/src/debug.h b/src/debug.h index e07c73c7a68b0ef244b4a30cf3aecebc6ff38f3d..7bea11de6ac3fe82ae9b3f1be84249f75742ecaf 100644 --- a/src/debug.h +++ b/src/debug.h @@ -45,4 +45,7 @@ void dumpMETISGraph(const char *prefix, idx_t nvtxs, idx_t ncon, idx_t *xadj, void dumpCellRanks(const char *prefix, struct cell *cells_top, int nr_cells); #endif +void getProcMemUse(long *size, long *resident, long *share, long *trs, + long *lrs, long *drs, long *dt); +void printProcMemUse(); #endif /* SWIFT_DEBUG_H */ diff --git a/src/dimension.h b/src/dimension.h index 4ee9377b4f9608b2549f3dfee2a683b61a71c76a..0b2093d718a61c6ce850db1970412af3e2e462b9 100644 --- a/src/dimension.h +++ b/src/dimension.h @@ -275,11 +275,11 @@ __attribute__((always_inline)) INLINE static vector pow_dimension_vec( #if defined(HYDRO_DIMENSION_3D) - return (vector)(x.v * x.v * x.v); + return (vector)(vec_mul(vec_mul(x.v, x.v), x.v)); #elif defined(HYDRO_DIMENSION_2D) - return (vector)(x.v * x.v); + return (vector)(vec_mul(x.v, x.v)); #elif defined(HYDRO_DIMENSION_1D) @@ -304,16 +304,16 @@ __attribute__((always_inline)) INLINE static vector pow_dimension_plus_one_vec( #if defined(HYDRO_DIMENSION_3D) - const vector x2 = (vector)(x.v * x.v); - return (vector)(x2.v * x2.v); + const vector x2 = (vector)(vec_mul(x.v, x.v)); + return (vector)(vec_mul(x2.v, x2.v)); #elif defined(HYDRO_DIMENSION_2D) - return (vector)(x.v * x.v * x.v); + return (vector)(vec_mul(x.v, vec_mul(x.v, x.v))); #elif defined(HYDRO_DIMENSION_1D) - return (vector)(x.v * x.v); + return (vector)(vec_mul(x.v, x.v)); #else diff --git a/src/engine.c b/src/engine.c index a45a2fb4375ef63138dd228c18f90bcacac37494..ad9f5bf57d3f1161e07bf83dbc082029e60818c4 100644 --- a/src/engine.c +++ b/src/engine.c @@ -57,6 +57,7 @@ #include "error.h" #include "gravity.h" #include "hydro.h" +#include "map.h" #include "minmax.h" #include "parallel_io.h" #include "part.h" @@ -153,7 +154,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) { struct scheduler *s = &e->sched; const int periodic = e->s->periodic; - const int is_hydro = (e->policy & engine_policy_hydro); + const int is_with_hydro = (e->policy & engine_policy_hydro); const int is_self_gravity = (e->policy & engine_policy_self_gravity); const int is_with_cooling = (e->policy & engine_policy_cooling); const int is_with_sourceterms = (e->policy & engine_policy_sourceterms); @@ -162,15 +163,19 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) { if (c->super == c) { /* Add the sort task. */ - c->sorts = - scheduler_addtask(s, task_type_sort, task_subtype_none, 0, 0, c, NULL); + if (is_with_hydro) { + c->sorts = scheduler_addtask(s, task_type_sort, task_subtype_none, 0, 0, + c, NULL); + } /* Local tasks only... */ if (c->nodeID == e->nodeID) { /* Add the drift task. */ - c->drift_part = scheduler_addtask(s, task_type_drift_part, - task_subtype_none, 0, 0, c, NULL); + if (is_with_hydro) { + c->drift_part = scheduler_addtask(s, task_type_drift_part, + task_subtype_none, 0, 0, c, NULL); + } /* Add the two half kicks */ c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0, 0, @@ -207,7 +212,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) { } /* Generate the ghost tasks. */ - if (is_hydro) { + if (is_with_hydro) { c->ghost_in = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, /* implicit = */ 1, c, NULL); @@ -215,14 +220,13 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) { scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, /* implicit = */ 1, c, NULL); engine_add_ghosts(e, c, c->ghost_in, c->ghost_out); - } #ifdef EXTRA_HYDRO_LOOP - /* Generate the extra ghost task. */ - if (is_hydro) + /* Generate the extra ghost task. */ c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost, task_subtype_none, 0, 0, c, NULL); #endif + } /* Cooling task */ if (is_with_cooling) { @@ -898,6 +902,16 @@ void engine_repartition(struct engine *e) { partition_repartition(e->reparttype, e->nodeID, e->nr_nodes, e->s, e->sched.tasks, e->sched.nr_tasks); + /* Partitioning requires copies of the particles, so we need to reduce the + * memory in use to the minimum, we can free the sorting indices and the + * tasks as these will be regenerated at the next rebuild. */ + + /* Sorting indices. */ + if (e->s->cells_top != NULL) space_free_cells(e->s); + + /* Task arrays. */ + scheduler_free_tasks(&e->sched); + /* Now comes the tricky part: Exchange particles between all nodes. This is done in two steps, first allreducing a matrix of how many particles go from where to where, then re-allocating @@ -1731,7 +1745,7 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements, /* If the cells is local build a self-interaction */ scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL); - /* Deal with periodicity dependencies */ + /* Deal with periodicity FFT task dependencies */ const int ghost_id = cell_getid(cdim_ghost, i / 4, j / 4, k / 4); if (ghost_id > n_ghosts) error("Invalid ghost_id"); if (periodic) { @@ -1739,6 +1753,10 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements, ci->grav_ghost[1] = ghosts[2 * ghost_id + 1]; } + /* Recover the multipole information */ + struct gravity_tensors *const multi_i = ci->multipole; + const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]}; + /* Loop over every other cell */ for (int ii = 0; ii < cdim[0]; ii++) { for (int jj = 0; jj < cdim[1]; jj++) { @@ -1757,11 +1775,27 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements, /* Is that neighbour local ? */ if (cj->nodeID != nodeID) continue; // MATTHIEU - /* Are the cells to close for a MM interaction ? */ - if (!gravity_multipole_accept_rebuild(ci->multipole, cj->multipole, - theta_crit_inv, periodic, - dim)) { + /* Recover the multipole information */ + struct gravity_tensors *const multi_j = cj->multipole; + + /* Get the distance between the CoMs */ + double dx = CoM_i[0] - multi_j->CoM[0]; + double dy = CoM_i[1] - multi_j->CoM[1]; + double dz = CoM_i[2] - multi_j->CoM[2]; + + /* Apply BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); + } + const double r2 = dx * dx + dy * dy + dz * dz; + + /* Are the cells too close for a MM interaction ? */ + if (!gravity_multipole_accept_rebuild(multi_i, multi_j, + theta_crit_inv, r2)) { + /* Ok, we need to add a direct pair calculation */ scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0, 0, ci, cj); } @@ -2796,8 +2830,7 @@ void engine_marktasks_mapper(void *map_data, int num_elements, /* Kick/Drift/init ? */ if (t->type == task_type_kick1 || t->type == task_type_kick2 || - t->type == task_type_drift_part || t->type == task_type_drift_gpart || - t->type == task_type_init_grav) { + t->type == task_type_drift_gpart || t->type == task_type_init_grav) { if (cell_is_active(t->ci, e)) scheduler_activate(s, t); } @@ -4583,13 +4616,13 @@ void engine_init(struct engine *e, struct space *s, error("Failed to initialize barrier."); /* Expected average for tasks per cell. If set to zero we use a heuristic - * guess that hopefully is large enough, but not too large. */ + * guess based on the numbers of cells and how many tasks per cell we expect. */ e->tasks_per_cell = parser_get_opt_param_int(params, "Scheduler:tasks_per_cell", 0); /* Init the scheduler. */ scheduler_init(&e->sched, e->s, engine_estimate_nr_tasks(e), nr_queues, - scheduler_flag_steal, e->nodeID, &e->threadpool); + (policy & scheduler_flag_steal), e->nodeID, &e->threadpool); /* Allocate and init the threads. */ if ((e->runners = (struct runner *)malloc(sizeof(struct runner) * @@ -4633,8 +4666,12 @@ void engine_init(struct engine *e, struct space *s, e->runners[k].qid = k * nr_queues / e->nr_threads; } -#ifdef WITH_VECTORIZATION /* Allocate particle caches. */ + e->runners[k].ci_gravity_cache.count = 0; + e->runners[k].cj_gravity_cache.count = 0; + gravity_cache_init(&e->runners[k].ci_gravity_cache, space_splitsize); + gravity_cache_init(&e->runners[k].cj_gravity_cache, space_splitsize); +#ifdef WITH_VECTORIZATION e->runners[k].ci_cache.count = 0; e->runners[k].cj_cache.count = 0; cache_init(&e->runners[k].ci_cache, CACHE_SIZE); @@ -4725,8 +4762,12 @@ void engine_compute_next_snapshot_time(struct engine *e) { void engine_clean(struct engine *e) { #ifdef WITH_VECTORIZATION - for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].ci_cache); - for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].cj_cache); + for (int i = 0; i < e->nr_threads; ++i) { + cache_clean(&e->runners[i].ci_cache); + cache_clean(&e->runners[i].cj_cache); + gravity_cache_clean(&e->runners[i].ci_gravity_cache); + gravity_cache_clean(&e->runners[i].cj_gravity_cache); + } #endif free(e->runners); free(e->snapshotUnits); diff --git a/src/gravity.c b/src/gravity.c index 579546182445ce3fe58c2bbd43c328c0df2d377b..f58bc1b7456bc5dfc95b4c976ebda8e1999ff3e0 100644 --- a/src/gravity.c +++ b/src/gravity.c @@ -21,9 +21,15 @@ #include "../config.h" /* Some standard headers. */ +#include <float.h> #include <stdio.h> +#include <stdlib.h> #include <unistd.h> +#ifdef HAVE_HDF5 +#include <hdf5.h> +#endif + /* This object's header. */ #include "gravity.h" @@ -39,6 +45,256 @@ struct exact_force_data { double const_G; }; +#ifdef SWIFT_GRAVITY_FORCE_CHECKS + +/* Size of the Ewald table */ +#define Newald 64 + +/* Components of the Ewald correction */ +static float fewald_x[Newald + 1][Newald + 1][Newald + 1]; +static float fewald_y[Newald + 1][Newald + 1][Newald + 1]; +static float fewald_z[Newald + 1][Newald + 1][Newald + 1]; + +/* Factor used to normalize the access to the Ewald table */ +float ewald_fac; +#endif + +/** + * @brief Allocates the memory and computes one octant of the + * Ewald correction table. + * + * We follow Hernquist, Bouchet & Suto, 1991, ApJS, Volume 75, p.231-240, + * equations (2.14a) and (2.14b) with alpha = 2. We consider all terms with + * |x - nL| < 4L and |h|^2 < 16. + * + * @param boxSize The side-length (L) of the volume. + */ +void gravity_exact_force_ewald_init(double boxSize) { + +#ifdef SWIFT_GRAVITY_FORCE_CHECKS + const ticks tic = getticks(); + message("Computing Ewald correction table..."); + + /* Level of correction (Hernquist et al. 1991)*/ + const float alpha = 2.f; + + /* some useful constants */ + const float alpha2 = alpha * alpha; + const float factor_exp1 = 2.f * alpha / sqrt(M_PI); + const float factor_exp2 = -M_PI * M_PI / alpha2; + const float factor_sin = 2.f * M_PI; + const float boxSize_inv2 = 1.f / (boxSize * boxSize); + + /* Ewald factor to access the table */ + ewald_fac = (double)(2 * Newald) / boxSize; + + /* Zero everything */ + bzero(fewald_x, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float)); + bzero(fewald_y, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float)); + bzero(fewald_z, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float)); + + /* Compute the values in one of the octants */ + for (int i = 0; i <= Newald; ++i) { + for (int j = 0; j <= Newald; ++j) { + for (int k = 0; k <= Newald; ++k) { + + if (i == 0 && j == 0 && k == 0) continue; + + /* Distance vector */ + const float r_x = 0.5f * ((float)i) / Newald; + const float r_y = 0.5f * ((float)j) / Newald; + const float r_z = 0.5f * ((float)k) / Newald; + + /* Norm of distance vector */ + const float r2 = r_x * r_x + r_y * r_y + r_z * r_z; + const float r_inv = 1.f / sqrtf(r2); + const float r_inv3 = r_inv * r_inv * r_inv; + + /* Normal gravity potential term */ + float f_x = r_x * r_inv3; + float f_y = r_y * r_inv3; + float f_z = r_z * r_inv3; + + for (int n_i = -4; n_i <= 4; ++n_i) { + for (int n_j = -4; n_j <= 4; ++n_j) { + for (int n_k = -4; n_k <= 4; ++n_k) { + + const float d_x = r_x - n_i; + const float d_y = r_y - n_j; + const float d_z = r_z - n_k; + + /* Discretised distance */ + const float r_tilde2 = d_x * d_x + d_y * d_y + d_z * d_z; + const float r_tilde_inv = 1.f / sqrtf(r_tilde2); + const float r_tilde = r_tilde_inv * r_tilde2; + const float r_tilde_inv3 = + r_tilde_inv * r_tilde_inv * r_tilde_inv; + + const float val = + erfcf(alpha * r_tilde) + + factor_exp1 * r_tilde * expf(-alpha2 * r_tilde2); + + /* First correction term */ + const float f = val * r_tilde_inv3; + f_x -= f * d_x; + f_y -= f * d_y; + f_z -= f * d_z; + } + } + } + + for (int h_i = -4; h_i <= 4; ++h_i) { + for (int h_j = -4; h_j <= 4; ++h_j) { + for (int h_k = -4; h_k <= 4; ++h_k) { + + const float h2 = h_i * h_i + h_j * h_j + h_k * h_k; + + const float h2_inv = 1.f / (h2 + FLT_MIN); + const float h_dot_x = h_i * r_x + h_j * r_y + h_k * r_z; + + const float val = 2.f * h2_inv * expf(h2 * factor_exp2) * + sinf(factor_sin * h_dot_x); + + /* Second correction term */ + f_x -= val * h_i; + f_y -= val * h_j; + f_z -= val * h_k; + } + } + } + + /* Save back to memory */ + fewald_x[i][j][k] = f_x; + fewald_y[i][j][k] = f_y; + fewald_z[i][j][k] = f_z; + } + } + } + +/* Dump the Ewald table to a file */ +#ifdef HAVE_HDF5 + hid_t h_file = + H5Fcreate("Ewald.hdf5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); + if (h_file < 0) error("Error while opening file for Ewald dump."); + + /* Create dataspace */ + hsize_t dim[3] = {Newald + 1, Newald + 1, Newald + 1}; + hid_t h_space = H5Screate_simple(3, dim, NULL); + hid_t h_data; + h_data = H5Dcreate(h_file, "Ewald_x", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT, + H5P_DEFAULT, H5P_DEFAULT); + H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT, + &(fewald_x[0][0][0])); + H5Dclose(h_data); + h_data = H5Dcreate(h_file, "Ewald_y", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT, + H5P_DEFAULT, H5P_DEFAULT); + H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT, + &(fewald_y[0][0][0])); + H5Dclose(h_data); + h_data = H5Dcreate(h_file, "Ewald_z", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT, + H5P_DEFAULT, H5P_DEFAULT); + H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT, + &(fewald_z[0][0][0])); + H5Dclose(h_data); + H5Sclose(h_space); + H5Fclose(h_file); +#endif + + /* Apply the box-size correction */ + for (int i = 0; i <= Newald; ++i) { + for (int j = 0; j <= Newald; ++j) { + for (int k = 0; k <= Newald; ++k) { + fewald_x[i][j][k] *= boxSize_inv2; + fewald_y[i][j][k] *= boxSize_inv2; + fewald_z[i][j][k] *= boxSize_inv2; + } + } + } + + /* Say goodbye */ + message("Ewald correction table computed (took %.3f %s). ", + clocks_from_ticks(getticks() - tic), clocks_getunit()); +#else + error("Gravity checking function called without the corresponding flag."); +#endif +} + +#ifdef SWIFT_GRAVITY_FORCE_CHECKS +/** + * @brief Compute the Ewald correction for a given distance vector r. + * + * We interpolate the Ewald correction tables using a tri-linear interpolation + * similar to a CIC. + * + * @param rx x-coordinate of distance vector. + * @param ry y-coordinate of distance vector. + * @param rz z-coordinate of distance vector. + * @param corr (return) The Ewald correction. + */ +__attribute__((always_inline)) INLINE static void +gravity_exact_force_ewald_evaluate(double rx, double ry, double rz, + double corr[3]) { + + const double s_x = (rx < 0.) ? 1. : -1.; + const double s_y = (ry < 0.) ? 1. : -1.; + const double s_z = (rz < 0.) ? 1. : -1.; + rx = fabs(rx); + ry = fabs(ry); + rz = fabs(rz); + + int i = (int)(rx * ewald_fac); + if (i >= Newald) i = Newald - 1; + const double dx = rx * ewald_fac - i; + const double tx = 1. - dx; + + int j = (int)(ry * ewald_fac); + if (j >= Newald) j = Newald - 1; + const double dy = ry * ewald_fac - j; + const double ty = 1. - dy; + + int k = (int)(rz * ewald_fac); + if (k >= Newald) k = Newald - 1; + const double dz = rz * ewald_fac - k; + const double tz = 1. - dz; + + /* Interpolation in X */ + corr[0] = 0.; + corr[0] += fewald_x[i + 0][j + 0][k + 0] * tx * ty * tz; + corr[0] += fewald_x[i + 0][j + 0][k + 1] * tx * ty * dz; + corr[0] += fewald_x[i + 0][j + 1][k + 0] * tx * dy * tz; + corr[0] += fewald_x[i + 0][j + 1][k + 1] * tx * dy * dz; + corr[0] += fewald_x[i + 1][j + 0][k + 0] * dx * ty * tz; + corr[0] += fewald_x[i + 1][j + 0][k + 1] * dx * ty * dz; + corr[0] += fewald_x[i + 1][j + 1][k + 0] * dx * dy * tz; + corr[0] += fewald_x[i + 1][j + 1][k + 1] * dx * dy * dz; + corr[0] *= s_x; + + /* Interpolation in Y */ + corr[1] = 0.; + corr[1] += fewald_y[i + 0][j + 0][k + 0] * tx * ty * tz; + corr[1] += fewald_y[i + 0][j + 0][k + 1] * tx * ty * dz; + corr[1] += fewald_y[i + 0][j + 1][k + 0] * tx * dy * tz; + corr[1] += fewald_y[i + 0][j + 1][k + 1] * tx * dy * dz; + corr[1] += fewald_y[i + 1][j + 0][k + 0] * dx * ty * tz; + corr[1] += fewald_y[i + 1][j + 0][k + 1] * dx * ty * dz; + corr[1] += fewald_y[i + 1][j + 1][k + 0] * dx * dy * tz; + corr[1] += fewald_y[i + 1][j + 1][k + 1] * dx * dy * dz; + corr[1] *= s_y; + + /* Interpolation in Z */ + corr[2] = 0.; + corr[2] += fewald_z[i + 0][j + 0][k + 0] * tx * ty * tz; + corr[2] += fewald_z[i + 0][j + 0][k + 1] * tx * ty * dz; + corr[2] += fewald_z[i + 0][j + 1][k + 0] * tx * dy * tz; + corr[2] += fewald_z[i + 0][j + 1][k + 1] * tx * dy * dz; + corr[2] += fewald_z[i + 1][j + 0][k + 0] * dx * ty * tz; + corr[2] += fewald_z[i + 1][j + 0][k + 1] * dx * ty * dz; + corr[2] += fewald_z[i + 1][j + 1][k + 0] * dx * dy * tz; + corr[2] += fewald_z[i + 1][j + 1][k + 1] * dx * dy * dz; + corr[2] *= s_z; +} +#endif + /** * @brief Checks whether the file containing the exact accelerations for * the current choice of parameters already exists. @@ -116,6 +372,12 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts, if (gpi->id_or_neg_offset % SWIFT_GRAVITY_FORCE_CHECKS == 0 && gpart_is_active(gpi, e)) { + /* Get some information about the particle */ + const double pix[3] = {gpi->x[0], gpi->x[1], gpi->x[2]}; + const double hi = gpi->epsilon; + const double hi_inv = 1. / hi; + const double hi_inv3 = hi_inv * hi_inv * hi_inv; + /* Be ready for the calculation */ double a_grav[3] = {0., 0., 0.}; @@ -128,9 +390,9 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts, if (gpi == gpj) continue; /* Compute the pairwise distance. */ - double dx = gpi->x[0] - gpj->x[0]; - double dy = gpi->x[1] - gpj->x[1]; - double dz = gpi->x[2] - gpj->x[2]; + double dx = gpj->x[0] - pix[0]; + double dy = gpj->x[1] - pix[1]; + double dz = gpj->x[2] - pix[2]; /* Now apply periodic BC */ if (periodic) { @@ -140,34 +402,41 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts, } const double r2 = dx * dx + dy * dy + dz * dz; - const double r = sqrt(r2); - const double ir = 1. / r; + const double r_inv = 1. / sqrt(r2); + const double r = r2 * r_inv; const double mj = gpj->mass; - const double hi = gpi->epsilon; double f; - const double f_lr = 1.; if (r >= hi) { /* Get Newtonian gravity */ - f = mj * ir * ir * ir * f_lr; + f = mj * r_inv * r_inv * r_inv; } else { - const double hi_inv = 1. / hi; - const double hi_inv3 = hi_inv * hi_inv * hi_inv; const double ui = r * hi_inv; double W; kernel_grav_eval_double(ui, &W); /* Get softened gravity */ - f = mj * hi_inv3 * W * f_lr; + f = mj * hi_inv3 * W; } - a_grav[0] -= f * dx; - a_grav[1] -= f * dy; - a_grav[2] -= f * dz; + a_grav[0] += f * dx; + a_grav[1] += f * dy; + a_grav[2] += f * dz; + + /* Apply Ewald correction for periodic BC */ + if (periodic && r > 1e-5 * hi) { + + double corr[3]; + gravity_exact_force_ewald_evaluate(dx, dy, dz, corr); + + a_grav[0] += mj * corr[0]; + a_grav[1] += mj * corr[1]; + a_grav[2] += mj * corr[2]; + } } /* Store the exact answer */ diff --git a/src/gravity.h b/src/gravity.h index 00b930c00fb2558f274feb2991b78e96dc8b990b..85e42370bc456dceb577c42ee609e3f0724e14ea 100644 --- a/src/gravity.h +++ b/src/gravity.h @@ -34,6 +34,8 @@ #include "./gravity/Default/gravity.h" #include "./gravity/Default/gravity_iact.h" +void gravity_exact_force_ewald_init(double boxSize); +void gravity_exact_force_ewald_free(); void gravity_exact_force_compute(struct space *s, const struct engine *e); void gravity_exact_force_check(struct space *s, const struct engine *e, float rel_tol); diff --git a/src/gravity_cache.h b/src/gravity_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..14b672233aa9958ec39af32a87baead98c0bae04 --- /dev/null +++ b/src/gravity_cache.h @@ -0,0 +1,247 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ +#ifndef SWIFT_GRAVITY_CACHE_H +#define SWIFT_GRAVITY_CACHE_H + +/* Config parameters. */ +#include "../config.h" + +/* Local headers */ +#include "align.h" +#include "error.h" +#include "gravity.h" +#include "vector.h" + +/** + * @brief A SoA object for the #gpart of a cell. + * + * This is used to help vectorize the leaf-leaf gravity interactions. + */ +struct gravity_cache { + + /*! #gpart x position. */ + float *restrict x SWIFT_CACHE_ALIGN; + + /*! #gpart y position. */ + float *restrict y SWIFT_CACHE_ALIGN; + + /*! #gpart z position. */ + float *restrict z SWIFT_CACHE_ALIGN; + + /*! #gpart softening length. */ + float *restrict epsilon SWIFT_CACHE_ALIGN; + + /*! #gpart mass. */ + float *restrict m SWIFT_CACHE_ALIGN; + + /*! #gpart x acceleration. */ + float *restrict a_x SWIFT_CACHE_ALIGN; + + /*! #gpart y acceleration. */ + float *restrict a_y SWIFT_CACHE_ALIGN; + + /*! #gpart z acceleration. */ + float *restrict a_z SWIFT_CACHE_ALIGN; + + /*! Cache size */ + int count; +}; + +/** + * @brief Frees the memory allocated in a #gravity_cache + * + * @param c The #gravity_cache to free. + */ +static INLINE void gravity_cache_clean(struct gravity_cache *c) { + + if (c->count > 0) { + free(c->x); + free(c->y); + free(c->z); + free(c->epsilon); + free(c->m); + free(c->a_x); + free(c->a_y); + free(c->a_z); + } + c->count = 0; +} + +/** + * @brief Allocates memory for the #gpart caches used in the leaf-leaf + * interactions. + * + * The cache is padded for the vector size and aligned properly + * + * @param c The #gravity_cache to allocate. + * @param count The number of #gpart to allocated for (space_splitsize is a good + * choice). + */ +static INLINE void gravity_cache_init(struct gravity_cache *c, int count) { + + /* Size of the gravity cache */ + const int padded_count = count - (count % VEC_SIZE) + VEC_SIZE; + const size_t sizeBytes = padded_count * sizeof(float); + + /* Delete old stuff if any */ + gravity_cache_clean(c); + + int error = 0; + error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += + posix_memalign((void **)&c->epsilon, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->a_x, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->a_y, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->a_z, SWIFT_CACHE_ALIGNMENT, sizeBytes); + + if (error != 0) + error("Couldn't allocate gravity cache, size: %d", padded_count); + + c->count = padded_count; +} + +/** + * @brief Fills a #gravity_cache structure with some #gpart and shift them. + * + * @param c The #gravity_cache to fill. + * @param gparts The #gpart array to read from. + * @param gcount The number of particles to read. + * @param gcount_padded The number of particle to read padded to the next + * multiple of the vector length. + * @param shift A shift to apply to all the particles. + */ +__attribute__((always_inline)) INLINE void gravity_cache_populate( + struct gravity_cache *c, const struct gpart *restrict gparts, int gcount, + int gcount_padded, const double shift[3]) { + + /* Make the compiler understand we are in happy vectorization land */ + float *restrict x = c->x; + float *restrict y = c->y; + float *restrict z = c->z; + float *restrict m = c->m; + float *restrict epsilon = c->epsilon; + swift_align_information(x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(epsilon, SWIFT_CACHE_ALIGNMENT); + swift_align_information(m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded, VEC_SIZE); + + /* Fill the input caches */ + for (int i = 0; i < gcount; ++i) { + x[i] = (float)(gparts[i].x[0] - shift[0]); + y[i] = (float)(gparts[i].x[1] - shift[1]); + z[i] = (float)(gparts[i].x[2] - shift[2]); + epsilon[i] = gparts[i].epsilon; + m[i] = gparts[i].mass; + } + +#ifdef SWIFT_DEBUG_CHECKS + if (gcount_padded < gcount) error("Padded counter smaller than counter"); +#endif + + /* Pad the caches */ + for (int i = gcount; i < gcount_padded; ++i) { + x[i] = 0.f; + y[i] = 0.f; + z[i] = 0.f; + epsilon[i] = 0.f; + m[i] = 0.f; + } +} + +/** + * @brief Fills a #gravity_cache structure with some #gpart. + * + * @param c The #gravity_cache to fill. + * @param gparts The #gpart array to read from. + * @param gcount The number of particles to read. + * @param gcount_padded The number of particle to read padded to the next + * multiple of the vector length. + */ +__attribute__((always_inline)) INLINE void gravity_cache_populate_no_shift( + struct gravity_cache *c, const struct gpart *restrict gparts, int gcount, + int gcount_padded) { + + /* Make the compiler understand we are in happy vectorization land */ + float *restrict x = c->x; + float *restrict y = c->y; + float *restrict z = c->z; + float *restrict m = c->m; + float *restrict epsilon = c->epsilon; + swift_align_information(x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(epsilon, SWIFT_CACHE_ALIGNMENT); + swift_align_information(m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded, VEC_SIZE); + + /* Fill the input caches */ + for (int i = 0; i < gcount; ++i) { + x[i] = (float)(gparts[i].x[0]); + y[i] = (float)(gparts[i].x[1]); + z[i] = (float)(gparts[i].x[2]); + epsilon[i] = gparts[i].epsilon; + m[i] = gparts[i].mass; + } + +#ifdef SWIFT_DEBUG_CHECKS + if (gcount_padded < gcount) error("Padded counter smaller than counter"); +#endif + + /* Pad the caches */ + for (int i = gcount; i < gcount_padded; ++i) { + x[i] = 0.f; + y[i] = 0.f; + z[i] = 0.f; + epsilon[i] = 0.f; + m[i] = 0.f; + } +} + +/** + * @brief Write the output cache values back to the #gpart. + * + * @param c The #gravity_cache to read from. + * @param gparts The #gpart array to write to. + * @param gcount The number of particles to write. + */ +__attribute__((always_inline)) INLINE void gravity_cache_write_back( + const struct gravity_cache *c, struct gpart *restrict gparts, int gcount) { + + /* Make the compiler understand we are in happy vectorization land */ + float *restrict a_x = c->a_x; + float *restrict a_y = c->a_y; + float *restrict a_z = c->a_z; + swift_align_information(a_x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(a_y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(a_z, SWIFT_CACHE_ALIGNMENT); + + /* Write stuff back to the particles */ + for (int i = 0; i < gcount; ++i) { + gparts[i].a_grav[0] += a_x[i]; + gparts[i].a_grav[1] += a_y[i]; + gparts[i].a_grav[2] += a_z[i]; + } +} + +#endif /* SWIFT_GRAVITY_CACHE_H */ diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 25eb1e4b76d088d7dcd3d4ba57fe5901188be000..9e06fec92f9667d21ce7dc196ba26f9870cb24f4 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -388,7 +388,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector mask, int knlMask) { + mask_t mask) { vector r, ri, ui, wi, wi_dx; vector mj; @@ -432,49 +432,24 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, curlvry.v = vec_mul(curlvry.v, ri.v); curlvrz.v = vec_mul(curlvrz.v, ri.v); -/* Mask updates to intermediate vector sums for particle pi. */ -#ifdef HAVE_AVX512_F - rhoSum->v = - _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v); + vector wcount_dh_update; + wcount_dh_update.v = + vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)); + /* Mask updates to intermediate vector sums for particle pi. */ + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); rho_dhSum->v = - _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, - vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v)))); - - wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v); - - wcount_dhSum->v = _mm512_mask_sub_ps( - rho_dhSum->v, knlMask, rho_dhSum->v, - vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v))); - - div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, - vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); - - curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), - curlvxSum->v); - - curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), - curlvySum->v); - - curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), - curlvzSum->v); -#else - rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v); - rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v))), - mask.v); - wcountSum->v += vec_and(wi.v, mask.v); - wcount_dhSum->v -= vec_and( - vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v); - div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v); - curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v); - curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v); - curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v); -#endif + vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask); + wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask); + div_vSum->v = + vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); + curlvxSum->v = vec_mask_add(curlvxSum->v, + vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); + curlvySum->v = vec_mask_add(curlvySum->v, + vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); + curlvzSum->v = vec_mask_add(curlvzSum->v, + vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); } /** @@ -482,12 +457,14 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, * (non-symmetric vectorized version). */ __attribute__((always_inline)) INLINE static void -runner_iact_nonsym_2_vec_density( - float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix, - vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, - vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, - vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector mask, vector mask2, int knlMask, int knlMask2) { +runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, + vector hi_inv, vector vix, vector viy, + vector viz, float *Vjx, float *Vjy, float *Vjz, + float *Mj, vector *rhoSum, vector *rho_dhSum, + vector *wcountSum, vector *wcount_dhSum, + vector *div_vSum, vector *curlvxSum, + vector *curlvySum, vector *curlvzSum, + mask_t mask, mask_t mask2, short mask_cond) { vector r, ri, r2, ui, wi, wi_dx; vector mj; @@ -567,88 +544,66 @@ runner_iact_nonsym_2_vec_density( curlvrz.v = vec_mul(curlvrz.v, ri.v); curlvrz2.v = vec_mul(curlvrz2.v, ri2.v); -/* Mask updates to intermediate vector sums for particle pi. */ -#ifdef HAVE_AVX512_F - rhoSum->v = - _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v); - rhoSum->v = - _mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v); - - rho_dhSum->v = - _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, - vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v)))); - rho_dhSum->v = - _mm512_mask_sub_ps(rho_dhSum->v, knlMask2, rho_dhSum->v, - vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi2.v, - vec_mul(ui2.v, wi_dx2.v)))); - - wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v); - wcountSum->v = - _mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v); - - wcount_dhSum->v = _mm512_mask_sub_ps( - rho_dhSum->v, knlMask, rho_dhSum->v, - vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v))); - - wcount_dhSum->v = _mm512_mask_sub_ps( - rho_dhSum->v, knlMask2, rho_dhSum->v, - vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v))); - - div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, - vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); - div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v, - vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); - - curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), - curlvxSum->v); - curlvxSum->v = _mm512_mask_add_ps( - curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), - curlvxSum->v); - - curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), - curlvySum->v); - curlvySum->v = _mm512_mask_add_ps( - curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), - curlvySum->v); - - curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), - curlvzSum->v); - curlvzSum->v = _mm512_mask_add_ps( - curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), - curlvzSum->v); -#else - rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v); - rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v); - rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v))), - mask.v); - rho_dhSum->v -= - vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, - vec_mul(ui2.v, wi_dx2.v))), - mask2.v); - wcountSum->v += vec_and(wi.v, mask.v); - wcountSum->v += vec_and(wi2.v, mask2.v); - wcount_dhSum->v -= vec_and( - vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v); - wcount_dhSum->v -= vec_and( - vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)), - mask2.v); - div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v); - div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v); - curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v); - curlvxSum->v += - vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v); - curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v); - curlvySum->v += - vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v); - curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v); - curlvzSum->v += - vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v); -#endif + vector wcount_dh_update, wcount_dh_update2; + wcount_dh_update.v = + vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)); + wcount_dh_update2.v = + vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)); + + /* Mask updates to intermediate vector sums for particle pi. */ + /* Mask only when needed. */ + if (mask_cond) { + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2); + rho_dhSum->v = + vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask); + rho_dhSum->v = + vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v), mask2); + wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); + wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update2.v, mask2); + div_vSum->v = vec_mask_sub(div_vSum->v, + vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); + div_vSum->v = vec_mask_sub( + div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2); + curlvxSum->v = vec_mask_add( + curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); + curlvxSum->v = vec_mask_add( + curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2); + curlvySum->v = vec_mask_add( + curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); + curlvySum->v = vec_mask_add( + curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2); + curlvzSum->v = vec_mask_add( + curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); + curlvzSum->v = vec_mask_add( + curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2); + } else { + rhoSum->v = vec_add(rhoSum->v, vec_mul(mj.v, wi.v)); + rhoSum->v = vec_add(rhoSum->v, vec_mul(mj2.v, wi2.v)); + rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v)); + rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v)); + wcountSum->v = vec_add(wcountSum->v, wi.v); + wcountSum->v = vec_add(wcountSum->v, wi2.v); + wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update.v); + wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update2.v); + div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); + div_vSum->v = + vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); + curlvxSum->v = + vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v))); + curlvxSum->v = + vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v))); + curlvySum->v = + vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v))); + curlvySum->v = + vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v))); + curlvzSum->v = + vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v))); + curlvzSum->v = + vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v))); + } } #endif @@ -755,7 +710,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef WITH_VECTORIZATION +#ifdef WITH_OLD_VECTORIZATION vector r, r2, ri; vector xi, xj; @@ -1037,7 +992,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef WITH_VECTORIZATION +#ifdef WITH_OLD_VECTORIZATION vector r, r2, ri; vector xi, xj; diff --git a/src/hydro/PressureEntropy/hydro.h b/src/hydro/PressureEntropy/hydro.h index 6d8d06abb126a574fd82c71a809235eb833494c6..080b796b21d7f3b48191cd375574ae1de6d11d1a 100644 --- a/src/hydro/PressureEntropy/hydro.h +++ b/src/hydro/PressureEntropy/hydro.h @@ -212,14 +212,15 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( p->density.pressure_dh -= hydro_dimension * p->mass * p->entropy_one_over_gamma * kernel_root; p->density.wcount += kernel_root; + p->density.wcount_dh -= hydro_dimension * kernel_root; /* Finish the calculation by inserting the missing h-factors */ p->rho *= h_inv_dim; p->rho_bar *= h_inv_dim; p->density.rho_dh *= h_inv_dim_plus_one; p->density.pressure_dh *= h_inv_dim_plus_one; - p->density.wcount *= kernel_norm; - p->density.wcount_dh *= h_inv * kernel_gamma * kernel_norm; + p->density.wcount *= h_inv_dim; + p->density.wcount_dh *= h_inv_dim_plus_one; const float rho_inv = 1.f / p->rho; const float entropy_minus_one_over_gamma = 1.f / p->entropy_one_over_gamma; diff --git a/src/hydro/PressureEntropy/hydro_iact.h b/src/hydro/PressureEntropy/hydro_iact.h index ce1c38ca69954252dc804af9181b9060a14afcb9..37a9f2b01af16fe598b414a9f67123849bee1442 100644 --- a/src/hydro/PressureEntropy/hydro_iact.h +++ b/src/hydro/PressureEntropy/hydro_iact.h @@ -59,7 +59,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( /* Compute contribution to the number of neighbours */ pi->density.wcount += wi; - pi->density.wcount_dh -= ui * wi_dx; + pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx); /* Compute contribution to the weighted density */ pi->rho_bar += mj * pj->entropy_one_over_gamma * wi; @@ -77,7 +77,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( /* Compute contribution to the number of neighbours */ pj->density.wcount += wj; - pj->density.wcount_dh -= uj * wj_dx; + pj->density.wcount_dh -= (hydro_dimension * wj + uj * wj_dx); /* Compute contribution to the weighted density */ pj->rho_bar += mi * pi->entropy_one_over_gamma * wj; @@ -147,7 +147,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( /* Compute contribution to the number of neighbours */ pi->density.wcount += wi; - pi->density.wcount_dh -= ui * wi_dx; + pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx); /* Compute contribution to the weighted density */ pi->rho_bar += mj * pj->entropy_one_over_gamma * wi; diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index f634a59d7ee769951e6560d46a92053c144cc766..5355c15f8f1a0d0c3d811c7039da04caf0522cc9 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -341,20 +341,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx( /* ------------------------------------------------------------------------- */ -#ifdef WITH_VECTORIZATION - -static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv); - -static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals); - -static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant); - -static const vector kernel_gamma_inv_dim_vec = - FILL_VEC((float)kernel_gamma_inv_dim); - -static const vector kernel_gamma_inv_dim_plus_one_vec = - FILL_VEC((float)kernel_gamma_inv_dim_plus_one); - +#ifdef WITH_OLD_VECTORIZATION /** * @brief Computes the kernel function and its derivative (Vectorised version). * @@ -369,11 +356,12 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( /* Go to the range [0,1[ from [0,H[ */ vector x; - x.v = u->v * kernel_gamma_inv_vec.v; + x.v = vec_mul(u->v, kernel_gamma_inv_vec.v); /* Load x and get the interval id. */ vector ind; - ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v)); + ind.m = + vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v)); /* load the coefficients. */ vector c[kernel_degree + 1]; @@ -382,20 +370,36 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j]; /* Init the iteration for Horner's scheme. */ - w->v = (c[0].v * x.v) + c[1].v; + w->v = vec_fma(c[0].v, x.v, c[1].v); dw_dx->v = c[0].v; /* And we're off! */ for (int k = 2; k <= kernel_degree; k++) { - dw_dx->v = (dw_dx->v * x.v) + w->v; - w->v = (x.v * w->v) + c[k].v; + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + w->v = vec_fma(x.v, w->v, c[k].v); } /* Return everything */ - w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; - dw_dx->v = - dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v; + w->v = + vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); } +#endif + +#ifdef WITH_VECTORIZATION + +static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv); + +static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals); + +static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant); + +static const vector kernel_gamma_inv_dim_vec = + FILL_VEC((float)kernel_gamma_inv_dim); + +static const vector kernel_gamma_inv_dim_plus_one_vec = + FILL_VEC((float)kernel_gamma_inv_dim_plus_one); /* Define constant vectors for the Wendland C2 and Cubic Spline kernel * coefficients. */ @@ -468,15 +472,15 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( w->v = vec_fma(x.v, w->v, wendland_const_c5.v); #elif defined(CUBIC_SPLINE_KERNEL) vector w2, dw_dx2; - vector mask_reg1, mask_reg2; + mask_t mask_reg; - /* Form a mask for each part of the kernel. */ - mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ - mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ - ; + /* Form a mask for one part of the kernel. */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together - * using masks. */ + * using a mask. */ /* Init the iteration for Horner's scheme. */ w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v); @@ -495,20 +499,17 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v); w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v); - /* Mask out unneeded values. */ - w->v = vec_and(w->v, mask_reg1.v); - w2.v = vec_and(w2.v, mask_reg2.v); - dw_dx->v = vec_and(dw_dx->v, mask_reg1.v); - dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v); - - /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v, w2.v); - dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); + /* Blend both kernel regions into one vector (mask out unneeded values). */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + w->v = vec_blend(mask_reg, w->v, w2.v); + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v); + #else -#error +#error "Vectorisation not supported for this kernel!!!" #endif - /* Return everything */ + /* Return everyting */ w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, @@ -580,15 +581,13 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( #elif defined(CUBIC_SPLINE_KERNEL) vector w_2, dw_dx_2; vector w2_2, dw_dx2_2; - vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2; + mask_t mask_reg, mask_reg_v2; - /* Form a mask for each part of the kernel. */ - mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ - mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */ - mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ - ; - mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */ - ; + /* Form a mask for one part of the kernel for each vector. */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ + vec_create_mask(mask_reg_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together * using masks. */ @@ -622,29 +621,23 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c3.v); w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v); - /* Mask out unneeded values. */ - w->v = vec_and(w->v, mask_reg1.v); - w2->v = vec_and(w2->v, mask_reg1_v2.v); - w_2.v = vec_and(w_2.v, mask_reg2.v); - w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v); - dw_dx->v = vec_and(dw_dx->v, mask_reg1.v); - dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v); - dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v); - dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v); - - /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v, w_2.v); - w2->v = vec_add(w2->v, w2_2.v); - dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v); - dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v); + /* Blend both kernel regions into one vector (mask out unneeded values). */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + w->v = vec_blend(mask_reg, w->v, w_2.v); + w2->v = vec_blend(mask_reg_v2, w2->v, w2_2.v); + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx_2.v); + dw_dx2->v = vec_blend(mask_reg_v2, dw_dx2->v, dw_dx2_2.v); /* Return everything */ - w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; - w2->v = w2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; - dw_dx->v = - dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v; - dw_dx2->v = - dw_dx2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v; + w->v = + vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + w2->v = vec_mul(w2->v, + vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); + dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); #endif } @@ -676,12 +669,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, w->v = vec_fma(x.v, w->v, wendland_const_c5.v); #elif defined(CUBIC_SPLINE_KERNEL) vector w2; - vector mask1, mask2; + mask_t mask_reg; /* Form a mask for each part of the kernel. */ - mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ - mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ - ; + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together * using masks. */ @@ -698,13 +691,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v); /* Mask out unneeded values. */ - w->v = vec_and(w->v, mask1.v); - w2.v = vec_and(w2.v, mask2.v); + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + w->v = vec_blend(mask_reg, w->v, w2.v); - /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v, w2.v); #else -#error +#error "Vectorisation not supported for this kernel!!!" #endif /* Return everything */ @@ -741,12 +733,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec( #elif defined(CUBIC_SPLINE_KERNEL) vector dw_dx2; - vector mask1, mask2; + mask_t mask_reg; /* Form a mask for each part of the kernel. */ - mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ - mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ - ; + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together * using masks. */ @@ -760,15 +752,106 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec( dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v); /* Mask out unneeded values. */ - dw_dx->v = vec_and(dw_dx->v, mask1.v); - dw_dx2.v = vec_and(dw_dx2.v, mask2.v); + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v); + +#else +#error "Vectorisation not supported for this kernel!!!" +#endif + + /* Mask out result for particles that lie outside of the kernel function. */ + mask_t mask; + vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ + + dw_dx->v = vec_and_mask(dw_dx->v, mask); + + /* Return everything */ + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); +} + +/** + * @brief Computes the kernel function derivative for two particles + * using interleaved vectors. + * + * Return 0 if $u > \\gamma = H/h$ + * + * @param u The ratio of the distance to the smoothing length $u = x/h$. + * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$. + * @param u_2 The ratio of the distance to the smoothing length $u = x/h$ for + * second particle. + * @param dw_dx_2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for + * second particle. + */ +__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( + vector *u, vector *dw_dx, vector *u_2, vector *dw_dx_2) { + + /* Go to the range [0,1[ from [0,H[ */ + vector x, x_2; + x.v = vec_mul(u->v, kernel_gamma_inv_vec.v); + x_2.v = vec_mul(u_2->v, kernel_gamma_inv_vec.v); + +#ifdef WENDLAND_C2_KERNEL + /* Init the iteration for Horner's scheme. */ + dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v); + dw_dx_2->v = + vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v); + + /* Calculate the polynomial interleaving vector operations */ + dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v); + dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c2.v); + + dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v); + dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c3.v); + + dw_dx->v = vec_mul(dw_dx->v, x.v); + dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); + +#elif defined(CUBIC_SPLINE_KERNEL) + vector dw_dx2, dw_dx2_2; + mask_t mask_reg; + mask_t mask_reg_v2; + + /* Form a mask for one part of the kernel. */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ + vec_create_mask(mask_reg_v2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */ + + /* Work out w for both regions of the kernel and combine the results together + * using masks. */ + + /* Init the iteration for Horner's scheme. */ + dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v); + dw_dx_2->v = vec_fma(cubic_1_dwdx_const_c0.v, x_2.v, cubic_1_dwdx_const_c1.v); + dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v); + dw_dx2_2.v = vec_fma(cubic_2_dwdx_const_c0.v, x_2.v, cubic_2_dwdx_const_c1.v); + + /* Calculate the polynomial interleaving vector operations. */ + dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */ + dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); /* cubic_1_dwdx_const_c2 is zero. */ + dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v); + dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v); + + /* Mask out unneeded values. */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v); + dw_dx_2->v = vec_blend(mask_reg_v2, dw_dx_2->v, dw_dx2_2.v); - /* Added both dwdx and dwdx2 together to form complete result. */ - dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); #else -#error +#error "Vectorisation not supported for this kernel!!!" #endif + /* Mask out result for particles that lie outside of the kernel function. */ + mask_t mask, mask_2; + vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ + vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f))); /* x < 1 */ + + dw_dx->v = vec_and_mask(dw_dx->v, mask); + dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_2); + /* Return everything */ dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); diff --git a/src/multipole.h b/src/multipole.h index 3932b265de2dc7416a96d0a02695fe5973b6fb4e..004757924cccb6bc2f450c19f1ccd600f50e1990 100644 --- a/src/multipole.h +++ b/src/multipole.h @@ -2647,31 +2647,17 @@ INLINE static void gravity_L2P(const struct grav_tensor *lb, * @param ma The #multipole of the first #cell. * @param mb The #multipole of the second #cell. * @param theta_crit_inv The inverse of the critical opening angle. - * @param periodic Are we using periodic boundary conditions ? - * @param dim The dimensions of the box. + * @param r2 Square of the distance (periodically wrapped) between the + * multipoles. */ __attribute__((always_inline)) INLINE static int gravity_multipole_accept_rebuild(const struct gravity_tensors *const ma, const struct gravity_tensors *const mb, - double theta_crit_inv, int periodic, - const double dim[3]) { + double theta_crit_inv, double r2) { const double r_crit_a = ma->r_max_rebuild * theta_crit_inv; const double r_crit_b = mb->r_max_rebuild * theta_crit_inv; - double dx = ma->CoM_rebuild[0] - mb->CoM_rebuild[0]; - double dy = ma->CoM_rebuild[1] - mb->CoM_rebuild[1]; - double dz = ma->CoM_rebuild[2] - mb->CoM_rebuild[2]; - - /* Apply BC */ - if (periodic) { - dx = nearest(dx, dim[0]); - dy = nearest(dy, dim[1]); - dz = nearest(dz, dim[2]); - } - - const double r2 = dx * dx + dy * dy + dz * dz; - // MATTHIEU: Make this mass-dependent ? /* Multipole acceptance criterion (Dehnen 2002, eq.10) */ @@ -2688,30 +2674,16 @@ gravity_multipole_accept_rebuild(const struct gravity_tensors *const ma, * @param ma The #multipole of the first #cell. * @param mb The #multipole of the second #cell. * @param theta_crit_inv The inverse of the critical opening angle. - * @param periodic Are we using periodic boundary conditions ? - * @param dim The dimensions of the box. + * @param r2 Square of the distance (periodically wrapped) between the + * multipoles. */ __attribute__((always_inline)) INLINE static int gravity_multipole_accept( const struct gravity_tensors *const ma, - const struct gravity_tensors *const mb, double theta_crit_inv, int periodic, - const double dim[3]) { + const struct gravity_tensors *const mb, double theta_crit_inv, double r2) { const double r_crit_a = ma->r_max * theta_crit_inv; const double r_crit_b = mb->r_max * theta_crit_inv; - double dx = ma->CoM[0] - mb->CoM[0]; - double dy = ma->CoM[1] - mb->CoM[1]; - double dz = ma->CoM[2] - mb->CoM[2]; - - /* Apply BC */ - if (periodic) { - dx = nearest(dx, dim[0]); - dy = nearest(dy, dim[1]); - dz = nearest(dz, dim[2]); - } - - const double r2 = dx * dx + dy * dy + dz * dz; - // MATTHIEU: Make this mass-dependent ? /* Multipole acceptance criterion (Dehnen 2002, eq.10) */ diff --git a/src/parser.c b/src/parser.c index 02970c0526072013c14ee27c4486c8915ec67b25..0b608b29263342240af68fd99d2fdd3241e2a1e6 100644 --- a/src/parser.c +++ b/src/parser.c @@ -133,6 +133,8 @@ void parser_set_param(struct swift_params *params, const char *namevalue) { int updated = 0; for (int i = 0; i < params->paramCount; i++) { if (strcmp(name, params->data[i].name) == 0) { + message("Value of '%s' changed from '%s' to '%s'", params->data[i].name, + params->data[i].value, value); strcpy(params->data[i].value, value); updated = 1; } diff --git a/src/runner.h b/src/runner.h index facadf1608fb7e06af952eedbf1151fa68530bef..e33a3e380e6097a67258d116d617483caca35086 100644 --- a/src/runner.h +++ b/src/runner.h @@ -28,6 +28,7 @@ /* Includes. */ #include "cache.h" +#include "gravity_cache.h" struct cell; struct engine; @@ -49,7 +50,14 @@ struct runner { /*! The engine owing this runner. */ struct engine *e; + /*! The particle gravity_cache of cell ci. */ + struct gravity_cache ci_gravity_cache; + + /*! The particle gravity_cache of cell cj. */ + struct gravity_cache cj_gravity_cache; + #ifdef WITH_VECTORIZATION + /*! The particle cache of cell ci. */ struct cache ci_cache; diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 47ebba61d9a77486448ba2cdb25c9c0a448b9bac..0dd6b2cdfd430e471fc13087afb0b2fcde947b97 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -1037,8 +1037,8 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, struct part *restrict pj = &parts_j[sort_j[pjd].i]; if (!part_is_active(pj, e)) continue; const float hj = pj->h; - const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; - if (dj > di_max) continue; + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift; + if (dj - rshift > di_max) continue; double pjx[3]; for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; @@ -1449,8 +1449,8 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i]; const float hj = pj->h; - const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; - if (dj > di_max) continue; + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift; + if (dj - rshift > di_max) continue; double pjx[3]; for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h index a5b803ae62ad199dce7168aa9b9fb270d1e7042c..01ea6a073211a08430e77721f4c2e60ef7adfd04 100644 --- a/src/runner_doiact_grav.h +++ b/src/runner_doiact_grav.h @@ -162,6 +162,235 @@ void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci, void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci, struct cell *cj, double shift[3]) { + /* Some constants */ + const struct engine *const e = r->e; + struct gravity_cache *const ci_cache = &r->ci_gravity_cache; + struct gravity_cache *const cj_cache = &r->cj_gravity_cache; + + /* Cell properties */ + const int gcount_i = ci->gcount; + const int gcount_j = cj->gcount; + struct gpart *restrict gparts_i = ci->gparts; + struct gpart *restrict gparts_j = cj->gparts; + const int ci_active = cell_is_active(ci, e); + const int cj_active = cell_is_active(cj, e); + const double loc_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]}; + const double loc_j[3] = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const double loc_mean[3] = {0.5 * (loc_i[0] + loc_j[0]), + 0.5 * (loc_i[1] + loc_j[1]), + 0.5 * (loc_i[2] + loc_j[2])}; + + /* Anything to do here ?*/ + if (!ci_active && !cj_active) return; + + /* Check that we fit in cache */ + if (gcount_i > ci_cache->count || gcount_j > cj_cache->count) + error("Not enough space in the caches! gcount_i=%d gcount_j=%d", gcount_i, + gcount_j); + + /* Computed the padded counts */ + const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE; + const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE; + + /* Fill the caches */ + gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i, + loc_mean); + gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j, + loc_mean); + + /* Ok... Here we go ! */ + + if (ci_active) { + + /* Loop over all particles in ci... */ + for (int pid = 0; pid < gcount_i; pid++) { + + /* Skip inactive particles */ + if (!gpart_is_active(&gparts_i[pid], e)) continue; + + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + + /* Some powers of the softening length */ + const float h_i = ci_cache->epsilon[pid]; + const float h2_i = h_i * h_i; + const float h_inv_i = 1.f / h_i; + const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_j, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pjd = 0; pjd < gcount_padded_j; pjd++) { + + /* Get info about j */ + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + const float mass_j = cj_cache->m[pjd]; + + /* Compute the pairwise (square) distance. */ + const float dx = x_i - x_j; + const float dy = y_i - y_j; + const float dz = z_i - z_j; + const float r2 = dx * dx + dy * dy + dz * dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + + float f_ij, W_ij; + + if (r2 >= h2_i) { + + /* Get Newtonian gravity */ + f_ij = mass_j * r_inv * r_inv * r_inv; + + } else { + + const float r = r2 * r_inv; + const float ui = r * h_inv_i; + + kernel_grav_eval(ui, &W_ij); + + /* Get softened gravity */ + f_ij = mass_j * h_inv3_i * W_ij; + } + + /* Store it back */ + a_x -= f_ij * dx; + a_y -= f_ij * dy; + a_z -= f_ij * dz; + +#ifdef SWIFT_DEBUG_CHECKS + /* Update the interaction counter if it's not a padded gpart */ + if (pjd < gcount_j) gparts_i[pid].num_interacted++; +#endif + } + + /* Store everything back in cache */ + ci_cache->a_x[pid] = a_x; + ci_cache->a_y[pid] = a_y; + ci_cache->a_z[pid] = a_z; + } + } + + /* Now do the opposite loop */ + if (cj_active) { + + /* Loop over all particles in ci... */ + for (int pjd = 0; pjd < gcount_j; pjd++) { + + /* Skip inactive particles */ + if (!gpart_is_active(&gparts_j[pjd], e)) continue; + + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + + /* Some powers of the softening length */ + const float h_j = cj_cache->epsilon[pjd]; + const float h2_j = h_j * h_j; + const float h_inv_j = 1.f / h_j; + const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_i, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pid = 0; pid < gcount_padded_i; pid++) { + + /* Get info about j */ + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + const float mass_i = ci_cache->m[pid]; + + /* Compute the pairwise (square) distance. */ + const float dx = x_j - x_i; + const float dy = y_j - y_i; + const float dz = z_j - z_i; + const float r2 = dx * dx + dy * dy + dz * dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); + if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + + float f_ji, W_ji; + + if (r2 >= h2_j) { + + /* Get Newtonian gravity */ + f_ji = mass_i * r_inv * r_inv * r_inv; + + } else { + + const float r = r2 * r_inv; + const float uj = r * h_inv_j; + + kernel_grav_eval(uj, &W_ji); + + /* Get softened gravity */ + f_ji = mass_i * h_inv3_j * W_ji; + } + + /* Store it back */ + a_x -= f_ji * dx; + a_y -= f_ji * dy; + a_z -= f_ji * dz; + +#ifdef SWIFT_DEBUG_CHECKS + /* Update the interaction counter if it's not a padded gpart */ + if (pid < gcount_i) gparts_j[pjd].num_interacted++; +#endif + } + + /* Store everything back in cache */ + cj_cache->a_x[pjd] = a_x; + cj_cache->a_y[pjd] = a_y; + cj_cache->a_z[pjd] = a_z; + } + } + + /* Write back to the particles */ + if (ci_active) gravity_cache_write_back(ci_cache, gparts_i, gcount_i); + if (cj_active) gravity_cache_write_back(cj_cache, gparts_j, gcount_j); + +#ifdef MATTHIEU_OLD_STUFF + /* Some constants */ const struct engine *const e = r->e; @@ -258,6 +487,7 @@ void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci, } } } +#endif } /** @@ -281,6 +511,251 @@ void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci, const double rlr = cell_width * a_smooth; const float rlr_inv = 1. / rlr; + /* Caches to play with */ + struct gravity_cache *const ci_cache = &r->ci_gravity_cache; + struct gravity_cache *const cj_cache = &r->cj_gravity_cache; + + /* Cell properties */ + const int gcount_i = ci->gcount; + const int gcount_j = cj->gcount; + struct gpart *restrict gparts_i = ci->gparts; + struct gpart *restrict gparts_j = cj->gparts; + const int ci_active = cell_is_active(ci, e); + const int cj_active = cell_is_active(cj, e); + const double loc_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]}; + const double loc_j[3] = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const double loc_mean[3] = {0.5 * (loc_i[0] + loc_j[0]), + 0.5 * (loc_i[1] + loc_j[1]), + 0.5 * (loc_i[2] + loc_j[2])}; + + /* Anything to do here ?*/ + if (!ci_active && !cj_active) return; + + /* Check that we fit in cache */ + if (gcount_i > ci_cache->count || gcount_j > cj_cache->count) + error("Not enough space in the caches! gcount_i=%d gcount_j=%d", gcount_i, + gcount_j); + + /* Computed the padded counts */ + const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE; + const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE; + + /* Fill the caches */ + gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i, + loc_mean); + gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j, + loc_mean); + + /* Ok... Here we go ! */ + + if (ci_active) { + + /* Loop over all particles in ci... */ + for (int pid = 0; pid < gcount_i; pid++) { + + /* Skip inactive particles */ + if (!gpart_is_active(&gparts_i[pid], e)) continue; + + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + + /* Some powers of the softening length */ + const float h_i = ci_cache->epsilon[pid]; + const float h2_i = h_i * h_i; + const float h_inv_i = 1.f / h_i; + const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_j, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pjd = 0; pjd < gcount_padded_j; pjd++) { + + /* Get info about j */ + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + const float mass_j = cj_cache->m[pjd]; + + /* Compute the pairwise (square) distance. */ + const float dx = x_i - x_j; + const float dy = y_i - y_j; + const float dz = z_i - z_j; + const float r2 = dx * dx + dy * dy + dz * dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + const float r = r2 * r_inv; + + float f_ij, W_ij, corr_lr; + + if (r2 >= h2_i) { + + /* Get Newtonian gravity */ + f_ij = mass_j * r_inv * r_inv * r_inv; + + } else { + + const float ui = r * h_inv_i; + + kernel_grav_eval(ui, &W_ij); + + /* Get softened gravity */ + f_ij = mass_j * h_inv3_i * W_ij; + } + + /* Get long-range correction */ + const float u_lr = r * rlr_inv; + kernel_long_grav_eval(u_lr, &corr_lr); + f_ij *= corr_lr; + + /* Store it back */ + a_x -= f_ij * dx; + a_y -= f_ij * dy; + a_z -= f_ij * dz; + +#ifdef SWIFT_DEBUG_CHECKS + /* Update the interaction counter if it's not a padded gpart */ + if (pjd < gcount_j) gparts_i[pid].num_interacted++; +#endif + } + + /* Store everything back in cache */ + ci_cache->a_x[pid] = a_x; + ci_cache->a_y[pid] = a_y; + ci_cache->a_z[pid] = a_z; + } + } + + /* Now do the opposite loop */ + if (cj_active) { + + /* Loop over all particles in ci... */ + for (int pjd = 0; pjd < gcount_j; pjd++) { + + /* Skip inactive particles */ + if (!gpart_is_active(&gparts_j[pjd], e)) continue; + + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + + /* Some powers of the softening length */ + const float h_j = cj_cache->epsilon[pjd]; + const float h2_j = h_j * h_j; + const float h_inv_j = 1.f / h_j; + const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_i, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pid = 0; pid < gcount_padded_i; pid++) { + + /* Get info about j */ + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + const float mass_i = ci_cache->m[pid]; + + /* Compute the pairwise (square) distance. */ + const float dx = x_j - x_i; + const float dy = y_j - y_i; + const float dz = z_j - z_i; + const float r2 = dx * dx + dy * dy + dz * dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); + if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + const float r = r2 * r_inv; + + float f_ji, W_ji, corr_lr; + + if (r2 >= h2_j) { + + /* Get Newtonian gravity */ + f_ji = mass_i * r_inv * r_inv * r_inv; + + } else { + + const float uj = r * h_inv_j; + + kernel_grav_eval(uj, &W_ji); + + /* Get softened gravity */ + f_ji = mass_i * h_inv3_j * W_ji; + } + + /* Get long-range correction */ + const float u_lr = r * rlr_inv; + kernel_long_grav_eval(u_lr, &corr_lr); + f_ji *= corr_lr; + + /* Store it back */ + a_x -= f_ji * dx; + a_y -= f_ji * dy; + a_z -= f_ji * dz; + +#ifdef SWIFT_DEBUG_CHECKS + /* Update the interaction counter if it's not a padded gpart */ + if (pid < gcount_i) gparts_j[pjd].num_interacted++; +#endif + } + + /* Store everything back in cache */ + cj_cache->a_x[pjd] = a_x; + cj_cache->a_y[pjd] = a_y; + cj_cache->a_z[pjd] = a_z; + } + } + + /* Write back to the particles */ + if (ci_active) gravity_cache_write_back(ci_cache, gparts_i, gcount_i); + if (cj_active) gravity_cache_write_back(cj_cache, gparts_j, gcount_j); + +#ifdef MATTHIEU_OLD_STUFF + /* Some constants */ + const struct engine *const e = r->e; + const struct space *s = e->s; + const double cell_width = s->width[0]; + const double a_smooth = e->gravity_properties->a_smooth; + const double rlr = cell_width * a_smooth; + const float rlr_inv = 1. / rlr; + /* Cell properties */ const int gcount_i = ci->gcount; const int gcount_j = cj->gcount; @@ -374,6 +849,8 @@ void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci, } } } + +#endif } /** @@ -442,6 +919,129 @@ void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) { */ void runner_doself_grav_pp_full(struct runner *r, struct cell *c) { + /* Some constants */ + const struct engine *const e = r->e; + struct gravity_cache *const ci_cache = &r->ci_gravity_cache; + + /* Cell properties */ + const int gcount = c->gcount; + struct gpart *restrict gparts = c->gparts; + const int c_active = cell_is_active(c, e); + const double loc[3] = {c->loc[0] + 0.5 * c->width[0], + c->loc[1] + 0.5 * c->width[1], + c->loc[2] + 0.5 * c->width[2]}; + + /* Anything to do here ?*/ + if (!c_active) return; + + /* Check that we fit in cache */ + if (gcount > ci_cache->count) + error("Not enough space in the cache! gcount=%d", gcount); + + /* Computed the padded counts */ + const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE; + + gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc); + + /* Ok... Here we go ! */ + + /* Loop over all particles in ci... */ + for (int pid = 0; pid < gcount; pid++) { + + /* Skip inactive particles */ + if (!gpart_is_active(&gparts[pid], e)) continue; + + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + + /* Some powers of the softening length */ + const float h_i = ci_cache->epsilon[pid]; + const float h2_i = h_i * h_i; + const float h_inv_i = 1.f / h_i; + const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded, VEC_SIZE); + + /* Loop over every other particle in the cell. */ + for (int pjd = 0; pjd < gcount_padded; pjd++) { + + /* No self interaction */ + if (pid == pjd) continue; + + /* Get info about j */ + const float x_j = ci_cache->x[pjd]; + const float y_j = ci_cache->y[pjd]; + const float z_j = ci_cache->z[pjd]; + const float mass_j = ci_cache->m[pjd]; + + /* Compute the pairwise (square) distance. */ + const float dx = x_i - x_j; + const float dy = y_i - y_j; + const float dz = z_i - z_j; + const float r2 = dx * dx + dy * dy + dz * dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (pjd < gcount && gparts[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + + float f_ij, W_ij; + + if (r2 >= h2_i) { + + /* Get Newtonian gravity */ + f_ij = mass_j * r_inv * r_inv * r_inv; + + } else { + + const float r = r2 * r_inv; + const float ui = r * h_inv_i; + + kernel_grav_eval(ui, &W_ij); + + /* Get softened gravity */ + f_ij = mass_j * h_inv3_i * W_ij; + } + + /* Store it back */ + a_x -= f_ij * dx; + a_y -= f_ij * dy; + a_z -= f_ij * dz; + +#ifdef SWIFT_DEBUG_CHECKS + /* Update the interaction counter if it's not a padded gpart */ + if (pjd < gcount) gparts[pid].num_interacted++; +#endif + } + + /* Store everything back in cache */ + ci_cache->a_x[pid] = a_x; + ci_cache->a_y[pid] = a_y; + ci_cache->a_z[pid] = a_z; + } + + /* Write back to the particles */ + gravity_cache_write_back(ci_cache, gparts, gcount); + +#ifdef MATTHIEU_OLD_STUFF + /* Some constants */ const struct engine *const e = r->e; @@ -511,6 +1111,8 @@ void runner_doself_grav_pp_full(struct runner *r, struct cell *c) { } } } + +#endif } /** @@ -532,6 +1134,140 @@ void runner_doself_grav_pp_truncated(struct runner *r, struct cell *c) { const double rlr = cell_width * a_smooth; const float rlr_inv = 1. / rlr; + /* Caches to play with */ + struct gravity_cache *const ci_cache = &r->ci_gravity_cache; + + /* Cell properties */ + const int gcount = c->gcount; + struct gpart *restrict gparts = c->gparts; + const int c_active = cell_is_active(c, e); + const double loc[3] = {c->loc[0] + 0.5 * c->width[0], + c->loc[1] + 0.5 * c->width[1], + c->loc[2] + 0.5 * c->width[2]}; + + /* Anything to do here ?*/ + if (!c_active) return; + + /* Check that we fit in cache */ + if (gcount > ci_cache->count) + error("Not enough space in the caches! gcount=%d", gcount); + + /* Computed the padded counts */ + const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE; + + gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc); + + /* Ok... Here we go ! */ + + /* Loop over all particles in ci... */ + for (int pid = 0; pid < gcount; pid++) { + + /* Skip inactive particles */ + if (!gpart_is_active(&gparts[pid], e)) continue; + + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + + /* Some powers of the softening length */ + const float h_i = ci_cache->epsilon[pid]; + const float h2_i = h_i * h_i; + const float h_inv_i = 1.f / h_i; + const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded, VEC_SIZE); + + /* Loop over every other particle in the cell. */ + for (int pjd = 0; pjd < gcount_padded; pjd++) { + + /* No self interaction */ + if (pid == pjd) continue; + + /* Get info about j */ + const float x_j = ci_cache->x[pjd]; + const float y_j = ci_cache->y[pjd]; + const float z_j = ci_cache->z[pjd]; + const float mass_j = ci_cache->m[pjd]; + + /* Compute the pairwise (square) distance. */ + const float dx = x_i - x_j; + const float dy = y_i - y_j; + const float dz = z_i - z_j; + const float r2 = dx * dx + dy * dy + dz * dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (pjd < gcount && gparts[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + const float r = r2 * r_inv; + + float f_ij, W_ij, corr_lr; + + if (r2 >= h2_i) { + + /* Get Newtonian gravity */ + f_ij = mass_j * r_inv * r_inv * r_inv; + + } else { + + const float ui = r * h_inv_i; + + kernel_grav_eval(ui, &W_ij); + + /* Get softened gravity */ + f_ij = mass_j * h_inv3_i * W_ij; + } + + /* Get long-range correction */ + const float u_lr = r * rlr_inv; + kernel_long_grav_eval(u_lr, &corr_lr); + f_ij *= corr_lr; + + /* Store it back */ + a_x -= f_ij * dx; + a_y -= f_ij * dy; + a_z -= f_ij * dz; + +#ifdef SWIFT_DEBUG_CHECKS + /* Update the interaction counter if it's not a padded gpart */ + if (pjd < gcount) gparts[pid].num_interacted++; +#endif + } + + /* Store everything back in cache */ + ci_cache->a_x[pid] = a_x; + ci_cache->a_y[pid] = a_y; + ci_cache->a_z[pid] = a_z; + } + + /* Write back to the particles */ + gravity_cache_write_back(ci_cache, gparts, gcount); + +#ifdef MATTHIEU_OLD_STUFF + /* Some constants */ + const struct engine *const e = r->e; + const struct space *s = e->s; + const double cell_width = s->width[0]; + const double a_smooth = e->gravity_properties->a_smooth; + const double rlr = cell_width * a_smooth; + const float rlr_inv = 1. / rlr; + /* Cell properties */ const int gcount = c->gcount; struct gpart *restrict gparts = c->gparts; @@ -598,6 +1334,7 @@ void runner_doself_grav_pp_truncated(struct runner *r, struct cell *c) { } } } +#endif } /** @@ -666,9 +1403,12 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj, const struct engine *e = r->e; const struct space *s = e->s; const int periodic = s->periodic; + const double cell_width = s->width[0]; const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; const struct gravity_props *props = e->gravity_properties; const double theta_crit_inv = props->theta_crit_inv; + const double max_distance = props->a_smooth * props->r_cut_max * cell_width; + const double max_distance2 = max_distance * max_distance; #ifdef SWIFT_DEBUG_CHECKS @@ -688,38 +1428,46 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj, error("cj->multipole not drifted."); #endif -#if ICHECK > 0 - for (int pid = 0; pid < ci->gcount; pid++) { - - /* Get a hold of the ith part in ci. */ - struct gpart *restrict gp = &ci->gparts[pid]; + TIMER_TIC; - if (gp->id_or_neg_offset == ICHECK) - message("id=%lld loc=[ %f %f %f ] size= %f count= %d", - gp->id_or_neg_offset, cj->loc[0], cj->loc[1], cj->loc[2], - cj->width[0], cj->gcount); - } + /* Anything to do here? */ + if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; - for (int pid = 0; pid < cj->gcount; pid++) { + /* Recover the multipole information */ + struct gravity_tensors *const multi_i = ci->multipole; + struct gravity_tensors *const multi_j = cj->multipole; - /* Get a hold of the ith part in ci. */ - struct gpart *restrict gp = &cj->gparts[pid]; + /* Get the distance between the CoMs */ + double dx = multi_i->CoM[0] - multi_j->CoM[0]; + double dy = multi_i->CoM[1] - multi_j->CoM[1]; + double dz = multi_i->CoM[2] - multi_j->CoM[2]; - if (gp->id_or_neg_offset == ICHECK) - message("id=%lld loc=[ %f %f %f ] size= %f count= %d", - gp->id_or_neg_offset, ci->loc[0], ci->loc[1], ci->loc[2], - ci->width[0], ci->gcount); + /* Apply BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); } -#endif + const double r2 = dx * dx + dy * dy + dz * dz; - /* Anything to do here? */ - if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; + /* Are we beyond the distance where the truncated forces are 0? */ + if (periodic && r2 > max_distance2) { - TIMER_TIC; +#ifdef SWIFT_DEBUG_CHECKS + /* Need to account for the interactions we missed */ + if (cell_is_active(ci, e)) + multi_i->pot.num_interacted += multi_j->m_pole.num_gpart; + if (cell_is_active(cj, e)) + multi_j->pot.num_interacted += multi_i->m_pole.num_gpart; +#endif + return; + } + + /* OK, we actually need to compute this pair. Let's find the cheapest + * option... */ /* Can we use M-M interactions ? */ - if (gravity_multipole_accept(ci->multipole, cj->multipole, theta_crit_inv, - periodic, dim)) { + if (gravity_multipole_accept(multi_i, multi_j, theta_crit_inv, r2)) { /* MATTHIEU: make a symmetric M-M interaction function ! */ runner_dopair_grav_mm(r, ci, cj); @@ -732,8 +1480,8 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj, /* Alright, we'll have to split and recurse. */ else { - const double ri_max = ci->multipole->r_max; - const double rj_max = cj->multipole->r_max; + const double ri_max = multi_i->r_max; + const double rj_max = multi_j->r_max; /* Split the larger of the two cells and start over again */ if (ri_max > rj_max) { @@ -887,8 +1635,6 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, int timer) { const double theta_crit_inv = props->theta_crit_inv; const double max_distance = props->a_smooth * props->r_cut_max * cell_width; const double max_distance2 = max_distance * max_distance; - struct gravity_tensors *const mi = ci->multipole; - const double CoM[3] = {mi->CoM[0], mi->CoM[1], mi->CoM[2]}; TIMER_TIC; @@ -903,51 +1649,80 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, int timer) { if (ci->ti_old_multipole != e->ti_current) error("Interacting un-drifted multipole"); + /* Recover the local multipole */ + struct gravity_tensors *const multi_i = ci->multipole; + const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]}; + const double CoM_rebuild_i[3] = {multi_i->CoM_rebuild[0], + multi_i->CoM_rebuild[1], + multi_i->CoM_rebuild[2]}; + /* Loop over all the top-level cells and go for a M-M interaction if * well-separated */ for (int i = 0; i < nr_cells; ++i) { /* Handle on the top-level cell and it's gravity business*/ struct cell *cj = &cells[i]; - const struct gravity_tensors *const mj = cj->multipole; + const struct gravity_tensors *const multi_j = cj->multipole; /* Avoid stupid cases */ if (ci == cj || cj->gcount == 0) continue; - /* Is this interaction part of the periodic mesh calculation ?*/ - if (periodic) { + /* Get the distance between the CoMs */ + double dx = CoM_i[0] - multi_j->CoM[0]; + double dy = CoM_i[1] - multi_j->CoM[1]; + double dz = CoM_i[2] - multi_j->CoM[2]; - const double dx = nearest(CoM[0] - mj->CoM[0], dim[0]); - const double dy = nearest(CoM[1] - mj->CoM[1], dim[1]); - const double dz = nearest(CoM[2] - mj->CoM[2], dim[2]); - const double r2 = dx * dx + dy * dy + dz * dz; + /* Apply BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); + } + const double r2 = dx * dx + dy * dy + dz * dz; - /* Are we beyond the distance where the truncated forces are 0 ?*/ - if (r2 > max_distance2) { + /* Are we beyond the distance where the truncated forces are 0 ?*/ + if (periodic && r2 > max_distance2) { #ifdef SWIFT_DEBUG_CHECKS - /* Need to account for the interactions we missed */ - mi->pot.num_interacted += mj->m_pole.num_gpart; + /* Need to account for the interactions we missed */ + multi_i->pot.num_interacted += multi_j->m_pole.num_gpart; #endif - continue; - } + continue; } /* Check the multipole acceptance criterion */ - if (gravity_multipole_accept(mi, mj, theta_crit_inv, periodic, dim)) { + if (gravity_multipole_accept(multi_i, multi_j, theta_crit_inv, r2)) { /* Go for a (non-symmetric) M-M calculation */ runner_dopair_grav_mm(r, ci, cj); - } - /* Is the criterion violated now but was OK at the last rebuild ? */ - else if (gravity_multipole_accept_rebuild(mi, mj, theta_crit_inv, periodic, - dim)) { - /* Alright, we have to take charge of that pair in a different way. */ - // MATTHIEU: We should actually open the tree-node here and recurse. - runner_dopair_grav_mm(r, ci, cj); + } else { + + /* Let's check whether we need to still operate on this pair */ + + /* Get the distance between the CoMs at the last rebuild*/ + double dx = CoM_rebuild_i[0] - multi_j->CoM_rebuild[0]; + double dy = CoM_rebuild_i[1] - multi_j->CoM_rebuild[1]; + double dz = CoM_rebuild_i[2] - multi_j->CoM_rebuild[2]; + + /* Apply BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); + } + const double r2_rebuild = dx * dx + dy * dy + dz * dz; + + /* Is the criterion violated now but was OK at the last rebuild ? */ + if (gravity_multipole_accept_rebuild(multi_i, multi_j, theta_crit_inv, + r2_rebuild)) { + + /* Alright, we have to take charge of that pair in a different way. */ + // MATTHIEU: We should actually open the tree-node here and recurse. + runner_dopair_grav_mm(r, ci, cj); + } } - } + } /* Loop over top-level cells */ if (timer) TIMER_TOC(timer_dograv_long_range); } diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 5b22a67bbf757e09f144ba3accc2ef5adfb706f0..1557c1ec84863c3518f8671cdf0e9ca44732e94e 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -63,10 +63,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz, int *icount_align) { -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask, knl_mask2; -#endif - vector int_mask, int_mask2; + mask_t int_mask, int_mask2; /* Work out the number of remainder interactions and pad secondary cache. */ *icount_align = icount; @@ -75,16 +72,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( int pad = (NUM_VEC_PROC * VEC_SIZE) - rem; *icount_align += pad; -/* Initialise masks to true. */ -#ifdef HAVE_AVX512_F - knl_mask = 0xFFFF; - knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#endif + /* Initialise masks to true. */ + vec_init_mask_true(int_mask); + vec_init_mask_true(int_mask2); + /* Pad secondary cache so that there are no contributions in the interaction * function. */ for (int i = icount; i < *icount_align; i++) { @@ -100,19 +91,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( /* Zero parts of mask that represent the padded values.*/ if (pad < VEC_SIZE) { -#ifdef HAVE_AVX512_F - knl_mask2 = knl_mask2 >> pad; -#else - for (int i = VEC_SIZE - pad; i < VEC_SIZE; i++) int_mask2.i[i] = 0; -#endif + vec_pad_mask(int_mask2, pad); } else { -#ifdef HAVE_AVX512_F - knl_mask = knl_mask >> (VEC_SIZE - rem); - knl_mask2 = 0; -#else - for (int i = rem; i < VEC_SIZE; i++) int_mask.i[i] = 0; - int_mask2.v = vec_setzero(); -#endif + vec_pad_mask(int_mask, VEC_SIZE - rem); + vec_zero_mask(int_mask2); } /* Perform remainder interaction and remove remainder from aligned @@ -125,12 +107,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align], &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, - int_mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); -#endif + int_mask2, 1); } } @@ -144,10 +121,6 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( * @param v_dx #vector of the x separation between two particles. * @param v_dy #vector of the y separation between two particles. * @param v_dz #vector of the z separation between two particles. - * @param v_mj #vector of the mass of particle pj. - * @param v_vjx #vector of x velocity of pj. - * @param v_vjy #vector of y velocity of pj. - * @param v_vjz #vector of z velocity of pj. * @param cell_cache #cache of all particles in the cell. * @param int_cache (return) secondary #cache of interactions between two * particles. @@ -174,44 +147,33 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( */ __attribute__((always_inline)) INLINE static void storeInteractions( const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, - vector *v_dz, vector *v_mj, vector *v_vjx, vector *v_vjy, vector *v_vjz, - const struct cache *const cell_cache, struct c2_cache *const int_cache, - int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, - vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, - vector *curlvySum, vector *curlvzSum, vector v_hi_inv, vector v_vix, - vector v_viy, vector v_viz) { + vector *v_dz, const struct cache *const cell_cache, + struct c2_cache *const int_cache, int *icount, vector *rhoSum, + vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, + vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, + vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) { /* Left-pack values needed into the secondary cache using the interaction mask. */ #if defined(HAVE_AVX2) || defined(HAVE_AVX512_F) - int pack = 0; - -#ifdef HAVE_AVX512_F - pack += __builtin_popcount(mask); - VEC_LEFT_PACK(v_r2->v, mask, &int_cache->r2q[*icount]); - VEC_LEFT_PACK(v_dx->v, mask, &int_cache->dxq[*icount]); - VEC_LEFT_PACK(v_dy->v, mask, &int_cache->dyq[*icount]); - VEC_LEFT_PACK(v_dz->v, mask, &int_cache->dzq[*icount]); - VEC_LEFT_PACK(v_mj->v, mask, &int_cache->mq[*icount]); - VEC_LEFT_PACK(v_vjx->v, mask, &int_cache->vxq[*icount]); - VEC_LEFT_PACK(v_vjy->v, mask, &int_cache->vyq[*icount]); - VEC_LEFT_PACK(v_vjz->v, mask, &int_cache->vzq[*icount]); -#else - vector v_mask; - VEC_FORM_PACKED_MASK(mask, v_mask.m, pack); - - VEC_LEFT_PACK(v_r2->v, v_mask.m, &int_cache->r2q[*icount]); - VEC_LEFT_PACK(v_dx->v, v_mask.m, &int_cache->dxq[*icount]); - VEC_LEFT_PACK(v_dy->v, v_mask.m, &int_cache->dyq[*icount]); - VEC_LEFT_PACK(v_dz->v, v_mask.m, &int_cache->dzq[*icount]); - VEC_LEFT_PACK(v_mj->v, v_mask.m, &int_cache->mq[*icount]); - VEC_LEFT_PACK(v_vjx->v, v_mask.m, &int_cache->vxq[*icount]); - VEC_LEFT_PACK(v_vjy->v, v_mask.m, &int_cache->vyq[*icount]); - VEC_LEFT_PACK(v_vjz->v, v_mask.m, &int_cache->vzq[*icount]); - -#endif /* HAVE_AVX512_F */ - - (*icount) += pack; + mask_t packed_mask; + VEC_FORM_PACKED_MASK(mask, packed_mask); + + VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]); + VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]); + VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]); + VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask, + &int_cache->mq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask, + &int_cache->vxq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask, + &int_cache->vyq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask, + &int_cache->vzq[*icount]); + + /* Increment interaction count by number of bits set in mask. */ + (*icount) += __builtin_popcount(mask); #else /* Quicker to do it serially in AVX rather than use intrinsics. */ for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) { @@ -242,9 +204,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions( wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, &icount_align); - vector int_mask, int_mask2; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); + mask_t int_mask, int_mask2; + vec_init_mask_true(int_mask); + vec_init_mask_true(int_mask2); /* Perform interactions. */ for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { @@ -253,7 +215,7 @@ __attribute__((always_inline)) INLINE static void storeInteractions( &int_cache->dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[pjd], &int_cache->vyq[pjd], &int_cache->vzq[pjd], &int_cache->mq[pjd], rhoSum, rho_dhSum, wcountSum, wcount_dhSum, - div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0, 0); + div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0); } /* Reset interaction count. */ @@ -262,7 +224,8 @@ __attribute__((always_inline)) INLINE static void storeInteractions( } /** - * @brief Populates the arrays max_di and max_dj with the maximum distances of + * @brief Populates the arrays max_index_i and max_index_j with the maximum + * indices of * particles into their neighbouring cells. Also finds the first pi that * interacts with any particle in cj and the last pj that interacts with any * particle in ci. @@ -277,77 +240,121 @@ __attribute__((always_inline)) INLINE static void storeInteractions( * @param hj_max Maximal smoothing length in cell cj * @param di_max Maximal position on the axis that can interact in cell ci * @param dj_min Minimal position on the axis that can interact in cell ci - * @param max_di array to hold the maximum distances of pi particles into cell + * @param max_index_i array to hold the maximum distances of pi particles into + * cell * cj - * @param max_dj array to hold the maximum distances of pj particles into cell + * @param max_index_j array to hold the maximum distances of pj particles into + * cell * cj * @param init_pi first pi to interact with a pj particle * @param init_pj last pj to interact with a pi particle * @param e The #engine. */ -__attribute__((always_inline)) INLINE static void populate_max_d_no_cache( +__attribute__((always_inline)) INLINE static void populate_max_index_no_cache( const struct cell *ci, const struct cell *cj, const struct entry *restrict sort_i, const struct entry *restrict sort_j, const float dx_max, const float rshift, const double hi_max, const double hj_max, const double di_max, const double dj_min, - float *max_di, float *max_dj, int *init_pi, int *init_pj, + int *max_index_i, int *max_index_j, int *init_pi, int *init_pj, const struct engine *e) { const struct part *restrict parts_i = ci->parts; const struct part *restrict parts_j = cj->parts; int first_pi = 0, last_pj = cj->count - 1; + int temp; + + /* Find the leftmost active particle in cell i that interacts with any + * particle in cell j. */ + first_pi = ci->count; + int active_id = first_pi - 1; + while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min) { + first_pi--; + /* Store the index of the particle if it is active. */ + if (part_is_active(&parts_i[sort_i[first_pi].i], e)) active_id = first_pi; + } - /* Find the first active particle in ci to interact with any particle in cj. - */ - /* Populate max_di with distances. */ - int active_id = ci->count - 1; - for (int k = ci->count - 1; k >= 0; k--) { - const struct part *pi = &parts_i[sort_i[k].i]; - const float d = sort_i[k].d + dx_max; - - // max_di[k] = d + h * kernel_gamma - rshift; - max_di[k] = d + hi_max; - - /* If the particle is out of range set the index to - * the last active particle within range. */ - if (d + hi_max < dj_min) { - first_pi = active_id; - break; - } else { - if (part_is_active(pi, e)) active_id = k; + /* Set the first active pi in range of any particle in cell j. */ + first_pi = active_id; + + /* Find the maximum index into cell j for each particle in range in cell i. */ + if (first_pi < ci->count) { + + /* Start from the first particle in cell j. */ + temp = 0; + + const struct part *pi = &parts_i[sort_i[first_pi].i]; + + /* Loop through particles in cell j until they are not in range of pi. */ + while (temp <= cj->count && + (sort_i[first_pi].d + (pi->h * kernel_gamma + dx_max - rshift) > + sort_j[temp].d)) + temp++; + + max_index_i[first_pi] = temp; + + /* Populate max_index_i for remaining particles that are within range. */ + for (int i = first_pi + 1; i < ci->count; i++) { + temp = max_index_i[i - 1]; + pi = &parts_i[sort_i[i].i]; + + while (temp <= cj->count && + (sort_i[i].d + (pi->h * kernel_gamma + dx_max - rshift) > + sort_j[temp].d)) + temp++; + + max_index_i[i] = temp; } + } else { + /* Make sure that max index is set to first particle in cj.*/ + max_index_i[ci->count - 1] = 0; } - /* Find the maximum distance of pi particles into cj.*/ - for (int k = first_pi + 1; k < ci->count; k++) { - max_di[k] = fmaxf(max_di[k - 1], max_di[k]); + /* Find the rightmost active particle in cell j that interacts with any + * particle in cell i. */ + last_pj = -1; + active_id = last_pj; + while (last_pj < cj->count && + sort_j[last_pj + 1].d - hj_max - dx_max < di_max) { + last_pj++; + /* Store the index of the particle if it is active. */ + if (part_is_active(&parts_j[sort_j[last_pj].i], e)) active_id = last_pj; } - /* Find the last particle in cj to interact with any particle in ci. */ - /* Populate max_dj with distances. */ - active_id = 0; - for (int k = 0; k < cj->count; k++) { - const struct part *pj = &parts_j[sort_j[k].i]; - const float d = sort_j[k].d - dx_max; - - /*TODO: don't think rshift should be taken off here, waiting on Pedro. */ - // max_dj[k] = d - h * kernel_gamma - rshift; - max_dj[k] = d - hj_max; - - /* If the particle is out of range set the index to - * the last active particle within range. */ - if (d - hj_max > di_max) { - last_pj = active_id; - break; - } else { - if (part_is_active(pj, e)) active_id = k; - } - } + /* Set the last active pj in range of any particle in cell i. */ + last_pj = active_id; + + /* Find the maximum index into cell i for each particle in range in cell j. */ + if (last_pj > 0) { + + /* Start from the last particle in cell i. */ + temp = ci->count - 1; + + const struct part *pj = &parts_j[sort_j[last_pj].i]; + + /* Loop through particles in cell i until they are not in range of pj. */ + while (temp > 0 && + sort_j[last_pj].d - dx_max - (pj->h * kernel_gamma) < + sort_i[temp].d - rshift) + temp--; - /* Find the maximum distance of pj particles into ci.*/ - for (int k = 1; k <= last_pj; k++) { - max_dj[k] = fmaxf(max_dj[k - 1], max_dj[k]); + max_index_j[last_pj] = temp; + + /* Populate max_index_j for remaining particles that are within range. */ + for (int i = last_pj - 1; i >= 0; i--) { + temp = max_index_j[i + 1]; + pj = &parts_j[sort_j[i].i]; + + while (temp > 0 && + sort_j[i].d - dx_max - (pj->h * kernel_gamma) < + sort_i[temp].d - rshift) + temp--; + + max_index_j[i] = temp; + } + } else { + /* Make sure that max index is set to last particle in ci.*/ + max_index_j[0] = ci->count - 1; } *init_pi = first_pi; @@ -367,7 +374,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( #ifdef WITH_VECTORIZATION const struct engine *e = r->e; - int doi_mask; struct part *restrict pi; int count_align; int num_vec_proc = NUM_VEC_PROC; @@ -459,9 +465,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( } vector pjx, pjy, pjz; - vector pjvx, pjvy, pjvz, mj; vector pjx2, pjy2, pjz2; - vector pjvx2, pjvy2, pjvz2, mj2; /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ @@ -471,18 +475,10 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( pjx.v = vec_load(&cell_cache->x[pjd]); pjy.v = vec_load(&cell_cache->y[pjd]); pjz.v = vec_load(&cell_cache->z[pjd]); - pjvx.v = vec_load(&cell_cache->vx[pjd]); - pjvy.v = vec_load(&cell_cache->vy[pjd]); - pjvz.v = vec_load(&cell_cache->vz[pjd]); - mj.v = vec_load(&cell_cache->m[pjd]); pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); - pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]); - pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]); - pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]); - mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]); /* Compute the pairwise distance. */ vector v_dx_tmp, v_dy_tmp, v_dz_tmp; @@ -502,53 +498,47 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v); -/* Form a mask from r2 < hig2 and r2 > 0.*/ -#ifdef HAVE_AVX512_F - // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check; - KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check; - - doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero()); - doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v); - - doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero()); - doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v); - - doi_mask = doi_mask & doi_mask_check; - doi_mask2 = doi_mask2 & doi_mask2_check; - -#else - vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check; - int doi_mask2; + /* Form a mask from r2 < hig2 and r2 > 0.*/ + mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2, + v_doi_mask2_self_check; + int doi_mask, doi_mask_self_check, doi_mask2, doi_mask2_self_check; /* Form r2 > 0 mask and r2 < hig2 mask. */ - v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero()); - v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); + vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero())); + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v)); /* Form r2 > 0 mask and r2 < hig2 mask. */ - v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero()); - v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); + vec_create_mask(v_doi_mask2_self_check, + vec_cmp_gt(v_r2_2.v, vec_setzero())); + vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v)); - /* Combine two masks and form integer mask. */ - doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v)); - doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v)); -#endif /* HAVE_AVX512_F */ + /* Form integer masks. */ + doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check); + doi_mask = vec_form_int_mask(v_doi_mask); + + doi_mask2_self_check = vec_form_int_mask(v_doi_mask2_self_check); + doi_mask2 = vec_form_int_mask(v_doi_mask2); + + /* Combine the two masks. */ + doi_mask = doi_mask & doi_mask_self_check; + doi_mask2 = doi_mask2 & doi_mask2_self_check; /* If there are any interactions left pack interaction values into c2 * cache. */ if (doi_mask) { storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, - &mj, &pjvx, &pjvy, &pjvz, cell_cache, &int_cache, + cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum, + &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, + &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, + v_viz); + } + if (doi_mask2) { + storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, + &v_dy_tmp2, &v_dz_tmp2, cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); } - if (doi_mask2) { - storeInteractions( - doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, &v_dy_tmp2, - &v_dz_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2, cell_cache, &int_cache, - &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, - &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); - } } /* Perform padded vector remainder interactions if any are present. */ @@ -559,16 +549,9 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( /* Initialise masks to true in case remainder interactions have been * performed. */ - vector int_mask, int_mask2; -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask = 0xFFFF; - KNL_MASK_16 knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#endif + mask_t int_mask, int_mask2; + vec_init_mask_true(int_mask); + vec_init_mask_true(int_mask2); /* Perform interaction with 2 vectors. */ for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) { @@ -578,11 +561,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); -#endif + 0); } /* Perform horizontal adds on vector sums and store result in particle pi. @@ -714,38 +693,19 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, } int first_pi, last_pj; - float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE))); - float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE))); + int *max_index_i __attribute__((aligned(sizeof(int) * VEC_SIZE))); + int *max_index_j __attribute__((aligned(sizeof(int) * VEC_SIZE))); - max_di = r->ci_cache.max_d; - max_dj = r->cj_cache.max_d; + max_index_i = r->ci_cache.max_index; + max_index_j = r->cj_cache.max_index; - /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ + /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[]. + */ /* Also find the first pi that interacts with any particle in cj and the last * pj that interacts with any particle in ci. */ - populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max, - hj_max, di_max, dj_min, max_di, max_dj, &first_pi, - &last_pj, e); - - /* Find the maximum index into cj that is required by a particle in ci. */ - /* Find the maximum index into ci that is required by a particle in cj. */ - float di, dj; - int max_ind_j = count_j - 1; - int max_ind_i = 0; - - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[count_i - 1] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[0] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } + populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max, + hj_max, di_max, dj_min, max_index_i, max_index_j, + &first_pi, &last_pj, e); /* Limits of the outer loops. */ int first_pi_loop = first_pi; @@ -753,8 +713,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Take the max/min of both values calculated to work out how many particles * to read into the cache. */ - last_pj = max(last_pj, max_ind_j); - first_pi = min(first_pi, max_ind_i); + last_pj = max(last_pj, max_index_i[count_i - 1]); + first_pi = min(first_pi, max_index_j[0]); /* Read the needed particles into the two caches. */ int first_pi_align = first_pi; @@ -769,7 +729,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (cell_is_active(ci, e)) { /* Loop over the parts in ci until nothing is within range in cj. */ - for (int pid = count_i - 1; pid >= first_pi_loop && max_ind_j >= 0; pid--) { + for (int pid = count_i - 1; pid >= first_pi_loop; pid--) { /* Get a hold of the ith part in ci. */ struct part *restrict pi = &parts_i[sort_i[pid].i]; @@ -785,13 +745,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (di_test < dj_min) continue; /* Determine the exit iteration of the interaction loop. */ - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[pid] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - int exit_iteration = max_ind_j + 1; + int exit_iteration = max_index_i[pid]; const float hig2 = hi * hi * kernel_gamma2; @@ -866,14 +820,14 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - vector v_doi_mask; + mask_t v_doi_mask; int doi_mask; /* Form r2 < hig2 mask. */ - v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v)); /* Form integer mask. */ - doi_mask = vec_cmp_result(v_doi_mask.v); + doi_mask = vec_form_int_mask(v_doi_mask); /* If there are any interactions perform them. */ if (doi_mask) @@ -882,12 +836,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doi_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif + &curlvySum, &curlvzSum, v_doi_mask); } /* loop over the parts in cj. */ @@ -908,7 +857,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (cell_is_active(cj, e)) { /* Loop over the parts in cj until nothing is within range in ci. */ - for (int pjd = 0; pjd <= last_pj_loop && max_ind_i < count_i; pjd++) { + for (int pjd = 0; pjd <= last_pj_loop; pjd++) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i]; @@ -925,13 +874,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (dj_test > di_max) continue; /* Determine the exit iteration of the interaction loop. */ - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[pjd] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } - int exit_iteration = max_ind_i; + int exit_iteration = max_index_j[pjd]; const float hjg2 = hj * hj * kernel_gamma2; @@ -1005,14 +948,14 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - vector v_doj_mask; + mask_t v_doj_mask; int doj_mask; /* Form r2 < hig2 mask. */ - v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v); + vec_create_mask(v_doj_mask, vec_cmp_lt(v_r2.v, v_hjg2.v)); /* Form integer mask. */ - doj_mask = vec_cmp_result(v_doj_mask.v); + doj_mask = vec_form_int_mask(v_doj_mask); /* If there are any interactions perform them. */ if (doj_mask) @@ -1021,12 +964,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doj_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif + &curlvySum, &curlvzSum, v_doj_mask); } /* loop over the parts in ci. */ diff --git a/src/scheduler.c b/src/scheduler.c index ed3c163949875a5621674fa2fbb6fa3157e65bc1..b3505bb874d08a90504dfec2b104f19dcf9dff9a 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -970,9 +970,7 @@ void scheduler_reset(struct scheduler *s, int size) { if (size > s->size) { /* Free existing task lists if necessary. */ - if (s->tasks != NULL) free(s->tasks); - if (s->tasks_ind != NULL) free(s->tasks_ind); - if (s->tid_active != NULL) free(s->tid_active); + scheduler_free_tasks(s); /* Allocate the new lists. */ if (posix_memalign((void *)&s->tasks, task_align, @@ -1648,11 +1646,29 @@ void scheduler_print_tasks(const struct scheduler *s, const char *fileName) { */ void scheduler_clean(struct scheduler *s) { - free(s->tasks); - free(s->tasks_ind); + scheduler_free_tasks(s); free(s->unlocks); free(s->unlock_ind); - free(s->tid_active); for (int i = 0; i < s->nr_queues; ++i) queue_clean(&s->queues[i]); free(s->queues); } + +/** + * @brief Free the task arrays allocated by this #scheduler. + */ +void scheduler_free_tasks(struct scheduler *s) { + + if (s->tasks != NULL) { + free(s->tasks); + s->tasks = NULL; + } + if (s->tasks_ind != NULL) { + free(s->tasks_ind); + s->tasks_ind = NULL; + } + if (s->tid_active != NULL) { + free(s->tid_active); + s->tid_active = NULL; + } + s->size = 0; +} diff --git a/src/scheduler.h b/src/scheduler.h index f38e5fb4d849842217756b7b93713a5e1375c9c5..ac654580b2af2ffb506dc3fd9f0b988b89effbd0 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -52,7 +52,7 @@ /* Flags . */ #define scheduler_flag_none 0 -#define scheduler_flag_steal 1 +#define scheduler_flag_steal (1 << 1) /* Data of a scheduler. */ struct scheduler { @@ -143,5 +143,6 @@ void scheduler_set_unlocks(struct scheduler *s); void scheduler_dump_queue(struct scheduler *s); void scheduler_print_tasks(const struct scheduler *s, const char *fileName); void scheduler_clean(struct scheduler *s); +void scheduler_free_tasks(struct scheduler *s); #endif /* SWIFT_SCHEDULER_H */ diff --git a/src/space.c b/src/space.c index fc7732f45350b37600198b057583baf5b761c28c..52a34248cd9bf38e03e476e4937fa601b0ee9222 100644 --- a/src/space.c +++ b/src/space.c @@ -253,6 +253,15 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements, } } +/** + * @brief Free up any allocated cells. + */ +void space_free_cells(struct space *s) { + threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper, s->cells_top, + s->nr_cells, sizeof(struct cell), 0, s); + s->maxdepth = 0; +} + /** * @brief Re-build the top-level cell grid. * @@ -379,13 +388,15 @@ void space_regrid(struct space *s, int verbose) { /* Free the old cells, if they were allocated. */ if (s->cells_top != NULL) { - threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper, - s->cells_top, s->nr_cells, sizeof(struct cell), 0, s); + space_free_cells(s); free(s->cells_top); free(s->multipoles_top); - s->maxdepth = 0; } + /* Also free the task arrays, these will be regenerated and we can use the + * memory while copying the particle arrays. */ + if (s->e != NULL) scheduler_free_tasks(&s->e->sched); + /* Set the new cell dimensions only if smaller. */ for (int k = 0; k < 3; k++) { s->cdim[k] = cdim[k]; @@ -492,9 +503,7 @@ void space_regrid(struct space *s, int verbose) { else { /* Otherwise, just clean up the cells. */ /* Free the old cells, if they were allocated. */ - threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper, - s->cells_top, s->nr_cells, sizeof(struct cell), 0, s); - s->maxdepth = 0; + space_free_cells(s); } if (verbose) diff --git a/src/space.h b/src/space.h index 7fe8f27ecaf671738e03881d547d6cb29a61503a..dbbba714c2b3c9841905b2ba54e4f2d854b820a6 100644 --- a/src/space.h +++ b/src/space.h @@ -226,5 +226,6 @@ void space_check_timesteps(struct space *s); void space_replicate(struct space *s, int replicate, int verbose); void space_reset_task_counters(struct space *s); void space_clean(struct space *s); +void space_free_cells(struct space *s); #endif /* SWIFT_SPACE_H */ diff --git a/src/tools.c b/src/tools.c index 68b23e1a815b510abafbd8f7c6285d3633ba0a63..7d69ebc6c476312081d8a8c34c76c6592da5cab0 100644 --- a/src/tools.c +++ b/src/tools.c @@ -32,6 +32,7 @@ #include "tools.h" /* Local includes. */ +#include "active.h" #include "cell.h" #include "error.h" #include "gravity.h" @@ -183,6 +184,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { float r2, hi, hj, hig2, hjg2, dx[3]; struct part *pi, *pj; const double dim[3] = {r->e->s->dim[0], r->e->s->dim[1], r->e->s->dim[2]}; + const struct engine *e = r->e; /* Implements a double-for loop and checks every interaction */ for (int i = 0; i < ci->count; ++i) { @@ -191,6 +193,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { hi = pi->h; hig2 = hi * hi * kernel_gamma2; + /* Skip inactive particles. */ + if (!part_is_active(pi, e)) continue; + for (int j = 0; j < cj->count; ++j) { pj = &cj->parts[j]; @@ -219,6 +224,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { hj = pj->h; hjg2 = hj * hj * kernel_gamma2; + /* Skip inactive particles. */ + if (!part_is_active(pj, e)) continue; + for (int i = 0; i < ci->count; ++i) { pi = &ci->parts[i]; @@ -311,6 +319,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) { void self_all_density(struct runner *r, struct cell *ci) { float r2, hi, hj, hig2, hjg2, dxi[3]; //, dxj[3]; struct part *pi, *pj; + const struct engine *e = r->e; /* Implements a double-for loop and checks every interaction */ for (int i = 0; i < ci->count; ++i) { @@ -335,14 +344,14 @@ void self_all_density(struct runner *r, struct cell *ci) { } /* Hit or miss? */ - if (r2 < hig2) { + if (r2 < hig2 && part_is_active(pi, e)) { /* Interact */ runner_iact_nonsym_density(r2, dxi, hi, hj, pi, pj); } /* Hit or miss? */ - if (r2 < hjg2) { + if (r2 < hjg2 && part_is_active(pj, e)) { dxi[0] = -dxi[0]; dxi[1] = -dxi[1]; diff --git a/src/vector.h b/src/vector.h index 7d82b9f5c9e2bc6e1fea9426ab3870eeec180408..6a7c6837989025785c1f9134004f2ebcc226a205 100644 --- a/src/vector.h +++ b/src/vector.h @@ -23,8 +23,12 @@ /* Have I already read this file? */ #ifndef VEC_MACRO +/* Config parameters. */ #include "../config.h" +/* Local headers */ +#include "inline.h" + #ifdef WITH_VECTORIZATION /* Need to check whether compiler supports this (IBM does not) @@ -60,9 +64,13 @@ #define vec_dbl_set(a, b, c, d, e, f, g, h) \ _mm512_set_pd(h, g, f, e, d, c, b, a) #define vec_add(a, b) _mm512_add_ps(a, b) +#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a) #define vec_sub(a, b) _mm512_sub_ps(a, b) +#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b) #define vec_mul(a, b) _mm512_mul_ps(a, b) +#define vec_div(a, b) _mm512_div_ps(a, b) #define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c) +#define vec_fnma(a, b, c) _mm512_fnmadd_ps(a, b, c) #define vec_sqrt(a) _mm512_sqrt_ps(a) #define vec_rcp(a) _mm512_rcp14_ps(a) #define vec_rsqrt(a) _mm512_rsqrt14_ps(a) @@ -75,7 +83,16 @@ #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ) #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ) #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ) +#define vec_cmp_result(a) ({ a; }) +#define vec_form_int_mask(a) ({ a; }) #define vec_and(a, b) _mm512_and_ps(a, b) +#define vec_mask_and(a, b) _mm512_kand(a, b) +#define vec_and_mask(a, mask) _mm512_maskz_mov_ps(mask, a) +#define vec_init_mask_true(mask) ({ mask = 0xFFFF; }) +#define vec_zero_mask(mask) ({ mask = 0; }) +#define vec_create_mask(mask, cond) ({ mask = cond; }) +#define vec_pad_mask(mask, pad) ({ mask = mask >> (pad); }) +#define vec_blend(mask, a, b) _mm512_mask_blend_ps(mask, a, b) #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1) @@ -116,10 +133,13 @@ for (int i = 0; i < VEC_SIZE; i++) b += a.f[i]; \ } #endif -/* Calculates the number of set bits in the mask and adds the result to an int. - */ -#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \ - pack += __builtin_popcount(mask); + +/* Do nothing in the case of AVX-512 as there are already + * instructions for left-packing.*/ +#define VEC_FORM_PACKED_MASK(mask, packed_mask) packed_mask = mask + +/* Finds the horizontal maximum of vector b and returns a float. */ +#define VEC_HMAX(a, b) b = _mm512_reduce_max_ps(a.v) /* Performs a left-pack on a vector based upon a mask and returns the result. */ #define VEC_LEFT_PACK(a, mask, result) \ @@ -142,8 +162,11 @@ #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a) #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a) #define vec_add(a, b) _mm256_add_ps(a, b) +#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b, mask.v)) #define vec_sub(a, b) _mm256_sub_ps(a, b) +#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask.v)) #define vec_mul(a, b) _mm256_mul_ps(a, b) +#define vec_div(a, b) _mm256_div_ps(a, b) #define vec_sqrt(a) _mm256_sqrt_ps(a) #define vec_rcp(a) _mm256_rcp_ps(a) #define vec_rsqrt(a) _mm256_rsqrt_ps(a) @@ -157,7 +180,16 @@ #define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ) #define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) #define vec_cmp_result(a) _mm256_movemask_ps(a) +#define vec_form_int_mask(a) _mm256_movemask_ps(a.v) #define vec_and(a, b) _mm256_and_ps(a, b) +#define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v) +#define vec_and_mask(a, mask) _mm256_and_ps(a, mask.v) +#define vec_init_mask_true(mask) mask.m = vec_setint1(0xFFFFFFFF) +#define vec_create_mask(mask, cond) mask.v = cond +#define vec_zero_mask(mask) mask.v = vec_setzero() +#define vec_pad_mask(mask, pad) \ + for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0 +#define vec_blend(mask, a, b) _mm256_blendv_ps(a, b, mask.v) #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1) @@ -183,6 +215,13 @@ a.v = _mm256_hadd_ps(a.v, a.v); \ b += a.f[0] + a.f[4]; +/* Performs a horizontal maximum on the vector and takes the maximum of the + * result with a float, b. */ +#define VEC_HMAX(a, b) \ + { \ + for (int k = 0; k < VEC_SIZE; k++) b = max(b, a.f[k]); \ + } + /* Returns the lower 128-bits of the 256-bit vector. */ #define VEC_GET_LOW(a) _mm256_castps256_ps128(a) @@ -192,6 +231,7 @@ /* Check if we have AVX2 intrinsics alongside AVX */ #ifdef HAVE_AVX2 #define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c) +#define vec_fnma(a, b, c) _mm256_fnmadd_ps(a, b, c) /* Used in VEC_FORM_PACKED_MASK */ #define identity_indices 0x0706050403020100 @@ -201,19 +241,18 @@ /* Takes an integer mask and forms a left-packed integer vector * containing indices of the set bits in the integer mask. * Also returns the total number of bits set in the mask. */ -#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \ +#define VEC_FORM_PACKED_MASK(mask, packed_mask) \ { \ unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101); \ expanded_mask *= 0xFF; \ unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask); \ __m128i bytevec = _mm_cvtsi64_si128(wanted_indices); \ - v_mask = _mm256_cvtepu8_epi32(bytevec); \ - pack += __builtin_popcount(mask); \ + packed_mask.m = _mm256_cvtepu8_epi32(bytevec); \ } /* Performs a left-pack on a vector based upon a mask and returns the result. */ #define VEC_LEFT_PACK(a, mask, result) \ - vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask), result) + vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask.m), result) #endif /* HAVE_AVX2 */ /* Create an FMA using vec_add and vec_mul if AVX2 is not present. */ @@ -221,6 +260,11 @@ #define vec_fma(a, b, c) vec_add(vec_mul(a, b), c) #endif +/* Create a negated FMA using vec_sub and vec_mul if AVX2 is not present. */ +#ifndef vec_fnma +#define vec_fnma(a, b, c) vec_sub(c, vec_mul(a, b)) +#endif + /* Form a packed mask without intrinsics if AVX2 is not present. */ #ifndef VEC_FORM_PACKED_MASK @@ -284,6 +328,7 @@ #define vec_add(a, b) _mm_add_ps(a, b) #define vec_sub(a, b) _mm_sub_ps(a, b) #define vec_mul(a, b) _mm_mul_ps(a, b) +#define vec_div(a, b) _mm_div_ps(a, b) #define vec_sqrt(a) _mm_sqrt_ps(a) #define vec_rcp(a) _mm_rcp_ps(a) #define vec_rsqrt(a) _mm_rsqrt_ps(a) @@ -336,6 +381,13 @@ typedef union { int i[VEC_SIZE]; } vector; +/* Define the mask type depending on the instruction set used. */ +#ifdef HAVE_AVX512_F +typedef __mmask16 mask_t; +#else +typedef vector mask_t; +#endif + /** * @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a * Newton iteration to obtain the correct level of accuracy. diff --git a/tests/Makefile.am b/tests/Makefile.am index 2eec538ea50440cf5589882af52d3a0c5ca04b92..9cd6e9ab9e09935d39bf416dfbb65b83a874b382 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -21,16 +21,16 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(FFTW_LIBS # List of programs and scripts to run in the test suite TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetry \ - testPair.sh testPairPerturbed.sh test27cells.sh test27cellsPerturbed.sh \ + testActivePair.sh test27cells.sh test27cellsPerturbed.sh \ testParser.sh testSPHStep test125cells.sh test125cellsPerturbed.sh testFFT \ testAdiabaticIndex testRiemannExact testRiemannTRRS testRiemannHLLC \ - testMatrixInversion testThreadpool testDump testLogger \ + testMatrixInversion testThreadpool testDump testLogger testInteractions.sh \ testVoronoi1D testVoronoi2D testVoronoi3D \ testPeriodicBC.sh testPeriodicBCPerturbed.sh # List of test programs to compile check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \ - testSPHStep testPair test27cells test125cells testParser \ + testSPHStep testActivePair test27cells test125cells testParser \ testKernel testFFT testInteractions testMaths \ testSymmetry testThreadpool benchmarkInteractions \ testAdiabaticIndex testRiemannExact testRiemannTRRS \ @@ -55,7 +55,7 @@ testSPHStep_SOURCES = testSPHStep.c testSingle_SOURCES = testSingle.c -testPair_SOURCES = testPair.c +testActivePair_SOURCES = testActivePair.c test27cells_SOURCES = test27cells.c @@ -96,10 +96,10 @@ testDump_SOURCES = testDump.c testLogger_SOURCES = testLogger.c # Files necessary for distribution -EXTRA_DIST = testReading.sh makeInput.py testPair.sh testPairPerturbed.sh \ +EXTRA_DIST = testReading.sh makeInput.py testActivePair.sh \ test27cells.sh test27cellsPerturbed.sh testParser.sh testPeriodicBC.sh \ testPeriodicBCPerturbed.sh test125cells.sh test125cellsPerturbed.sh testParserInput.yaml \ difffloat.py tolerance_125_normal.dat tolerance_125_perturbed.dat \ - tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat \ - tolerance_pair_normal.dat tolerance_pair_perturbed.dat \ + tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat tolerance_27_perturbed_h2.dat \ + tolerance_testInteractions.dat tolerance_pair_active.dat \ fft_params.yml tolerance_periodic_BC_normal.dat tolerance_periodic_BC_perturbed.dat diff --git a/tests/benchmarkInteractions.c b/tests/benchmarkInteractions.c index a11862ebaa345623ca65cd7025b4f4e79769d0ac..5dc7a4ae85e62313a84d0c0be88b65a1c0bb59df 100644 --- a/tests/benchmarkInteractions.c +++ b/tests/benchmarkInteractions.c @@ -31,6 +31,7 @@ #define IACT runner_iact_nonsym_density #define IACT_VEC runner_iact_nonsym_2_vec_density #define IACT_NAME "test_nonsym_density" +#define NUM_VEC_PROC_INT 2 #endif #ifdef SYM_DENSITY @@ -53,8 +54,9 @@ #ifndef IACT #define IACT runner_iact_nonsym_density -#define IACT_VEC runner_iact_nonsym_2_vec_density +#define IACT_VEC runner_iact_nonsym_1_vec_density #define IACT_NAME "test_nonsym_density" +#define NUM_VEC_PROC_INT 1 #endif /** @@ -368,7 +370,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, /* Perform vector interaction. */ #ifdef WITH_VECTORIZATION - vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec, mask, mask2; + vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec; vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum; @@ -387,28 +389,35 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, viz_vec.v = vec_load(&vizq[0]); hi_inv_vec = vec_reciprocal(hi_vec); - mask.m = vec_setint1(0xFFFFFFFF); - mask2.m = vec_setint1(0xFFFFFFFF); -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask, knl_mask2; - knl_mask = 0xFFFF; - knl_mask2 = 0xFFFF; + mask_t mask; + vec_init_mask_true(mask); +#if (NUM_VEC_PROC_INT == 2) + mask_t mask2; + vec_init_mask_true(mask2); #endif - const ticks vec_tic = getticks(); - for (size_t i = 0; i < count; i += 2 * VEC_SIZE) { + for (size_t i = 0; i < count; i += NUM_VEC_PROC_INT * VEC_SIZE) { +/* Interleave two vectors for interaction. */ +#if (NUM_VEC_PROC_INT == 2) IACT_VEC(&(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec), (vix_vec), (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, - mask, mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); + mask, mask2, 0); +#else /* Only use one vector for interaction. */ + vector r2, dx, dy, dz; + r2.v = vec_load(&(r2q[i])); + dx.v = vec_load(&(dxq[i])); + dy.v = vec_load(&(dyq[i])); + dz.v = vec_load(&(dzq[i])); + + IACT_VEC(&r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec), + (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), + &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, + &curlvxSum, &curlvySum, &curlvzSum, mask); #endif } diff --git a/tests/test125cells.c b/tests/test125cells.c index 1bddd5dd2d4933e3b2d6c802d1325f3b88ba6ba5..2b35f7b5d509a7f966d07d6da82616869f29ab66 100644 --- a/tests/test125cells.c +++ b/tests/test125cells.c @@ -440,6 +440,7 @@ void runner_doself2_force(struct runner *r, struct cell *ci); /* And go... */ int main(int argc, char *argv[]) { + engine_pin(); size_t runs = 0, particles = 0; double h = 1.23485, size = 1., rho = 2.5; double perturbation = 0.; diff --git a/tests/test27cells.c b/tests/test27cells.c index c4df8356eb6494cc34b91c6ab5038b3866b14056..7ba1eec9ad279f09f63021e332dac1cfd5cc1505 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -122,7 +122,7 @@ struct cell *make_cell(size_t n, double *offset, double size, double h, break; } if (h_pert) - part->h = size * h * random_uniform(1.f, 1.1f) / (float)n; + part->h = size * h * random_uniform(1.f, h_pert) / (float)n; else part->h = size * h / (float)n; h_max = fmaxf(h_max, part->h); diff --git a/tests/test27cells.sh.in b/tests/test27cells.sh.in index a313a594c10c80598852a92e7920ac846e15ba2d..059a7a208aa8e570ad5035fac16ffd201bf3dddd 100755 --- a/tests/test27cells.sh.in +++ b/tests/test27cells.sh.in @@ -56,4 +56,32 @@ do done +# Test for particles with random smoothing lengths +for v in {0..3} +do + echo "" + + rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat + + echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3" + ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3 + + if [ -e brute_force_27_standard.dat ] + then + if python @srcdir@/difffloat.py brute_force_27_standard.dat swift_dopair_27_standard.dat @srcdir@/tolerance_27_perturbed_h2.dat 6 + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi + + echo "------------" + +done + exit $? diff --git a/tests/test27cellsPerturbed.sh.in b/tests/test27cellsPerturbed.sh.in index 4adbe0b9ca6d8e1cfe7df861c2bf88bf35832983..f875504e541588377ca6e40fe55681ebec3466f6 100755 --- a/tests/test27cellsPerturbed.sh.in +++ b/tests/test27cellsPerturbed.sh.in @@ -56,4 +56,31 @@ do done +# Test for particles with random smoothing lengths +for v in {0..3} +do + echo "" + + rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat + + echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3" + ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3 + + if [ -e brute_force_27_perturbed.dat ] + then + if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed_h2.dat 6 + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi + + echo "------------" + +done exit $? diff --git a/tests/testActivePair.c b/tests/testActivePair.c new file mode 100644 index 0000000000000000000000000000000000000000..1e0111b4f0e480d0f66463b4c2264cdd89bd28c8 --- /dev/null +++ b/tests/testActivePair.c @@ -0,0 +1,510 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ +#include "../config.h" + +/* Some standard headers. */ +#include <fenv.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +/* Local headers. */ +#include "swift.h" + +/** + * @brief Constructs a cell and all of its particle in a valid state prior to + * a DOPAIR or DOSELF calcuation. + * + * @param n The cube root of the number of particles. + * @param offset The position of the cell offset from (0,0,0). + * @param size The cell size. + * @param h The smoothing length of the particles in units of the inter-particle + * separation. + * @param density The density of the fluid. + * @param partId The running counter of IDs. + * @param pert The perturbation to apply to the particles in the cell in units + * of the inter-particle separation. + * @param h_pert The perturbation to apply to the smoothing length. + * @param fraction_active The fraction of particles that should be active in the + * cell. + */ +struct cell *make_cell(size_t n, double *offset, double size, double h, + double density, long long *partId, double pert, + double h_pert, double fraction_active) { + const size_t count = n * n * n; + const double volume = size * size * size; + float h_max = 0.f; + struct cell *cell = malloc(sizeof(struct cell)); + bzero(cell, sizeof(struct cell)); + + if (posix_memalign((void **)&cell->parts, part_align, + count * sizeof(struct part)) != 0) { + error("couldn't allocate particles, no. of particles: %d", (int)count); + } + bzero(cell->parts, count * sizeof(struct part)); + + /* Construct the parts */ + struct part *part = cell->parts; + for (size_t x = 0; x < n; ++x) { + for (size_t y = 0; y < n; ++y) { + for (size_t z = 0; z < n; ++z) { + part->x[0] = + offset[0] + + size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + part->x[1] = + offset[1] + + size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + part->x[2] = + offset[2] + + size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + part->v[0] = random_uniform(-0.05, 0.05); + part->v[1] = random_uniform(-0.05, 0.05); + part->v[2] = random_uniform(-0.05, 0.05); + + if (h_pert) + part->h = size * h * random_uniform(1.f, h_pert) / (float)n; + else + part->h = size * h / (float)n; + h_max = fmaxf(h_max, part->h); + part->id = ++(*partId); + +#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) + part->conserved.mass = density * volume / count; + +#ifdef SHADOWFAX_SPH + double anchor[3] = {0., 0., 0.}; + double side[3] = {1., 1., 1.}; + voronoi_cell_init(&part->cell, part->x, anchor, side); +#endif + +#else + part->mass = density * volume / count; +#endif + +#if defined(HOPKINS_PE_SPH) + part->entropy = 1.f; + part->entropy_one_over_gamma = 1.f; +#endif + if (random_uniform(0, 1.f) < fraction_active) + part->time_bin = 1; + else + part->time_bin = num_time_bins + 1; + +#ifdef SWIFT_DEBUG_CHECKS + part->ti_drift = 8; + part->ti_kick = 8; +#endif + + ++part; + } + } + } + + /* Cell properties */ + cell->split = 0; + cell->h_max = h_max; + cell->count = count; + cell->dx_max_part = 0.; + cell->dx_max_sort = 0.; + cell->width[0] = size; + cell->width[1] = size; + cell->width[2] = size; + cell->loc[0] = offset[0]; + cell->loc[1] = offset[1]; + cell->loc[2] = offset[2]; + + cell->ti_old_part = 8; + cell->ti_end_min = 8; + cell->ti_end_max = 8; + + shuffle_particles(cell->parts, cell->count); + + cell->sorted = 0; + for (int k = 0; k < 13; k++) cell->sort[k] = NULL; + + return cell; +} + +void clean_up(struct cell *ci) { + free(ci->parts); + for (int k = 0; k < 13; k++) + if (ci->sort[k] != NULL) free(ci->sort[k]); + free(ci); +} + +/** + * @brief Initializes all particles field to be ready for a density calculation + */ +void zero_particle_fields(struct cell *c) { + for (int pid = 0; pid < c->count; pid++) { + hydro_init_part(&c->parts[pid], NULL); + } +} + +/** + * @brief Ends the loop by adding the appropriate coefficients + */ +void end_calculation(struct cell *c) { + for (int pid = 0; pid < c->count; pid++) { + hydro_end_density(&c->parts[pid]); + + /* Recover the common "Neighbour number" definition */ + c->parts[pid].density.wcount *= pow_dimension(c->parts[pid].h); + c->parts[pid].density.wcount *= kernel_norm; + } +} + +/** + * @brief Dump all the particles to a file + */ +void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) { + FILE *file = fopen(fileName, "a"); + + /* Write header */ + fprintf(file, "# %4s %13s\n", "ID", "wcount"); + + fprintf(file, "# ci --------------------------------------------\n"); + + for (int pid = 0; pid < ci->count; pid++) { + fprintf(file, "%6llu %13e\n", ci->parts[pid].id, + ci->parts[pid].density.wcount); + } + + fprintf(file, "# cj --------------------------------------------\n"); + + for (int pjd = 0; pjd < cj->count; pjd++) { + fprintf(file, "%6llu %13e\n", cj->parts[pjd].id, + cj->parts[pjd].density.wcount); + } + + fclose(file); +} + +/* Just a forward declaration... */ +void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); +void runner_doself1_density_vec(struct runner *r, struct cell *ci); +void runner_dopair1_branch_density(struct runner *r, struct cell *ci, + struct cell *cj); + +/** + * @brief Computes the pair interactions of two cells using SWIFT and a brute + * force implementation. + */ +void test_pair_interactions(struct runner *runner, struct cell **ci, + struct cell **cj, char *swiftOutputFileName, + char *bruteForceOutputFileName) { + + runner_do_sort(runner, *ci, 0x1FFF, 0, 0); + runner_do_sort(runner, *cj, 0x1FFF, 0, 0); + + /* Zero the fields */ + zero_particle_fields(*ci); + zero_particle_fields(*cj); + + /* Run the test */ + runner_dopair1_branch_density(runner, *ci, *cj); + + /* Let's get physical ! */ + end_calculation(*ci); + end_calculation(*cj); + + /* Dump if necessary */ + dump_particle_fields(swiftOutputFileName, *ci, *cj); + + /* Now perform a brute-force version for accuracy tests */ + + /* Zero the fields */ + zero_particle_fields(*ci); + zero_particle_fields(*cj); + + /* Run the brute-force test */ + pairs_all_density(runner, *ci, *cj); + + /* Let's get physical ! */ + end_calculation(*ci); + end_calculation(*cj); + + dump_particle_fields(bruteForceOutputFileName, *ci, *cj); +} + +/** + * @brief Computes the pair interactions of two cells in various configurations. + */ +void test_all_pair_interactions(struct runner *runner, double *offset2, + size_t particles, double size, double h, + double rho, long long *partId, + double perturbation, double h_pert, + char *swiftOutputFileName, + char *bruteForceOutputFileName) { + + double offset1[3] = {0, 0, 0}; + struct cell *ci, *cj; + + /* All active particles. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 1.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 1.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* Half particles are active. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.5); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.5); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* All particles inactive. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* 10% of particles active. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.1); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.1); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One active cell one inactive cell. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 1.0); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One active cell one inactive cell. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 1.0); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* Smaller cells, all active. */ + ci = make_cell(2, offset1, size, h, rho, partId, perturbation, h_pert, 1.0); + cj = make_cell(2, offset2, size, h, rho, partId, perturbation, h_pert, 1.0); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* Different numbers of particles in each cell. */ + ci = make_cell(10, offset1, size, h, rho, partId, perturbation, h_pert, 0.5); + cj = make_cell(3, offset2, size, h, rho, partId, perturbation, h_pert, 0.75); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One cell inactive and the other only half active. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.5); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One cell inactive and the other only half active. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.5); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + /* Clean things to make the sanitizer happy ... */ + clean_up(ci); + clean_up(cj); +} + +int main(int argc, char *argv[]) { + size_t particles = 0, runs = 0, type = 0; + double h = 1.23485, size = 1., rho = 1.; + double perturbation = 0.1, h_pert = 1.1; + struct space space; + struct engine engine; + struct runner *runner; + char c; + static long long partId = 0; + char outputFileNameExtension[200] = ""; + char swiftOutputFileName[200] = ""; + char bruteForceOutputFileName[200] = ""; + + /* Initialize CPU frequency, this also starts time. */ + unsigned long long cpufreq = 0; + clocks_set_cpufreq(cpufreq); + + /* Choke on FP-exceptions */ + feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); + + /* Generate a RNG seed from time. */ + unsigned int seed = time(NULL); + + while ((c = getopt(argc, argv, "h:p:n:r:t:d:s:f:")) != -1) { + switch (c) { + case 'h': + sscanf(optarg, "%lf", &h); + break; + case 'p': + sscanf(optarg, "%lf", &h_pert); + break; + case 'n': + sscanf(optarg, "%zu", &particles); + break; + case 'r': + sscanf(optarg, "%zu", &runs); + break; + case 't': + sscanf(optarg, "%zu", &type); + break; + case 'd': + sscanf(optarg, "%lf", &perturbation); + break; + case 's': + sscanf(optarg, "%u", &seed); + break; + case 'f': + strcpy(outputFileNameExtension, optarg); + break; + case '?': + error("Unknown option."); + break; + } + } + + if (h < 0 || particles == 0 || runs == 0 || type > 2) { + printf( + "\nUsage: %s -n PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n" + "\nGenerates a cell pair, filled with particles on a Cartesian grid." + "\nThese are then interacted using runner_dopair1_density." + "\n\nOptions:" + "\n-t TYPE=0 - cells share face (0), edge (1) or corner (2)" + "\n-h DISTANCE=1.2348 - smoothing length" + "\n-p - Random fractional change in h, h=h*random(1,p)" + "\n-d pert - perturbation to apply to the particles [0,1[" + "\n-s seed - seed for RNG" + "\n-f fileName - part of the file name used to save the dumps\n", + argv[0]); + exit(1); + } + + /* Seed RNG. */ + message("Seed used for RNG: %d", seed); + srand(seed); + + space.periodic = 0; + space.dim[0] = 3.; + space.dim[1] = 3.; + space.dim[2] = 3.; + + engine.s = &space; + engine.time = 0.1f; + engine.ti_current = 8; + engine.max_active_bin = num_time_bins; + + if (posix_memalign((void **)&runner, SWIFT_STRUCT_ALIGNMENT, + sizeof(struct runner)) != 0) { + error("couldn't allocate runner"); + } + + runner->e = &engine; + + /* Create output file names. */ + sprintf(swiftOutputFileName, "swift_dopair_%s.dat", outputFileNameExtension); + sprintf(bruteForceOutputFileName, "brute_force_%s.dat", + outputFileNameExtension); + + /* Delete files if they already exist. */ + remove(swiftOutputFileName); + remove(bruteForceOutputFileName); + +#ifdef WITH_VECTORIZATION + runner->ci_cache.count = 0; + cache_init(&runner->ci_cache, 512); + runner->cj_cache.count = 0; + cache_init(&runner->cj_cache, 512); +#endif + + double offset[3] = {1., 0., 0.}; + + /* Test a pair of cells face-on. */ + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName); + + /* Test a pair of cells edge-on. */ + offset[0] = 1.; + offset[1] = 1.; + offset[2] = 0.; + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName); + + /* Test a pair of cells corner-on. */ + offset[0] = 1.; + offset[1] = 1.; + offset[2] = 1.; + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName); + return 0; +} diff --git a/tests/testActivePair.sh.in b/tests/testActivePair.sh.in new file mode 100755 index 0000000000000000000000000000000000000000..ff8d027a469bd9bc78286b843cf2dffd3ef27ad3 --- /dev/null +++ b/tests/testActivePair.sh.in @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "" + +rm -f brute_force_pair_active.dat swift_dopair_active.dat + +./testActivePair -n 6 -r 1 -d 0 -f active + +python @srcdir@/difffloat.py brute_force_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat + +exit $? diff --git a/tests/testInteractions.c b/tests/testInteractions.c index 4ce7fe40554d24551750629fa47c0bee7acdb6da..54d1f38733a1f1647331166f1a37b40ed3511419 100644 --- a/tests/testInteractions.c +++ b/tests/testInteractions.c @@ -17,12 +17,6 @@ * ******************************************************************************/ -#include "../config.h" - -#ifndef WITH_VECTORIZATION -int main() { return 0; } -#else - #include <fenv.h> #include <stdio.h> #include <stdlib.h> @@ -30,15 +24,17 @@ int main() { return 0; } #include <unistd.h> #include "swift.h" +#ifdef WITH_VECTORIZATION + #define array_align sizeof(float) * VEC_SIZE #define ACC_THRESHOLD 1e-5 -/* Typdef function pointers for serial and vectorised versions of the - * interaction functions. */ -typedef void (*serial_interaction)(float, float *, float, float, struct part *, - struct part *); -typedef void (*vec_interaction)(float *, float *, float *, float *, - struct part **, struct part **); +#ifndef IACT +#define IACT runner_iact_nonsym_density +#define IACT_VEC runner_iact_nonsym_1_vec_density +#define IACT_NAME "test_nonsym_density" +#define NUM_VEC_PROC_INT 1 +#endif /** * @brief Constructs an array of particles in a valid state prior to @@ -74,7 +70,10 @@ struct part *make_particles(size_t count, double *offset, double spacing, p->h = h; p->id = ++(*partId); + +#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) p->mass = 1.0f; +#endif /* Place rest of particles around the test particle * with random position within a unit sphere. */ @@ -93,7 +92,9 @@ struct part *make_particles(size_t count, double *offset, double spacing, p->h = h; p->id = ++(*partId); +#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) p->mass = 1.0f; +#endif } return particles; } @@ -103,6 +104,7 @@ struct part *make_particles(size_t count, double *offset, double spacing, */ void prepare_force(struct part *parts, size_t count) { +#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) && !defined(MINIMAL_SPH) struct part *p; for (size_t i = 0; i < count; ++i) { p = &parts[i]; @@ -113,6 +115,7 @@ void prepare_force(struct part *parts, size_t count) { p->force.v_sig = 0.0f; p->force.h_dt = 0.0f; } +#endif } /** @@ -122,25 +125,26 @@ void dump_indv_particle_fields(char *fileName, struct part *p) { FILE *file = fopen(fileName, "a"); + /* Write header */ fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %10e %10e %10e %13e %13e %13e " - "%13e %13e %13e %13e " - "%13e %13e %13e %10f\n", + "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e " + "%13e %13e %13e\n", p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], - p->a_hydro[0], p->a_hydro[1], p->a_hydro[2], p->rho, - p->density.rho_dh, p->density.wcount, p->density.wcount_dh, - p->force.h_dt, p->force.v_sig, -#if defined(GADGET2_SPH) - p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], - p->density.rot_v[2], p->entropy_dt -#elif defined(DEFAULT_SPH) - p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], - p->density.rot_v[2], 0. + hydro_get_density(p), +#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) + 0.f, #else + p->density.rho_dh, +#endif + p->density.wcount, p->density.wcount_dh, +#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH) p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], p->density.rot_v[2] +#else + 0., 0., 0., 0. #endif ); + fclose(file); } @@ -152,13 +156,10 @@ void write_header(char *fileName) { FILE *file = fopen(fileName, "w"); /* Write header */ fprintf(file, - "# %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s %13s %13s %13s " - "%13s %13s %13s %13s" - "%13s %13s %13s %13s\n", - "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "a_x", "a_y", - "a_z", "rho", "rho_dh", "wcount", "wcount_dh", "dh/dt", "v_sig", - "div_v", "curl_vx", "curl_vy", "curl_vz", "dS/dt"); - fprintf(file, "\n# PARTICLES BEFORE INTERACTION:\n"); + "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s " + "%13s %13s %13s\n", + "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh", + "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz"); fclose(file); } @@ -187,8 +188,8 @@ int check_results(struct part serial_test_part, struct part *serial_parts, } /* - * @brief Calls the serial and vectorised version of an interaction - * function given by the function pointers. + * @brief Calls the serial and vectorised version of the non-symmetrical density + * interaction. * * @param test_part Particle that will be updated * @param parts Particle array to be interacted @@ -196,16 +197,15 @@ int check_results(struct part serial_test_part, struct part *serial_parts, * @param serial_inter_func Serial interaction function to be called * @param vec_inter_func Vectorised interaction function to be called * @param runs No. of times to call interactions + * @param num_vec_proc No. of vectors to use to process interaction * */ void test_interactions(struct part test_part, struct part *parts, size_t count, - serial_interaction serial_inter_func, - vec_interaction vec_inter_func, char *filePrefix, - size_t runs) { + char *filePrefix, int runs, int num_vec_proc) { - ticks serial_time = 0, vec_time = 0; + ticks serial_time = 0; + ticks vec_time = 0; - FILE *file; char serial_filename[200] = ""; char vec_filename[200] = ""; @@ -217,64 +217,68 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, write_header(serial_filename); write_header(vec_filename); - /* Test particle at the center of a unit sphere. */ struct part pi_serial, pi_vec; - - /* Remaining particles in the sphere that will interact with test particle. */ struct part pj_serial[count], pj_vec[count]; - /* Stores the separation, smoothing length and pointers to particles - * needed for the vectorised interaction. */ + float r2[count] __attribute__((aligned(array_align))); + float dx[3 * count] __attribute__((aligned(array_align))); + + struct part *piq[count], *pjq[count]; + for (size_t k = 0; k < count; k++) { + piq[k] = NULL; + pjq[k] = NULL; + } + float r2q[count] __attribute__((aligned(array_align))); float hiq[count] __attribute__((aligned(array_align))); - float hjq[count] __attribute__((aligned(array_align))); - float dxq[3 * count] __attribute__((aligned(array_align))); - struct part *piq[count], *pjq[count]; + float dxq[count] __attribute__((aligned(array_align))); + + float dyq[count] __attribute__((aligned(array_align))); + float dzq[count] __attribute__((aligned(array_align))); + float mjq[count] __attribute__((aligned(array_align))); + float vixq[count] __attribute__((aligned(array_align))); + float viyq[count] __attribute__((aligned(array_align))); + float vizq[count] __attribute__((aligned(array_align))); + float vjxq[count] __attribute__((aligned(array_align))); + float vjyq[count] __attribute__((aligned(array_align))); + float vjzq[count] __attribute__((aligned(array_align))); /* Call serial interaction a set number of times. */ - for (size_t k = 0; k < runs; k++) { + for (int k = 0; k < runs; k++) { /* Reset particle to initial setup */ pi_serial = test_part; for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i]; - /* Only dump data on first run. */ - if (k == 0) { - /* Dump state of particles before serial interaction. */ - dump_indv_particle_fields(serial_filename, &pi_serial); - for (size_t i = 0; i < count; i++) - dump_indv_particle_fields(serial_filename, &pj_serial[i]); - } - /* Perform serial interaction */ for (size_t i = 0; i < count; i++) { /* Compute the pairwise distance. */ - float r2 = 0.0f; - float dx[3]; - for (size_t k = 0; k < 3; k++) { - dx[k] = pi_serial.x[k] - pj_serial[i].x[k]; - r2 += dx[k] * dx[k]; + r2[i] = 0.0f; + for (int k = 0; k < 3; k++) { + int ind = (3 * i) + k; + dx[ind] = pi_serial.x[k] - pj_serial[i].x[k]; + r2[i] += dx[ind] * dx[ind]; } + } - const ticks tic = getticks(); - - serial_inter_func(r2, dx, pi_serial.h, pj_serial[i].h, &pi_serial, - &pj_serial[i]); - - serial_time += getticks() - tic; + const ticks tic = getticks(); +/* Perform serial interaction */ +#ifdef __ICC +#pragma novector +#endif + for (size_t i = 0; i < count; i++) { + IACT(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h, &pi_serial, + &pj_serial[i]); } + serial_time += getticks() - tic; } - file = fopen(serial_filename, "a"); - fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); - fclose(file); - /* Dump result of serial interaction. */ dump_indv_particle_fields(serial_filename, &pi_serial); for (size_t i = 0; i < count; i++) dump_indv_particle_fields(serial_filename, &pj_serial[i]); /* Call vector interaction a set number of times. */ - for (size_t k = 0; k < runs; k++) { + for (int k = 0; k < runs; k++) { /* Reset particle to initial setup */ pi_vec = test_part; for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i]; @@ -284,45 +288,92 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, /* Compute the pairwise distance. */ float r2 = 0.0f; float dx[3]; - for (size_t k = 0; k < 3; k++) { + for (int k = 0; k < 3; k++) { dx[k] = pi_vec.x[k] - pj_vec[i].x[k]; r2 += dx[k] * dx[k]; } r2q[i] = r2; - dxq[3 * i + 0] = dx[0]; - dxq[3 * i + 1] = dx[1]; - dxq[3 * i + 2] = dx[2]; + dxq[i] = dx[0]; hiq[i] = pi_vec.h; - hjq[i] = pj_vec[i].h; piq[i] = &pi_vec; pjq[i] = &pj_vec[i]; - } - /* Only dump data on first run. */ - if (k == 0) { - /* Dump state of particles before vector interaction. */ - dump_indv_particle_fields(vec_filename, piq[0]); - for (size_t i = 0; i < count; i++) - dump_indv_particle_fields(vec_filename, pjq[i]); + dyq[i] = dx[1]; + dzq[i] = dx[2]; + mjq[i] = pj_vec[i].mass; + vixq[i] = pi_vec.v[0]; + viyq[i] = pi_vec.v[1]; + vizq[i] = pi_vec.v[2]; + vjxq[i] = pj_vec[i].v[0]; + vjyq[i] = pj_vec[i].v[1]; + vjzq[i] = pj_vec[i].v[2]; } + /* Perform vector interaction. */ + vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec; + vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, + curlvySum, curlvzSum; + mask_t mask, mask2; + + rhoSum.v = vec_set1(0.f); + rho_dhSum.v = vec_set1(0.f); + wcountSum.v = vec_set1(0.f); + wcount_dhSum.v = vec_set1(0.f); + div_vSum.v = vec_set1(0.f); + curlvxSum.v = vec_set1(0.f); + curlvySum.v = vec_set1(0.f); + curlvzSum.v = vec_set1(0.f); + + hi_vec.v = vec_load(&hiq[0]); + vix_vec.v = vec_load(&vixq[0]); + viy_vec.v = vec_load(&viyq[0]); + viz_vec.v = vec_load(&vizq[0]); + + hi_inv_vec = vec_reciprocal(hi_vec); + vec_init_mask_true(mask); + vec_init_mask_true(mask2); + const ticks vec_tic = getticks(); - /* Perform vector interaction. */ - for (size_t i = 0; i < count; i += VEC_SIZE) { - vec_inter_func(&(r2q[i]), &(dxq[3 * i]), &(hiq[i]), &(hjq[i]), &(piq[i]), - &(pjq[i])); + for (size_t i = 0; i < count; i += num_vec_proc * VEC_SIZE) { + + /* Interleave two vectors for interaction. */ + if (num_vec_proc == 2) { + runner_iact_nonsym_2_vec_density( + &(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec), (vix_vec), + (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), + &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, + &curlvxSum, &curlvySum, &curlvzSum, mask, mask2, 0); + } else { /* Only use one vector for interaction. */ + + vector r2, dx, dy, dz; + r2.v = vec_load(&(r2q[i])); + dx.v = vec_load(&(dxq[i])); + dy.v = vec_load(&(dyq[i])); + dz.v = vec_load(&(dzq[i])); + + runner_iact_nonsym_1_vec_density( + &r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec), (viz_vec), + &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, + &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, + &curlvzSum, mask); + } } + VEC_HADD(rhoSum, piq[0]->rho); + VEC_HADD(rho_dhSum, piq[0]->density.rho_dh); + VEC_HADD(wcountSum, piq[0]->density.wcount); + VEC_HADD(wcount_dhSum, piq[0]->density.wcount_dh); + VEC_HADD(div_vSum, piq[0]->density.div_v); + VEC_HADD(curlvxSum, piq[0]->density.rot_v[0]); + VEC_HADD(curlvySum, piq[0]->density.rot_v[1]); + VEC_HADD(curlvzSum, piq[0]->density.rot_v[2]); + vec_time += getticks() - vec_tic; } - file = fopen(vec_filename, "a"); - fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); - fclose(file); - - /* Dump result of vector interaction. */ + /* Dump result of serial interaction. */ dump_indv_particle_fields(vec_filename, piq[0]); for (size_t i = 0; i < count; i++) dump_indv_particle_fields(vec_filename, pjq[i]); @@ -334,6 +385,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, message("The serial interactions took : %15lli ticks.", serial_time / runs); message("The vectorised interactions took : %15lli ticks.", vec_time / runs); + message("Speed up: %15fx.", (double)(serial_time) / vec_time); } /* And go... */ @@ -386,62 +438,22 @@ int main(int argc, char *argv[]) { /* Build the infrastructure */ static long long partId = 0; - struct part density_test_particle, force_test_particle; - struct part *density_particles = - make_particles(count, offset, spacing, h, &partId); - struct part *force_particles = - make_particles(count, offset, spacing, h, &partId); - prepare_force(force_particles, count); - - /* Define which interactions to call */ - serial_interaction serial_inter_func = &runner_iact_nonsym_density; - vec_interaction vec_inter_func = &runner_iact_nonsym_vec_density; - - density_test_particle = density_particles[0]; + struct part test_particle; + struct part *particles = make_particles(count, offset, spacing, h, &partId); + + test_particle = particles[0]; /* Call the non-sym density test. */ - message("Testing non-symmetrical density interaction..."); - test_interactions(density_test_particle, &density_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_nonsym_density", - runs); - - density_particles = make_particles(count, offset, spacing, h, &partId); - - /* Re-assign function pointers. */ - serial_inter_func = &runner_iact_density; - vec_inter_func = &runner_iact_vec_density; - - density_test_particle = density_particles[0]; - /* Call the symmetrical density test. */ - message("Testing symmetrical density interaction..."); - test_interactions(density_test_particle, &density_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_sym_density", - runs); - - /* Re-assign function pointers. */ - serial_inter_func = &runner_iact_nonsym_force; - vec_inter_func = &runner_iact_nonsym_vec_force; - - force_test_particle = force_particles[0]; - /* Call the test non-sym force test. */ - message("Testing non-symmetrical force interaction..."); - test_interactions(force_test_particle, &force_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_nonsym_force", - runs); - - force_particles = make_particles(count, offset, spacing, h, &partId); - prepare_force(force_particles, count); - - /* Re-assign function pointers. */ - serial_inter_func = &runner_iact_force; - vec_inter_func = &runner_iact_vec_force; - - force_test_particle = force_particles[0]; - /* Call the test symmetrical force test. */ - message("Testing symmetrical force interaction..."); - test_interactions(force_test_particle, &force_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_sym_force", runs); + message("Testing %s interaction...", IACT_NAME); + test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs, + 1); + test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs, + 2); return 0; } -#endif /* WITH_VECTORIZATION */ +#else + +int main() { return 1; } + +#endif diff --git a/tests/testInteractions.sh.in b/tests/testInteractions.sh.in new file mode 100644 index 0000000000000000000000000000000000000000..4b002c56e37eff417c673ddac2e44b3edf17683a --- /dev/null +++ b/tests/testInteractions.sh.in @@ -0,0 +1,29 @@ +#!/bin/bash + +echo "" + +rm -f test_nonsym_density_serial.dat test_nonsym_density_vec.dat + +echo "Running ./testInteractions" + +./testInteractions + +if [ $? != 0 ]; then + echo "testInteractions is redundant when vectorisation is disabled" +else + if [ -e test_nonsym_density_serial.dat ] + then + if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_vec.dat @srcdir@/tolerance_testInteractions.dat + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi +fi + +echo "------------" diff --git a/tests/testKernel.c b/tests/testKernel.c index a2744119a527cc842cdd4711056eee7a7d7b4270..0658639070526f28ce1bceefc54d3f2d7a3ae765 100644 --- a/tests/testKernel.c +++ b/tests/testKernel.c @@ -68,7 +68,7 @@ int main() { vx.f[j] = (i + j) * 2.25f / numPoints; } - vx_h.v = vx.v / vec_set1(h); + vx_h.v = vec_mul(vx.v, vec_set1(1.f / h)); kernel_deval_1_vec(&vx_h, &W_vec, &dW_vec); @@ -106,8 +106,8 @@ int main() { vx_2.f[j] = (i + j) * 2.25f / numPoints; } - vx_h.v = vx.v / vec_set1(h); - vx_h_2.v = vx_2.v / vec_set1(h); + vx_h.v = vec_mul(vx.v, vec_set1(1.f / h)); + vx_h_2.v = vec_mul(vx_2.v, vec_set1(1.f / h)); kernel_deval_2_vec(&vx_h, &W_vec, &dW_vec, &vx_h_2, &W_vec_2, &dW_vec_2); diff --git a/tests/testPair.c b/tests/testPair.c deleted file mode 100644 index e0e813f31531fb1c8d2e24e417a43ac4d07ecf7e..0000000000000000000000000000000000000000 --- a/tests/testPair.c +++ /dev/null @@ -1,349 +0,0 @@ -/******************************************************************************* - * This file is part of SWIFT. - * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - ******************************************************************************/ -#include "../config.h" - -/* Some standard headers. */ -#include <fenv.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> - -/* Local headers. */ -#include "swift.h" - -#if defined(WITH_VECTORIZATION) -#define DOSELF1 runner_doself1_density_vec -#define DOPAIR1 runner_dopair1_branch_density -#define DOSELF1_NAME "runner_doself1_density_vec" -#define DOPAIR1_NAME "runner_dopair1_density_vec" -#endif - -#ifndef DOSELF1 -#define DOSELF1 runner_doself1_density -#define DOSELF1_NAME "runner_doself1_density" -#endif - -#ifndef DOPAIR1 -#define DOPAIR1 runner_dopair1_branch_density -#define DOPAIR1_NAME "runner_dopair1_density" -#endif - -/* n is both particles per axis and box size: - * particles are generated on a mesh with unit spacing - */ -struct cell *make_cell(size_t n, double *offset, double size, double h, - double density, unsigned long long *partId, - double pert) { - const size_t count = n * n * n; - const double volume = size * size * size; - struct cell *cell = malloc(sizeof(struct cell)); - bzero(cell, sizeof(struct cell)); - - if (posix_memalign((void **)&cell->parts, part_align, - count * sizeof(struct part)) != 0) { - error("couldn't allocate particles, no. of particles: %d", (int)count); - } - bzero(cell->parts, count * sizeof(struct part)); - - /* Construct the parts */ - struct part *part = cell->parts; - for (size_t x = 0; x < n; ++x) { - for (size_t y = 0; y < n; ++y) { - for (size_t z = 0; z < n; ++z) { - part->x[0] = - offset[0] + - size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; - part->x[1] = - offset[1] + - size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; - part->x[2] = - offset[2] + - size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; - // part->v[0] = part->x[0] - 1.5; - // part->v[1] = part->x[1] - 1.5; - // part->v[2] = part->x[2] - 1.5; - part->v[0] = random_uniform(-0.05, 0.05); - part->v[1] = random_uniform(-0.05, 0.05); - part->v[2] = random_uniform(-0.05, 0.05); - part->h = size * h / (float)n; - part->id = ++(*partId); -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - part->conserved.mass = density * volume / count; -#else - part->mass = density * volume / count; -#endif - part->time_bin = 1; - -#ifdef SWIFT_DEBUG_CHECKS - part->ti_drift = 8; - part->ti_kick = 8; -#endif - - ++part; - } - } - } - - /* Cell properties */ - cell->split = 0; - cell->h_max = h; - cell->count = count; - cell->dx_max_part = 0.; - cell->dx_max_sort = 0.; - cell->width[0] = n; - cell->width[1] = n; - cell->width[2] = n; - cell->loc[0] = offset[0]; - cell->loc[1] = offset[1]; - cell->loc[2] = offset[2]; - - cell->ti_old_part = 8; - cell->ti_end_min = 8; - cell->ti_end_max = 8; - - shuffle_particles(cell->parts, cell->count); - - cell->sorted = 0; - for (int k = 0; k < 13; k++) cell->sort[k] = NULL; - - return cell; -} - -void clean_up(struct cell *ci) { - free(ci->parts); - for (int k = 0; k < 13; k++) - if (ci->sort[k] != NULL) free(ci->sort[k]); - free(ci); -} - -/** - * @brief Initializes all particles field to be ready for a density calculation - */ -void zero_particle_fields(struct cell *c) { - for (int pid = 0; pid < c->count; pid++) { - hydro_init_part(&c->parts[pid], NULL); - } -} - -/** - * @brief Dump all the particles to a file - */ -void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) { - FILE *file = fopen(fileName, "w"); - - /* Write header */ - fprintf(file, - "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s " - "%13s %13s %13s\n", - "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh", - "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz"); - - fprintf(file, "# ci --------------------------------------------\n"); - - for (int pid = 0; pid < ci->count; pid++) { - fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e " - "%13e %13e %13e\n", - ci->parts[pid].id, ci->parts[pid].x[0], ci->parts[pid].x[1], - ci->parts[pid].x[2], ci->parts[pid].v[0], ci->parts[pid].v[1], - ci->parts[pid].v[2], hydro_get_density(&ci->parts[pid]), -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - 0.f, -#else - ci->parts[pid].density.rho_dh, -#endif - ci->parts[pid].density.wcount, ci->parts[pid].density.wcount_dh, -#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH) - ci->parts[pid].density.div_v, ci->parts[pid].density.rot_v[0], - ci->parts[pid].density.rot_v[1], ci->parts[pid].density.rot_v[2] -#else - 0., 0., 0., 0. -#endif - ); - } - - fprintf(file, "# cj --------------------------------------------\n"); - - for (int pjd = 0; pjd < cj->count; pjd++) { - fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e " - "%13e %13e %13e\n", - cj->parts[pjd].id, cj->parts[pjd].x[0], cj->parts[pjd].x[1], - cj->parts[pjd].x[2], cj->parts[pjd].v[0], cj->parts[pjd].v[1], - cj->parts[pjd].v[2], hydro_get_density(&cj->parts[pjd]), -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - 0.f, -#else - cj->parts[pjd].density.rho_dh, -#endif - cj->parts[pjd].density.wcount, cj->parts[pjd].density.wcount_dh, -#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH) - cj->parts[pjd].density.div_v, cj->parts[pjd].density.rot_v[0], - cj->parts[pjd].density.rot_v[1], cj->parts[pjd].density.rot_v[2] -#else - 0., 0., 0., 0. -#endif - ); - } - - fclose(file); -} - -/* Just a forward declaration... */ -void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); -void runner_doself1_density_vec(struct runner *r, struct cell *ci); -void runner_dopair1_branch_density(struct runner *r, struct cell *ci, - struct cell *cj); - -int main(int argc, char *argv[]) { - size_t particles = 0, runs = 0, volume, type = 0; - double offset[3] = {0, 0, 0}, h = 1.1255, size = 1., rho = 1.; - double perturbation = 0.; - struct cell *ci, *cj; - struct space space; - struct engine engine; - struct runner runner; - char c; - static unsigned long long partId = 0; - char outputFileNameExtension[200] = ""; - char outputFileName[200] = ""; - ticks tic, toc, time; - - /* Initialize CPU frequency, this also starts time. */ - unsigned long long cpufreq = 0; - clocks_set_cpufreq(cpufreq); - - /* Choke on FP-exceptions */ - feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); - - srand(0); - - while ((c = getopt(argc, argv, "h:p:r:t:d:f:")) != -1) { - switch (c) { - case 'h': - sscanf(optarg, "%lf", &h); - break; - case 'p': - sscanf(optarg, "%zu", &particles); - break; - case 'r': - sscanf(optarg, "%zu", &runs); - break; - case 't': - sscanf(optarg, "%zu", &type); - break; - case 'd': - sscanf(optarg, "%lf", &perturbation); - break; - case 'f': - strcpy(outputFileNameExtension, optarg); - break; - case '?': - error("Unknown option."); - break; - } - } - - if (h < 0 || particles == 0 || runs == 0 || type > 2) { - printf( - "\nUsage: %s -p PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n" - "\nGenerates a cell pair, filled with particles on a Cartesian grid." - "\nThese are then interacted using runner_dopair1_density." - "\n\nOptions:" - "\n-t TYPE=0 - cells share face (0), edge (1) or corner (2)" - "\n-h DISTANCE=1.1255 - smoothing length" - "\n-d pert - perturbation to apply to the particles [0,1[" - "\n-f fileName - part of the file name used to save the dumps\n", - argv[0]); - exit(1); - } - - space.periodic = 0; - - engine.s = &space; - engine.time = 0.1f; - engine.ti_current = 8; - engine.max_active_bin = num_time_bins; - runner.e = &engine; - - volume = particles * particles * particles; - message("particles: %zu B\npositions: 0 B", 2 * volume * sizeof(struct part)); - - ci = make_cell(particles, offset, size, h, rho, &partId, perturbation); - for (size_t i = 0; i < type + 1; ++i) offset[i] = 1.; - cj = make_cell(particles, offset, size, h, rho, &partId, perturbation); - - runner_do_sort(&runner, ci, 0x1FFF, 0, 0); - runner_do_sort(&runner, cj, 0x1FFF, 0, 0); - - time = 0; - for (size_t i = 0; i < runs; ++i) { - /* Zero the fields */ - zero_particle_fields(ci); - zero_particle_fields(cj); - - tic = getticks(); - -#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION) - /* Run the test */ - DOPAIR1(&runner, ci, cj); -#endif - - toc = getticks(); - time += toc - tic; - - /* Dump if necessary */ - if (i % 50 == 0) { - sprintf(outputFileName, "swift_dopair_%s.dat", outputFileNameExtension); - dump_particle_fields(outputFileName, ci, cj); - } - } - - /* Output timing */ - message("SWIFT calculation took %lli ticks.", time / runs); - - /* Now perform a brute-force version for accuracy tests */ - - /* Zero the fields */ - zero_particle_fields(ci); - zero_particle_fields(cj); - - tic = getticks(); - -#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION) - /* Run the brute-force test */ - pairs_all_density(&runner, ci, cj); -#endif - - toc = getticks(); - - /* Dump */ - sprintf(outputFileName, "brute_force_%s.dat", outputFileNameExtension); - dump_particle_fields(outputFileName, ci, cj); - - /* Output timing */ - message("Brute force calculation took %lli ticks.", toc - tic); - - /* Clean things to make the sanitizer happy ... */ - clean_up(ci); - clean_up(cj); - - return 0; -} diff --git a/tests/testPair.sh.in b/tests/testPair.sh.in deleted file mode 100755 index bd7051b060c4acab6cf5a164af1914715856849b..0000000000000000000000000000000000000000 --- a/tests/testPair.sh.in +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo "" - -rm -f brute_force_standard.dat swift_dopair_standard.dat - -./testPair -p 6 -r 1 -d 0 -f standard - -python @srcdir@/difffloat.py brute_force_standard.dat swift_dopair_standard.dat @srcdir@/tolerance_pair_normal.dat - -exit $? diff --git a/tests/testPairPerturbed.sh.in b/tests/testPairPerturbed.sh.in deleted file mode 100755 index 9f214e25a098448a906f9da307ea569e327cfdea..0000000000000000000000000000000000000000 --- a/tests/testPairPerturbed.sh.in +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo "" - -rm -f brute_force_perturbed.dat swift_dopair_perturbed.dat - -./testPair -p 6 -r 1 -d 0.1 -f perturbed - -python @srcdir@/difffloat.py brute_force_perturbed.dat swift_dopair_perturbed.dat @srcdir@/tolerance_pair_perturbed.dat - -exit $? diff --git a/tests/testPeriodicBC.c b/tests/testPeriodicBC.c index 495442efac853517891c1efdec54f1f5e8e14e75..6fa2dc607b996b9e8508338a9806633c5a4d1a89 100644 --- a/tests/testPeriodicBC.c +++ b/tests/testPeriodicBC.c @@ -517,7 +517,8 @@ int main(int argc, char *argv[]) { const int half_dim = (dim - 1) / 2; - /* Test the periodic boundary conditions for each of the 8 corners. */ + /* Test the periodic boundary conditions for each of the 8 corners. Interact + * each corner with all of its 26 neighbours.*/ test_boundary_conditions(cells, runner, 0, 0, 0, dim, swiftOutputFileName, bruteForceOutputFileName); test_boundary_conditions(cells, runner, dim - 1, 0, 0, dim, diff --git a/tests/tolerance_27_perturbed_h.dat b/tests/tolerance_27_perturbed_h.dat index 495c865783ae4fc6cab76f11e945efe9ac21870f..5142c2a2090e15381a19b2bc71e253a482973b11 100644 --- a/tests/tolerance_27_perturbed_h.dat +++ b/tests/tolerance_27_perturbed_h.dat @@ -1,4 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-6 1e-4 5e-4 1.2e-2 1e-5 3e-6 3e-6 7e-6 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.4e-2 1e-5 2e-3 2.5e-4 3e-3 3e-3 3e-3 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2.4e-6 1e-4 5e-4 1.2e-2 1e-5 3e-6 3e-6 8e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-6 1.4e-2 1e-5 2e-3 2.5e-4 3e-3 3e-3 3e-3 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6 diff --git a/tests/tolerance_27_perturbed_h2.dat b/tests/tolerance_27_perturbed_h2.dat new file mode 100644 index 0000000000000000000000000000000000000000..23f6a5006124f6233aebd111005760a5dcc5b6a3 --- /dev/null +++ b/tests/tolerance_27_perturbed_h2.dat @@ -0,0 +1,4 @@ +# ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 3e-6 1e-4 5e-4 1.5e-2 1.4e-5 3e-6 3e-6 9e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.5e-6 1.57e-2 1e-5 4.74e-3 3.89e-4 3e-3 3e-3 3e-3 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6 diff --git a/tests/tolerance_pair_active.dat b/tests/tolerance_pair_active.dat new file mode 100644 index 0000000000000000000000000000000000000000..b07697a686eb7801326ceaf77cf93fb3a1491c2e --- /dev/null +++ b/tests/tolerance_pair_active.dat @@ -0,0 +1,4 @@ +# ID wcount + 0 1e-2 + 0 1e-2 + 0 1e-2 diff --git a/tests/tolerance_pair_perturbed.dat b/tests/tolerance_pair_perturbed.dat deleted file mode 100644 index 59a80a7b1fa0db373b7cb43b9793146330b17d53..0000000000000000000000000000000000000000 --- a/tests/tolerance_pair_perturbed.dat +++ /dev/null @@ -1,4 +0,0 @@ -# ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 1e-5 2e-5 3e-2 1e-5 1e-5 1e-5 1e-5 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-3 4e-4 8e-3 2e-2 1e-4 1.6e-4 1.6e-4 1.6e-4 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 diff --git a/tests/tolerance_pair_normal.dat b/tests/tolerance_testInteractions.dat similarity index 75% rename from tests/tolerance_pair_normal.dat rename to tests/tolerance_testInteractions.dat index 3fca591dce8156a08c970e4a6579b56b2de12553..ebb376bf26bfdc0fb2107ab720bbf9eca5a35bce 100644 --- a/tests/tolerance_pair_normal.dat +++ b/tests/tolerance_testInteractions.dat @@ -1,4 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 1e-5 2e-5 3e-2 1e-5 1e-5 1e-5 1e-5 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 1.2e-5 1e-5 1e-2 1e-4 1e-4 1e-4 1e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 4e-5 4e-4 1e-2 1e-5 6e-6 6e-6 6e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-4 1e-4 2e-4 2e-4 1e-4 1e-4 1e-4 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6