diff --git a/.gitignore b/.gitignore index 0c4d8bddadb17afcc29aa3dc1d4dba6cb6fd2f5b..e3e17bb82a01d9af0ace6ed72d196cf2dba242f6 100644 --- a/.gitignore +++ b/.gitignore @@ -34,7 +34,7 @@ examples/*/*/*.txt examples/*/*/used_parameters.yml examples/*/gravity_checks_*.dat -tests/testPair +tests/testActivePair tests/brute_force_periodic_BC_standard.dat tests/swift_periodic_BC_standard.dat tests/brute_force_periodic_BC_pertrubed.dat @@ -54,6 +54,11 @@ tests/brute_force_125_standard.dat tests/swift_dopair_125_standard.dat tests/brute_force_125_perturbed.dat tests/swift_dopair_125_perturbed.dat +tests/brute_force_active.dat +tests/brute_force_periodic_BC_perturbed.dat +tests/swift_dopair_active.dat +tests/test_nonsym_density_serial.dat +tests/test_nonsym_density_vec.dat tests/testGreetings tests/testReading tests/input.hdf5 @@ -75,8 +80,6 @@ tests/test27cells.sh tests/test27cellsPerturbed.sh tests/test125cells.sh tests/test125cellsPerturbed.sh -tests/testPair.sh -tests/testPairPerturbed.sh tests/testParser.sh tests/testReading.sh tests/testAdiabaticIndex diff --git a/configure.ac b/configure.ac index 95519863f6e303cde51bbef7e2046faa314f46ee..74fede99f4fbf578af4e703cedaa42f2c278b037 100644 --- a/configure.ac +++ b/configure.ac @@ -861,14 +861,14 @@ AM_CONDITIONAL([HAVE_DOXYGEN], [test "$ac_cv_path_ac_pt_DX_DOXYGEN" != ""]) # Handle .in files. AC_CONFIG_FILES([Makefile src/Makefile examples/Makefile doc/Makefile doc/Doxyfile tests/Makefile]) AC_CONFIG_FILES([tests/testReading.sh], [chmod +x tests/testReading.sh]) -AC_CONFIG_FILES([tests/testPair.sh], [chmod +x tests/testPair.sh]) -AC_CONFIG_FILES([tests/testPairPerturbed.sh], [chmod +x tests/testPairPerturbed.sh]) +AC_CONFIG_FILES([tests/testActivePair.sh], [chmod +x tests/testActivePair.sh]) AC_CONFIG_FILES([tests/test27cells.sh], [chmod +x tests/test27cells.sh]) AC_CONFIG_FILES([tests/test27cellsPerturbed.sh], [chmod +x tests/test27cellsPerturbed.sh]) AC_CONFIG_FILES([tests/test125cells.sh], [chmod +x tests/test125cells.sh]) AC_CONFIG_FILES([tests/test125cellsPerturbed.sh], [chmod +x tests/test125cellsPerturbed.sh]) AC_CONFIG_FILES([tests/testPeriodicBC.sh], [chmod +x tests/testPeriodicBC.sh]) AC_CONFIG_FILES([tests/testPeriodicBCPerturbed.sh], [chmod +x tests/testPeriodicBCPerturbed.sh]) +AC_CONFIG_FILES([tests/testInteractions.sh], [chmod +x tests/testInteractions.sh]) AC_CONFIG_FILES([tests/testParser.sh], [chmod +x tests/testParser.sh]) # Save the compilation options diff --git a/src/align.h b/src/align.h index 2e541043cc3866d76979277f3c143a45657684ca..54435c4c9baa1ce9dc511e2903b7e2be2d6655de 100644 --- a/src/align.h +++ b/src/align.h @@ -58,6 +58,22 @@ #define swift_align_information(array, alignment) ; #endif +/** + * @brief Macro to create a restrict pointer to an array and tell the compiler + * that the given array has the specified + * alignment. + * + * Note that this turns into a no-op but gives information to the compiler. + * + * @param array The array. + * @param ptr Pointer to array + * @param type Type of array + * @param alignment The alignment in bytes of the array. + */ +#define swift_declare_aligned_ptr(type, array, ptr, alignment) \ + type *restrict array = ptr; \ + swift_align_information(array, alignment); + /** * @brief Macro to tell the compiler that a given number is 0 modulo a given * size. diff --git a/src/cache.h b/src/cache.h index 1b6d89f03fafa1595cb9bdf857c9f2bc1cba316e..170eb8f5cb0939e85647bbb4622d66cc78ef3dbe 100644 --- a/src/cache.h +++ b/src/cache.h @@ -23,6 +23,7 @@ #include "../config.h" /* Local headers */ +#include "align.h" #include "cell.h" #include "error.h" #include "part.h" @@ -30,9 +31,7 @@ #include "vector.h" #define NUM_VEC_PROC 2 -#define CACHE_ALIGN 64 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE) -#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE #ifdef WITH_VECTORIZATION /* Cache struct to hold a local copy of a cells' particle @@ -40,31 +39,31 @@ struct cache { /* Particle x position. */ - float *restrict x __attribute__((aligned(CACHE_ALIGN))); + float *restrict x SWIFT_CACHE_ALIGN; /* Particle y position. */ - float *restrict y __attribute__((aligned(CACHE_ALIGN))); + float *restrict y SWIFT_CACHE_ALIGN; /* Particle z position. */ - float *restrict z __attribute__((aligned(CACHE_ALIGN))); + float *restrict z SWIFT_CACHE_ALIGN; /* Particle smoothing length. */ - float *restrict h __attribute__((aligned(CACHE_ALIGN))); + float *restrict h SWIFT_CACHE_ALIGN; /* Particle mass. */ - float *restrict m __attribute__((aligned(CACHE_ALIGN))); + float *restrict m SWIFT_CACHE_ALIGN; /* Particle x velocity. */ - float *restrict vx __attribute__((aligned(CACHE_ALIGN))); + float *restrict vx SWIFT_CACHE_ALIGN; /* Particle y velocity. */ - float *restrict vy __attribute__((aligned(CACHE_ALIGN))); + float *restrict vy SWIFT_CACHE_ALIGN; /* Particle z velocity. */ - float *restrict vz __attribute__((aligned(CACHE_ALIGN))); + float *restrict vz SWIFT_CACHE_ALIGN; - /* Maximum distance of particles into neighbouring cell. */ - float *restrict max_d __attribute__((aligned(CACHE_ALIGN))); + /* Maximum index into neighbouring cell for particles that are in range. */ + int *restrict max_index SWIFT_CACHE_ALIGN; /* Cache size. */ int count; @@ -75,28 +74,28 @@ struct cache { struct c2_cache { /* Separation between two particles squared. */ - float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* x separation between two particles. */ - float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* y separation between two particles. */ - float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* z separation between two particles. */ - float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* Mass of particle pj. */ - float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* x velocity of particle pj. */ - float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* y velocity of particle pj. */ - float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* z velocity of particle pj. */ - float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; }; /** @@ -111,9 +110,10 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, /* Align cache on correct byte boundary and pad cache size to be a multiple of * the vector size * and include 2 vector lengths for remainder operations. */ - unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE; + size_t pad = 2 * VEC_SIZE, rem = count % VEC_SIZE; if (rem > 0) pad += VEC_SIZE - rem; - unsigned int sizeBytes = (count + pad) * sizeof(float); + size_t sizeBytes = (count + pad) * sizeof(float); + size_t sizeIntBytes = (count + pad) * sizeof(int); int error = 0; /* Free memory if cache has already been allocated. */ @@ -126,18 +126,19 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, free(c->vy); free(c->vz); free(c->h); - free(c->max_d); + free(c->max_index); } - error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes); + error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT, + sizeIntBytes); if (error != 0) error("Couldn't allocate cache, no. of particles: %d", (int)count); @@ -151,156 +152,43 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, * @param ci_cache The cache. */ __attribute__((always_inline)) INLINE void cache_read_particles( - const struct cell *const ci, struct cache *const ci_cache) { + const struct cell *restrict const ci, + struct cache *restrict const ci_cache) { #if defined(GADGET2_SPH) -/* Shift the particles positions to a local frame so single precision can be - * used instead of double precision. */ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma vector aligned -#endif - for (int i = 0; i < ci->count; i++) { - ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0]; - ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1]; - ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2]; - ci_cache->h[i] = ci->parts[i].h; - - ci_cache->m[i] = ci->parts[i].mass; - ci_cache->vx[i] = ci->parts[i].v[0]; - ci_cache->vy[i] = ci->parts[i].v[1]; - ci_cache->vz[i] = ci->parts[i].v[2]; - } - -#endif -} - -/** - * @brief Populate cache by reading in the particles from two cells in unsorted - * order. - * - * @param ci The i #cell. - * @param cj The j #cell. - * @param ci_cache The cache for cell ci. - * @param cj_cache The cache for cell cj. - * @param shift The amount to shift the particle positions to account for BCs - */ -__attribute__((always_inline)) INLINE void cache_read_two_cells( - const struct cell *const ci, const struct cell *const cj, - struct cache *const ci_cache, struct cache *const cj_cache, - const double *const shift) { - - /* Shift the particles positions to a local frame (ci frame) so single - * precision can be - * used instead of double precision. Also shift the cell ci, particles - * positions due to BCs but leave cell cj. */ - for (int i = 0; i < ci->count; i++) { - ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0] - shift[0]; - ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1] - shift[1]; - ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2] - shift[2]; - ci_cache->h[i] = ci->parts[i].h; - - ci_cache->m[i] = ci->parts[i].mass; - ci_cache->vx[i] = ci->parts[i].v[0]; - ci_cache->vy[i] = ci->parts[i].v[1]; - ci_cache->vz[i] = ci->parts[i].v[2]; - } - - for (int i = 0; i < cj->count; i++) { - cj_cache->x[i] = cj->parts[i].x[0] - ci->loc[0]; - cj_cache->y[i] = cj->parts[i].x[1] - ci->loc[1]; - cj_cache->z[i] = cj->parts[i].x[2] - ci->loc[2]; - cj_cache->h[i] = cj->parts[i].h; - - cj_cache->m[i] = cj->parts[i].mass; - cj_cache->vx[i] = cj->parts[i].v[0]; - cj_cache->vy[i] = cj->parts[i].v[1]; - cj_cache->vz[i] = cj->parts[i].v[2]; - } -} - -__attribute__((always_inline)) INLINE void cache_read_cell_sorted( - const struct cell *const ci, struct cache *const ci_cache, - const struct entry *restrict sort_i, double *const loc, - double *const shift) { - - int idx; -/* Shift the particles positions to a local frame (ci frame) so single precision - * can be - * used instead of double precision. Also shift the cell ci, particles positions - * due to BCs but leave cell cj. */ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma simd -#endif - for (int i = 0; i < ci->count; i++) { - idx = sort_i[i].i; - - ci_cache->x[i] = ci->parts[idx].x[0] - loc[0] - shift[0]; - ci_cache->y[i] = ci->parts[idx].x[1] - loc[1] - shift[1]; - ci_cache->z[i] = ci->parts[idx].x[2] - loc[2] - shift[2]; - ci_cache->h[i] = ci->parts[idx].h; - - ci_cache->m[i] = ci->parts[idx].mass; - ci_cache->vx[i] = ci->parts[idx].v[0]; - ci_cache->vy[i] = ci->parts[idx].v[1]; - ci_cache->vz[i] = ci->parts[idx].v[2]; - } -} - -/** - * @brief Populate cache by reading in the particles from two cells in sorted - * order. - * - * @param ci The i #cell. - * @param cj The j #cell. - * @param ci_cache The #cache for cell ci. - * @param cj_cache The #cache for cell cj. - * @param sort_i The array of sorted particle indices for cell ci. - * @param sort_j The array of sorted particle indices for cell ci. - * @param shift The amount to shift the particle positions to account for BCs - */ -__attribute__((always_inline)) INLINE void cache_read_two_cells_sorted( - const struct cell *const ci, const struct cell *const cj, - struct cache *const ci_cache, struct cache *const cj_cache, - const struct entry *restrict sort_i, const struct entry *restrict sort_j, - const double *const shift) { - - int idx; -/* Shift the particles positions to a local frame (ci frame) so single precision - * can be - * used instead of double precision. Also shift the cell ci, particles positions - * due to BCs but leave cell cj. */ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma simd -#endif + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. */ + swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); + + const struct part *restrict parts = ci->parts; + double loc[3]; + loc[0] = ci->loc[0]; + loc[1] = ci->loc[1]; + loc[2] = ci->loc[2]; + + /* Shift the particles positions to a local frame so single precision can be + * used instead of double precision. */ for (int i = 0; i < ci->count; i++) { - idx = sort_i[i].i; - ci_cache->x[i] = ci->parts[idx].x[0] - ci->loc[0] - shift[0]; - ci_cache->y[i] = ci->parts[idx].x[1] - ci->loc[1] - shift[1]; - ci_cache->z[i] = ci->parts[idx].x[2] - ci->loc[2] - shift[2]; - ci_cache->h[i] = ci->parts[idx].h; - - ci_cache->m[i] = ci->parts[idx].mass; - ci_cache->vx[i] = ci->parts[idx].v[0]; - ci_cache->vy[i] = ci->parts[idx].v[1]; - ci_cache->vz[i] = ci->parts[idx].v[2]; + x[i] = (float)(parts[i].x[0] - loc[0]); + y[i] = (float)(parts[i].x[1] - loc[1]); + z[i] = (float)(parts[i].x[2] - loc[2]); + h[i] = parts[i].h; + + m[i] = parts[i].mass; + vx[i] = parts[i].v[0]; + vy[i] = parts[i].v[1]; + vz[i] = parts[i].v[2]; } -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma simd #endif - for (int i = 0; i < cj->count; i++) { - idx = sort_j[i].i; - cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0]; - cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1]; - cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2]; - cj_cache->h[i] = cj->parts[idx].h; - - cj_cache->m[i] = cj->parts[idx].mass; - cj_cache->vx[i] = cj->parts[idx].v[0]; - cj_cache->vy[i] = cj->parts[idx].v[1]; - cj_cache->vz[i] = cj->parts[idx].v[2]; - } } /** @@ -321,13 +209,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted( * interaction. */ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( - const struct cell *const ci, const struct cell *const cj, - struct cache *const ci_cache, struct cache *const cj_cache, - const struct entry *restrict sort_i, const struct entry *restrict sort_j, - const double *const shift, int *first_pi, int *last_pj, - const int num_vec_proc) { + const struct cell *restrict const ci, const struct cell *restrict const cj, + struct cache *restrict const ci_cache, + struct cache *restrict const cj_cache, const struct entry *restrict sort_i, + const struct entry *restrict sort_j, const double *restrict const shift, + int *first_pi, int *last_pj, const int num_vec_proc) { - int idx, ci_cache_idx; + int idx; /* Pad number of particles read to the vector size. */ int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE); if (rem != 0) { @@ -345,74 +233,97 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( int first_pi_align = *first_pi; int last_pj_align = *last_pj; - -/* Shift the particles positions to a local frame (ci frame) so single precision - * can be - * used instead of double precision. Also shift the cell ci, particles positions - * due to BCs but leave cell cj. */ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma vector aligned -#endif - for (int i = first_pi_align; i < ci->count; i++) { - /* Make sure ci_cache is filled from the first element. */ - ci_cache_idx = i - first_pi_align; - idx = sort_i[i].i; - ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0]; - ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1]; - ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2]; - ci_cache->h[ci_cache_idx] = ci->parts[idx].h; - - ci_cache->m[ci_cache_idx] = ci->parts[idx].mass; - ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0]; - ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1]; - ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2]; + const struct part *restrict parts_i = ci->parts; + const struct part *restrict parts_j = cj->parts; + double loc[3]; + loc[0] = ci->loc[0]; + loc[1] = ci->loc[1]; + loc[2] = ci->loc[2]; + + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. */ + swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); + + int ci_cache_count = ci->count - first_pi_align; + /* Shift the particles positions to a local frame (ci frame) so single + * precision + * can be + * used instead of double precision. Also shift the cell ci, particles + * positions + * due to BCs but leave cell cj. */ + for (int i = 0; i < ci_cache_count; i++) { + idx = sort_i[i + first_pi_align].i; + x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]); + y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]); + z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]); + h[i] = parts_i[idx].h; + + m[i] = parts_i[idx].mass; + vx[i] = parts_i[idx].v[0]; + vy[i] = parts_i[idx].v[1]; + vz[i] = parts_i[idx].v[2]; } /* Pad cache with fake particles that exist outside the cell so will not * interact.*/ - float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0]; + float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0]; for (int i = ci->count - first_pi_align; i < ci->count - first_pi_align + VEC_SIZE; i++) { - ci_cache->x[i] = fake_pix; - ci_cache->y[i] = 1.f; - ci_cache->z[i] = 1.f; - ci_cache->h[i] = 1.f; - - ci_cache->m[i] = 1.f; - ci_cache->vx[i] = 1.f; - ci_cache->vy[i] = 1.f; - ci_cache->vz[i] = 1.f; + x[i] = fake_pix; + y[i] = 1.f; + z[i] = 1.f; + h[i] = 1.f; + + m[i] = 1.f; + vx[i] = 1.f; + vy[i] = 1.f; + vz[i] = 1.f; } -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma vector aligned -#endif + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. */ + swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT); + for (int i = 0; i <= last_pj_align; i++) { idx = sort_j[i].i; - cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0]; - cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1]; - cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2]; - cj_cache->h[i] = cj->parts[idx].h; - - cj_cache->m[i] = cj->parts[idx].mass; - cj_cache->vx[i] = cj->parts[idx].v[0]; - cj_cache->vy[i] = cj->parts[idx].v[1]; - cj_cache->vz[i] = cj->parts[idx].v[2]; + xj[i] = (float)(parts_j[idx].x[0] - loc[0]); + yj[i] = (float)(parts_j[idx].x[1] - loc[1]); + zj[i] = (float)(parts_j[idx].x[2] - loc[2]); + hj[i] = parts_j[idx].h; + + mj[i] = parts_j[idx].mass; + vxj[i] = parts_j[idx].v[0]; + vyj[i] = parts_j[idx].v[1]; + vzj[i] = parts_j[idx].v[2]; } /* Pad cache with fake particles that exist outside the cell so will not * interact.*/ float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0]; for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) { - cj_cache->x[i] = fake_pjx; - cj_cache->y[i] = 1.f; - cj_cache->z[i] = 1.f; - cj_cache->h[i] = 1.f; - - cj_cache->m[i] = 1.f; - cj_cache->vx[i] = 1.f; - cj_cache->vy[i] = 1.f; - cj_cache->vz[i] = 1.f; + xj[i] = fake_pjx; + yj[i] = 1.f; + zj[i] = 1.f; + hj[i] = 1.f; + + mj[i] = 1.f; + vxj[i] = 1.f; + vyj[i] = 1.f; + vzj[i] = 1.f; } } @@ -430,7 +341,7 @@ static INLINE void cache_clean(struct cache *c) { free(c->vy); free(c->vz); free(c->h); - free(c->max_d); + free(c->max_index); } } diff --git a/src/dimension.h b/src/dimension.h index 4ee9377b4f9608b2549f3dfee2a683b61a71c76a..0b2093d718a61c6ce850db1970412af3e2e462b9 100644 --- a/src/dimension.h +++ b/src/dimension.h @@ -275,11 +275,11 @@ __attribute__((always_inline)) INLINE static vector pow_dimension_vec( #if defined(HYDRO_DIMENSION_3D) - return (vector)(x.v * x.v * x.v); + return (vector)(vec_mul(vec_mul(x.v, x.v), x.v)); #elif defined(HYDRO_DIMENSION_2D) - return (vector)(x.v * x.v); + return (vector)(vec_mul(x.v, x.v)); #elif defined(HYDRO_DIMENSION_1D) @@ -304,16 +304,16 @@ __attribute__((always_inline)) INLINE static vector pow_dimension_plus_one_vec( #if defined(HYDRO_DIMENSION_3D) - const vector x2 = (vector)(x.v * x.v); - return (vector)(x2.v * x2.v); + const vector x2 = (vector)(vec_mul(x.v, x.v)); + return (vector)(vec_mul(x2.v, x2.v)); #elif defined(HYDRO_DIMENSION_2D) - return (vector)(x.v * x.v * x.v); + return (vector)(vec_mul(x.v, vec_mul(x.v, x.v))); #elif defined(HYDRO_DIMENSION_1D) - return (vector)(x.v * x.v); + return (vector)(vec_mul(x.v, x.v)); #else diff --git a/src/engine.c b/src/engine.c index 99c4baee1e6856fa08789b7b81103970ded3d57e..d20dab45643d0d61f9246d24aaa670a35a14e1fd 100644 --- a/src/engine.c +++ b/src/engine.c @@ -227,8 +227,8 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) { /* implicit = */ 1, c, NULL); engine_add_ghosts(e, c, c->ghost_in, c->ghost_out); -/* Generate the extra ghost task. */ #ifdef EXTRA_HYDRO_LOOP + /* Generate the extra ghost task. */ c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost, task_subtype_none, 0, 0, c, NULL); #endif @@ -4545,8 +4545,8 @@ void engine_init(struct engine *e, struct space *s, /* Init the scheduler with enough tasks for the initial sorting tasks. */ const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads; - scheduler_init(&e->sched, e->s, nr_tasks, nr_queues, scheduler_flag_steal, - e->nodeID, &e->threadpool); + scheduler_init(&e->sched, e->s, nr_tasks, nr_queues, + (policy & scheduler_flag_steal), e->nodeID, &e->threadpool); /* Allocate and init the threads. */ if ((e->runners = (struct runner *)malloc(sizeof(struct runner) * diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 25eb1e4b76d088d7dcd3d4ba57fe5901188be000..9e06fec92f9667d21ce7dc196ba26f9870cb24f4 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -388,7 +388,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector mask, int knlMask) { + mask_t mask) { vector r, ri, ui, wi, wi_dx; vector mj; @@ -432,49 +432,24 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, curlvry.v = vec_mul(curlvry.v, ri.v); curlvrz.v = vec_mul(curlvrz.v, ri.v); -/* Mask updates to intermediate vector sums for particle pi. */ -#ifdef HAVE_AVX512_F - rhoSum->v = - _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v); + vector wcount_dh_update; + wcount_dh_update.v = + vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)); + /* Mask updates to intermediate vector sums for particle pi. */ + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); rho_dhSum->v = - _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, - vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v)))); - - wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v); - - wcount_dhSum->v = _mm512_mask_sub_ps( - rho_dhSum->v, knlMask, rho_dhSum->v, - vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v))); - - div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, - vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); - - curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), - curlvxSum->v); - - curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), - curlvySum->v); - - curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), - curlvzSum->v); -#else - rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v); - rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v))), - mask.v); - wcountSum->v += vec_and(wi.v, mask.v); - wcount_dhSum->v -= vec_and( - vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v); - div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v); - curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v); - curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v); - curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v); -#endif + vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask); + wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask); + div_vSum->v = + vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); + curlvxSum->v = vec_mask_add(curlvxSum->v, + vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); + curlvySum->v = vec_mask_add(curlvySum->v, + vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); + curlvzSum->v = vec_mask_add(curlvzSum->v, + vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); } /** @@ -482,12 +457,14 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, * (non-symmetric vectorized version). */ __attribute__((always_inline)) INLINE static void -runner_iact_nonsym_2_vec_density( - float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix, - vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, - vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, - vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector mask, vector mask2, int knlMask, int knlMask2) { +runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, + vector hi_inv, vector vix, vector viy, + vector viz, float *Vjx, float *Vjy, float *Vjz, + float *Mj, vector *rhoSum, vector *rho_dhSum, + vector *wcountSum, vector *wcount_dhSum, + vector *div_vSum, vector *curlvxSum, + vector *curlvySum, vector *curlvzSum, + mask_t mask, mask_t mask2, short mask_cond) { vector r, ri, r2, ui, wi, wi_dx; vector mj; @@ -567,88 +544,66 @@ runner_iact_nonsym_2_vec_density( curlvrz.v = vec_mul(curlvrz.v, ri.v); curlvrz2.v = vec_mul(curlvrz2.v, ri2.v); -/* Mask updates to intermediate vector sums for particle pi. */ -#ifdef HAVE_AVX512_F - rhoSum->v = - _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v); - rhoSum->v = - _mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v); - - rho_dhSum->v = - _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, - vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v)))); - rho_dhSum->v = - _mm512_mask_sub_ps(rho_dhSum->v, knlMask2, rho_dhSum->v, - vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi2.v, - vec_mul(ui2.v, wi_dx2.v)))); - - wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v); - wcountSum->v = - _mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v); - - wcount_dhSum->v = _mm512_mask_sub_ps( - rho_dhSum->v, knlMask, rho_dhSum->v, - vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v))); - - wcount_dhSum->v = _mm512_mask_sub_ps( - rho_dhSum->v, knlMask2, rho_dhSum->v, - vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v))); - - div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, - vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); - div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v, - vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); - - curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), - curlvxSum->v); - curlvxSum->v = _mm512_mask_add_ps( - curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), - curlvxSum->v); - - curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), - curlvySum->v); - curlvySum->v = _mm512_mask_add_ps( - curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), - curlvySum->v); - - curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), - curlvzSum->v); - curlvzSum->v = _mm512_mask_add_ps( - curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), - curlvzSum->v); -#else - rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v); - rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v); - rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v))), - mask.v); - rho_dhSum->v -= - vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, - vec_mul(ui2.v, wi_dx2.v))), - mask2.v); - wcountSum->v += vec_and(wi.v, mask.v); - wcountSum->v += vec_and(wi2.v, mask2.v); - wcount_dhSum->v -= vec_and( - vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v); - wcount_dhSum->v -= vec_and( - vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)), - mask2.v); - div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v); - div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v); - curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v); - curlvxSum->v += - vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v); - curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v); - curlvySum->v += - vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v); - curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v); - curlvzSum->v += - vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v); -#endif + vector wcount_dh_update, wcount_dh_update2; + wcount_dh_update.v = + vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)); + wcount_dh_update2.v = + vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)); + + /* Mask updates to intermediate vector sums for particle pi. */ + /* Mask only when needed. */ + if (mask_cond) { + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2); + rho_dhSum->v = + vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask); + rho_dhSum->v = + vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v), mask2); + wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); + wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update2.v, mask2); + div_vSum->v = vec_mask_sub(div_vSum->v, + vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); + div_vSum->v = vec_mask_sub( + div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2); + curlvxSum->v = vec_mask_add( + curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); + curlvxSum->v = vec_mask_add( + curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2); + curlvySum->v = vec_mask_add( + curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); + curlvySum->v = vec_mask_add( + curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2); + curlvzSum->v = vec_mask_add( + curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); + curlvzSum->v = vec_mask_add( + curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2); + } else { + rhoSum->v = vec_add(rhoSum->v, vec_mul(mj.v, wi.v)); + rhoSum->v = vec_add(rhoSum->v, vec_mul(mj2.v, wi2.v)); + rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v)); + rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v)); + wcountSum->v = vec_add(wcountSum->v, wi.v); + wcountSum->v = vec_add(wcountSum->v, wi2.v); + wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update.v); + wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update2.v); + div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); + div_vSum->v = + vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); + curlvxSum->v = + vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v))); + curlvxSum->v = + vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v))); + curlvySum->v = + vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v))); + curlvySum->v = + vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v))); + curlvzSum->v = + vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v))); + curlvzSum->v = + vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v))); + } } #endif @@ -755,7 +710,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef WITH_VECTORIZATION +#ifdef WITH_OLD_VECTORIZATION vector r, r2, ri; vector xi, xj; @@ -1037,7 +992,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef WITH_VECTORIZATION +#ifdef WITH_OLD_VECTORIZATION vector r, r2, ri; vector xi, xj; diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index f634a59d7ee769951e6560d46a92053c144cc766..5355c15f8f1a0d0c3d811c7039da04caf0522cc9 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -341,20 +341,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx( /* ------------------------------------------------------------------------- */ -#ifdef WITH_VECTORIZATION - -static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv); - -static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals); - -static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant); - -static const vector kernel_gamma_inv_dim_vec = - FILL_VEC((float)kernel_gamma_inv_dim); - -static const vector kernel_gamma_inv_dim_plus_one_vec = - FILL_VEC((float)kernel_gamma_inv_dim_plus_one); - +#ifdef WITH_OLD_VECTORIZATION /** * @brief Computes the kernel function and its derivative (Vectorised version). * @@ -369,11 +356,12 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( /* Go to the range [0,1[ from [0,H[ */ vector x; - x.v = u->v * kernel_gamma_inv_vec.v; + x.v = vec_mul(u->v, kernel_gamma_inv_vec.v); /* Load x and get the interval id. */ vector ind; - ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v)); + ind.m = + vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v)); /* load the coefficients. */ vector c[kernel_degree + 1]; @@ -382,20 +370,36 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j]; /* Init the iteration for Horner's scheme. */ - w->v = (c[0].v * x.v) + c[1].v; + w->v = vec_fma(c[0].v, x.v, c[1].v); dw_dx->v = c[0].v; /* And we're off! */ for (int k = 2; k <= kernel_degree; k++) { - dw_dx->v = (dw_dx->v * x.v) + w->v; - w->v = (x.v * w->v) + c[k].v; + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + w->v = vec_fma(x.v, w->v, c[k].v); } /* Return everything */ - w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; - dw_dx->v = - dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v; + w->v = + vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); } +#endif + +#ifdef WITH_VECTORIZATION + +static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv); + +static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals); + +static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant); + +static const vector kernel_gamma_inv_dim_vec = + FILL_VEC((float)kernel_gamma_inv_dim); + +static const vector kernel_gamma_inv_dim_plus_one_vec = + FILL_VEC((float)kernel_gamma_inv_dim_plus_one); /* Define constant vectors for the Wendland C2 and Cubic Spline kernel * coefficients. */ @@ -468,15 +472,15 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( w->v = vec_fma(x.v, w->v, wendland_const_c5.v); #elif defined(CUBIC_SPLINE_KERNEL) vector w2, dw_dx2; - vector mask_reg1, mask_reg2; + mask_t mask_reg; - /* Form a mask for each part of the kernel. */ - mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ - mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ - ; + /* Form a mask for one part of the kernel. */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together - * using masks. */ + * using a mask. */ /* Init the iteration for Horner's scheme. */ w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v); @@ -495,20 +499,17 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v); w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v); - /* Mask out unneeded values. */ - w->v = vec_and(w->v, mask_reg1.v); - w2.v = vec_and(w2.v, mask_reg2.v); - dw_dx->v = vec_and(dw_dx->v, mask_reg1.v); - dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v); - - /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v, w2.v); - dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); + /* Blend both kernel regions into one vector (mask out unneeded values). */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + w->v = vec_blend(mask_reg, w->v, w2.v); + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v); + #else -#error +#error "Vectorisation not supported for this kernel!!!" #endif - /* Return everything */ + /* Return everyting */ w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, @@ -580,15 +581,13 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( #elif defined(CUBIC_SPLINE_KERNEL) vector w_2, dw_dx_2; vector w2_2, dw_dx2_2; - vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2; + mask_t mask_reg, mask_reg_v2; - /* Form a mask for each part of the kernel. */ - mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ - mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */ - mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ - ; - mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */ - ; + /* Form a mask for one part of the kernel for each vector. */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ + vec_create_mask(mask_reg_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together * using masks. */ @@ -622,29 +621,23 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c3.v); w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v); - /* Mask out unneeded values. */ - w->v = vec_and(w->v, mask_reg1.v); - w2->v = vec_and(w2->v, mask_reg1_v2.v); - w_2.v = vec_and(w_2.v, mask_reg2.v); - w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v); - dw_dx->v = vec_and(dw_dx->v, mask_reg1.v); - dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v); - dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v); - dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v); - - /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v, w_2.v); - w2->v = vec_add(w2->v, w2_2.v); - dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v); - dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v); + /* Blend both kernel regions into one vector (mask out unneeded values). */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + w->v = vec_blend(mask_reg, w->v, w_2.v); + w2->v = vec_blend(mask_reg_v2, w2->v, w2_2.v); + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx_2.v); + dw_dx2->v = vec_blend(mask_reg_v2, dw_dx2->v, dw_dx2_2.v); /* Return everything */ - w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; - w2->v = w2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; - dw_dx->v = - dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v; - dw_dx2->v = - dw_dx2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v; + w->v = + vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + w2->v = vec_mul(w2->v, + vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); + dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); #endif } @@ -676,12 +669,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, w->v = vec_fma(x.v, w->v, wendland_const_c5.v); #elif defined(CUBIC_SPLINE_KERNEL) vector w2; - vector mask1, mask2; + mask_t mask_reg; /* Form a mask for each part of the kernel. */ - mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ - mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ - ; + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together * using masks. */ @@ -698,13 +691,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v); /* Mask out unneeded values. */ - w->v = vec_and(w->v, mask1.v); - w2.v = vec_and(w2.v, mask2.v); + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + w->v = vec_blend(mask_reg, w->v, w2.v); - /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v, w2.v); #else -#error +#error "Vectorisation not supported for this kernel!!!" #endif /* Return everything */ @@ -741,12 +733,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec( #elif defined(CUBIC_SPLINE_KERNEL) vector dw_dx2; - vector mask1, mask2; + mask_t mask_reg; /* Form a mask for each part of the kernel. */ - mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ - mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ - ; + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together * using masks. */ @@ -760,15 +752,106 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec( dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v); /* Mask out unneeded values. */ - dw_dx->v = vec_and(dw_dx->v, mask1.v); - dw_dx2.v = vec_and(dw_dx2.v, mask2.v); + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v); + +#else +#error "Vectorisation not supported for this kernel!!!" +#endif + + /* Mask out result for particles that lie outside of the kernel function. */ + mask_t mask; + vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ + + dw_dx->v = vec_and_mask(dw_dx->v, mask); + + /* Return everything */ + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); +} + +/** + * @brief Computes the kernel function derivative for two particles + * using interleaved vectors. + * + * Return 0 if $u > \\gamma = H/h$ + * + * @param u The ratio of the distance to the smoothing length $u = x/h$. + * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$. + * @param u_2 The ratio of the distance to the smoothing length $u = x/h$ for + * second particle. + * @param dw_dx_2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for + * second particle. + */ +__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( + vector *u, vector *dw_dx, vector *u_2, vector *dw_dx_2) { + + /* Go to the range [0,1[ from [0,H[ */ + vector x, x_2; + x.v = vec_mul(u->v, kernel_gamma_inv_vec.v); + x_2.v = vec_mul(u_2->v, kernel_gamma_inv_vec.v); + +#ifdef WENDLAND_C2_KERNEL + /* Init the iteration for Horner's scheme. */ + dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v); + dw_dx_2->v = + vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v); + + /* Calculate the polynomial interleaving vector operations */ + dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v); + dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c2.v); + + dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v); + dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c3.v); + + dw_dx->v = vec_mul(dw_dx->v, x.v); + dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); + +#elif defined(CUBIC_SPLINE_KERNEL) + vector dw_dx2, dw_dx2_2; + mask_t mask_reg; + mask_t mask_reg_v2; + + /* Form a mask for one part of the kernel. */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ + vec_create_mask(mask_reg_v2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */ + + /* Work out w for both regions of the kernel and combine the results together + * using masks. */ + + /* Init the iteration for Horner's scheme. */ + dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v); + dw_dx_2->v = vec_fma(cubic_1_dwdx_const_c0.v, x_2.v, cubic_1_dwdx_const_c1.v); + dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v); + dw_dx2_2.v = vec_fma(cubic_2_dwdx_const_c0.v, x_2.v, cubic_2_dwdx_const_c1.v); + + /* Calculate the polynomial interleaving vector operations. */ + dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */ + dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); /* cubic_1_dwdx_const_c2 is zero. */ + dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v); + dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v); + + /* Mask out unneeded values. */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v); + dw_dx_2->v = vec_blend(mask_reg_v2, dw_dx_2->v, dw_dx2_2.v); - /* Added both dwdx and dwdx2 together to form complete result. */ - dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); #else -#error +#error "Vectorisation not supported for this kernel!!!" #endif + /* Mask out result for particles that lie outside of the kernel function. */ + mask_t mask, mask_2; + vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ + vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f))); /* x < 1 */ + + dw_dx->v = vec_and_mask(dw_dx->v, mask); + dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_2); + /* Return everything */ dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); diff --git a/src/parser.c b/src/parser.c index 641e2fdac6c286e0c1e1a1c07f4badcb591112c2..0b608b29263342240af68fd99d2fdd3241e2a1e6 100644 --- a/src/parser.c +++ b/src/parser.c @@ -133,8 +133,8 @@ void parser_set_param(struct swift_params *params, const char *namevalue) { int updated = 0; for (int i = 0; i < params->paramCount; i++) { if (strcmp(name, params->data[i].name) == 0) { - message("Using the value '%s' instead of '%s' for the parameter '%s'", - value, params->data[i].value, params->data[i].name); + message("Value of '%s' changed from '%s' to '%s'", params->data[i].name, + params->data[i].value, value); strcpy(params->data[i].value, value); updated = 1; } diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 47ebba61d9a77486448ba2cdb25c9c0a448b9bac..906e9ad35572b643a5c2126f3dc25f67532e30a4 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -1037,9 +1037,9 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, struct part *restrict pj = &parts_j[sort_j[pjd].i]; if (!part_is_active(pj, e)) continue; const float hj = pj->h; - const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; - if (dj > di_max) continue; - + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift; + if (dj - rshift > di_max) continue; + double pjx[3]; for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; const float hjg2 = hj * hj * kernel_gamma2; @@ -1449,9 +1449,9 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i]; const float hj = pj->h; - const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; - if (dj > di_max) continue; - + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift; + if (dj - rshift > di_max) continue; + double pjx[3]; for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; const float hjg2 = hj * hj * kernel_gamma2; diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 5b22a67bbf757e09f144ba3accc2ef5adfb706f0..1557c1ec84863c3518f8671cdf0e9ca44732e94e 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -63,10 +63,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz, int *icount_align) { -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask, knl_mask2; -#endif - vector int_mask, int_mask2; + mask_t int_mask, int_mask2; /* Work out the number of remainder interactions and pad secondary cache. */ *icount_align = icount; @@ -75,16 +72,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( int pad = (NUM_VEC_PROC * VEC_SIZE) - rem; *icount_align += pad; -/* Initialise masks to true. */ -#ifdef HAVE_AVX512_F - knl_mask = 0xFFFF; - knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#endif + /* Initialise masks to true. */ + vec_init_mask_true(int_mask); + vec_init_mask_true(int_mask2); + /* Pad secondary cache so that there are no contributions in the interaction * function. */ for (int i = icount; i < *icount_align; i++) { @@ -100,19 +91,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( /* Zero parts of mask that represent the padded values.*/ if (pad < VEC_SIZE) { -#ifdef HAVE_AVX512_F - knl_mask2 = knl_mask2 >> pad; -#else - for (int i = VEC_SIZE - pad; i < VEC_SIZE; i++) int_mask2.i[i] = 0; -#endif + vec_pad_mask(int_mask2, pad); } else { -#ifdef HAVE_AVX512_F - knl_mask = knl_mask >> (VEC_SIZE - rem); - knl_mask2 = 0; -#else - for (int i = rem; i < VEC_SIZE; i++) int_mask.i[i] = 0; - int_mask2.v = vec_setzero(); -#endif + vec_pad_mask(int_mask, VEC_SIZE - rem); + vec_zero_mask(int_mask2); } /* Perform remainder interaction and remove remainder from aligned @@ -125,12 +107,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align], &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, - int_mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); -#endif + int_mask2, 1); } } @@ -144,10 +121,6 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( * @param v_dx #vector of the x separation between two particles. * @param v_dy #vector of the y separation between two particles. * @param v_dz #vector of the z separation between two particles. - * @param v_mj #vector of the mass of particle pj. - * @param v_vjx #vector of x velocity of pj. - * @param v_vjy #vector of y velocity of pj. - * @param v_vjz #vector of z velocity of pj. * @param cell_cache #cache of all particles in the cell. * @param int_cache (return) secondary #cache of interactions between two * particles. @@ -174,44 +147,33 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( */ __attribute__((always_inline)) INLINE static void storeInteractions( const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, - vector *v_dz, vector *v_mj, vector *v_vjx, vector *v_vjy, vector *v_vjz, - const struct cache *const cell_cache, struct c2_cache *const int_cache, - int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, - vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, - vector *curlvySum, vector *curlvzSum, vector v_hi_inv, vector v_vix, - vector v_viy, vector v_viz) { + vector *v_dz, const struct cache *const cell_cache, + struct c2_cache *const int_cache, int *icount, vector *rhoSum, + vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, + vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, + vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) { /* Left-pack values needed into the secondary cache using the interaction mask. */ #if defined(HAVE_AVX2) || defined(HAVE_AVX512_F) - int pack = 0; - -#ifdef HAVE_AVX512_F - pack += __builtin_popcount(mask); - VEC_LEFT_PACK(v_r2->v, mask, &int_cache->r2q[*icount]); - VEC_LEFT_PACK(v_dx->v, mask, &int_cache->dxq[*icount]); - VEC_LEFT_PACK(v_dy->v, mask, &int_cache->dyq[*icount]); - VEC_LEFT_PACK(v_dz->v, mask, &int_cache->dzq[*icount]); - VEC_LEFT_PACK(v_mj->v, mask, &int_cache->mq[*icount]); - VEC_LEFT_PACK(v_vjx->v, mask, &int_cache->vxq[*icount]); - VEC_LEFT_PACK(v_vjy->v, mask, &int_cache->vyq[*icount]); - VEC_LEFT_PACK(v_vjz->v, mask, &int_cache->vzq[*icount]); -#else - vector v_mask; - VEC_FORM_PACKED_MASK(mask, v_mask.m, pack); - - VEC_LEFT_PACK(v_r2->v, v_mask.m, &int_cache->r2q[*icount]); - VEC_LEFT_PACK(v_dx->v, v_mask.m, &int_cache->dxq[*icount]); - VEC_LEFT_PACK(v_dy->v, v_mask.m, &int_cache->dyq[*icount]); - VEC_LEFT_PACK(v_dz->v, v_mask.m, &int_cache->dzq[*icount]); - VEC_LEFT_PACK(v_mj->v, v_mask.m, &int_cache->mq[*icount]); - VEC_LEFT_PACK(v_vjx->v, v_mask.m, &int_cache->vxq[*icount]); - VEC_LEFT_PACK(v_vjy->v, v_mask.m, &int_cache->vyq[*icount]); - VEC_LEFT_PACK(v_vjz->v, v_mask.m, &int_cache->vzq[*icount]); - -#endif /* HAVE_AVX512_F */ - - (*icount) += pack; + mask_t packed_mask; + VEC_FORM_PACKED_MASK(mask, packed_mask); + + VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]); + VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]); + VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]); + VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask, + &int_cache->mq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask, + &int_cache->vxq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask, + &int_cache->vyq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask, + &int_cache->vzq[*icount]); + + /* Increment interaction count by number of bits set in mask. */ + (*icount) += __builtin_popcount(mask); #else /* Quicker to do it serially in AVX rather than use intrinsics. */ for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) { @@ -242,9 +204,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions( wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, &icount_align); - vector int_mask, int_mask2; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); + mask_t int_mask, int_mask2; + vec_init_mask_true(int_mask); + vec_init_mask_true(int_mask2); /* Perform interactions. */ for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { @@ -253,7 +215,7 @@ __attribute__((always_inline)) INLINE static void storeInteractions( &int_cache->dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[pjd], &int_cache->vyq[pjd], &int_cache->vzq[pjd], &int_cache->mq[pjd], rhoSum, rho_dhSum, wcountSum, wcount_dhSum, - div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0, 0); + div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0); } /* Reset interaction count. */ @@ -262,7 +224,8 @@ __attribute__((always_inline)) INLINE static void storeInteractions( } /** - * @brief Populates the arrays max_di and max_dj with the maximum distances of + * @brief Populates the arrays max_index_i and max_index_j with the maximum + * indices of * particles into their neighbouring cells. Also finds the first pi that * interacts with any particle in cj and the last pj that interacts with any * particle in ci. @@ -277,77 +240,121 @@ __attribute__((always_inline)) INLINE static void storeInteractions( * @param hj_max Maximal smoothing length in cell cj * @param di_max Maximal position on the axis that can interact in cell ci * @param dj_min Minimal position on the axis that can interact in cell ci - * @param max_di array to hold the maximum distances of pi particles into cell + * @param max_index_i array to hold the maximum distances of pi particles into + * cell * cj - * @param max_dj array to hold the maximum distances of pj particles into cell + * @param max_index_j array to hold the maximum distances of pj particles into + * cell * cj * @param init_pi first pi to interact with a pj particle * @param init_pj last pj to interact with a pi particle * @param e The #engine. */ -__attribute__((always_inline)) INLINE static void populate_max_d_no_cache( +__attribute__((always_inline)) INLINE static void populate_max_index_no_cache( const struct cell *ci, const struct cell *cj, const struct entry *restrict sort_i, const struct entry *restrict sort_j, const float dx_max, const float rshift, const double hi_max, const double hj_max, const double di_max, const double dj_min, - float *max_di, float *max_dj, int *init_pi, int *init_pj, + int *max_index_i, int *max_index_j, int *init_pi, int *init_pj, const struct engine *e) { const struct part *restrict parts_i = ci->parts; const struct part *restrict parts_j = cj->parts; int first_pi = 0, last_pj = cj->count - 1; + int temp; + + /* Find the leftmost active particle in cell i that interacts with any + * particle in cell j. */ + first_pi = ci->count; + int active_id = first_pi - 1; + while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min) { + first_pi--; + /* Store the index of the particle if it is active. */ + if (part_is_active(&parts_i[sort_i[first_pi].i], e)) active_id = first_pi; + } - /* Find the first active particle in ci to interact with any particle in cj. - */ - /* Populate max_di with distances. */ - int active_id = ci->count - 1; - for (int k = ci->count - 1; k >= 0; k--) { - const struct part *pi = &parts_i[sort_i[k].i]; - const float d = sort_i[k].d + dx_max; - - // max_di[k] = d + h * kernel_gamma - rshift; - max_di[k] = d + hi_max; - - /* If the particle is out of range set the index to - * the last active particle within range. */ - if (d + hi_max < dj_min) { - first_pi = active_id; - break; - } else { - if (part_is_active(pi, e)) active_id = k; + /* Set the first active pi in range of any particle in cell j. */ + first_pi = active_id; + + /* Find the maximum index into cell j for each particle in range in cell i. */ + if (first_pi < ci->count) { + + /* Start from the first particle in cell j. */ + temp = 0; + + const struct part *pi = &parts_i[sort_i[first_pi].i]; + + /* Loop through particles in cell j until they are not in range of pi. */ + while (temp <= cj->count && + (sort_i[first_pi].d + (pi->h * kernel_gamma + dx_max - rshift) > + sort_j[temp].d)) + temp++; + + max_index_i[first_pi] = temp; + + /* Populate max_index_i for remaining particles that are within range. */ + for (int i = first_pi + 1; i < ci->count; i++) { + temp = max_index_i[i - 1]; + pi = &parts_i[sort_i[i].i]; + + while (temp <= cj->count && + (sort_i[i].d + (pi->h * kernel_gamma + dx_max - rshift) > + sort_j[temp].d)) + temp++; + + max_index_i[i] = temp; } + } else { + /* Make sure that max index is set to first particle in cj.*/ + max_index_i[ci->count - 1] = 0; } - /* Find the maximum distance of pi particles into cj.*/ - for (int k = first_pi + 1; k < ci->count; k++) { - max_di[k] = fmaxf(max_di[k - 1], max_di[k]); + /* Find the rightmost active particle in cell j that interacts with any + * particle in cell i. */ + last_pj = -1; + active_id = last_pj; + while (last_pj < cj->count && + sort_j[last_pj + 1].d - hj_max - dx_max < di_max) { + last_pj++; + /* Store the index of the particle if it is active. */ + if (part_is_active(&parts_j[sort_j[last_pj].i], e)) active_id = last_pj; } - /* Find the last particle in cj to interact with any particle in ci. */ - /* Populate max_dj with distances. */ - active_id = 0; - for (int k = 0; k < cj->count; k++) { - const struct part *pj = &parts_j[sort_j[k].i]; - const float d = sort_j[k].d - dx_max; - - /*TODO: don't think rshift should be taken off here, waiting on Pedro. */ - // max_dj[k] = d - h * kernel_gamma - rshift; - max_dj[k] = d - hj_max; - - /* If the particle is out of range set the index to - * the last active particle within range. */ - if (d - hj_max > di_max) { - last_pj = active_id; - break; - } else { - if (part_is_active(pj, e)) active_id = k; - } - } + /* Set the last active pj in range of any particle in cell i. */ + last_pj = active_id; + + /* Find the maximum index into cell i for each particle in range in cell j. */ + if (last_pj > 0) { + + /* Start from the last particle in cell i. */ + temp = ci->count - 1; + + const struct part *pj = &parts_j[sort_j[last_pj].i]; + + /* Loop through particles in cell i until they are not in range of pj. */ + while (temp > 0 && + sort_j[last_pj].d - dx_max - (pj->h * kernel_gamma) < + sort_i[temp].d - rshift) + temp--; - /* Find the maximum distance of pj particles into ci.*/ - for (int k = 1; k <= last_pj; k++) { - max_dj[k] = fmaxf(max_dj[k - 1], max_dj[k]); + max_index_j[last_pj] = temp; + + /* Populate max_index_j for remaining particles that are within range. */ + for (int i = last_pj - 1; i >= 0; i--) { + temp = max_index_j[i + 1]; + pj = &parts_j[sort_j[i].i]; + + while (temp > 0 && + sort_j[i].d - dx_max - (pj->h * kernel_gamma) < + sort_i[temp].d - rshift) + temp--; + + max_index_j[i] = temp; + } + } else { + /* Make sure that max index is set to last particle in ci.*/ + max_index_j[0] = ci->count - 1; } *init_pi = first_pi; @@ -367,7 +374,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( #ifdef WITH_VECTORIZATION const struct engine *e = r->e; - int doi_mask; struct part *restrict pi; int count_align; int num_vec_proc = NUM_VEC_PROC; @@ -459,9 +465,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( } vector pjx, pjy, pjz; - vector pjvx, pjvy, pjvz, mj; vector pjx2, pjy2, pjz2; - vector pjvx2, pjvy2, pjvz2, mj2; /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ @@ -471,18 +475,10 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( pjx.v = vec_load(&cell_cache->x[pjd]); pjy.v = vec_load(&cell_cache->y[pjd]); pjz.v = vec_load(&cell_cache->z[pjd]); - pjvx.v = vec_load(&cell_cache->vx[pjd]); - pjvy.v = vec_load(&cell_cache->vy[pjd]); - pjvz.v = vec_load(&cell_cache->vz[pjd]); - mj.v = vec_load(&cell_cache->m[pjd]); pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); - pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]); - pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]); - pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]); - mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]); /* Compute the pairwise distance. */ vector v_dx_tmp, v_dy_tmp, v_dz_tmp; @@ -502,53 +498,47 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v); -/* Form a mask from r2 < hig2 and r2 > 0.*/ -#ifdef HAVE_AVX512_F - // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check; - KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check; - - doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero()); - doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v); - - doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero()); - doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v); - - doi_mask = doi_mask & doi_mask_check; - doi_mask2 = doi_mask2 & doi_mask2_check; - -#else - vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check; - int doi_mask2; + /* Form a mask from r2 < hig2 and r2 > 0.*/ + mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2, + v_doi_mask2_self_check; + int doi_mask, doi_mask_self_check, doi_mask2, doi_mask2_self_check; /* Form r2 > 0 mask and r2 < hig2 mask. */ - v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero()); - v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); + vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero())); + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v)); /* Form r2 > 0 mask and r2 < hig2 mask. */ - v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero()); - v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); + vec_create_mask(v_doi_mask2_self_check, + vec_cmp_gt(v_r2_2.v, vec_setzero())); + vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v)); - /* Combine two masks and form integer mask. */ - doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v)); - doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v)); -#endif /* HAVE_AVX512_F */ + /* Form integer masks. */ + doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check); + doi_mask = vec_form_int_mask(v_doi_mask); + + doi_mask2_self_check = vec_form_int_mask(v_doi_mask2_self_check); + doi_mask2 = vec_form_int_mask(v_doi_mask2); + + /* Combine the two masks. */ + doi_mask = doi_mask & doi_mask_self_check; + doi_mask2 = doi_mask2 & doi_mask2_self_check; /* If there are any interactions left pack interaction values into c2 * cache. */ if (doi_mask) { storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, - &mj, &pjvx, &pjvy, &pjvz, cell_cache, &int_cache, + cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum, + &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, + &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, + v_viz); + } + if (doi_mask2) { + storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, + &v_dy_tmp2, &v_dz_tmp2, cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); } - if (doi_mask2) { - storeInteractions( - doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, &v_dy_tmp2, - &v_dz_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2, cell_cache, &int_cache, - &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, - &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); - } } /* Perform padded vector remainder interactions if any are present. */ @@ -559,16 +549,9 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( /* Initialise masks to true in case remainder interactions have been * performed. */ - vector int_mask, int_mask2; -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask = 0xFFFF; - KNL_MASK_16 knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#endif + mask_t int_mask, int_mask2; + vec_init_mask_true(int_mask); + vec_init_mask_true(int_mask2); /* Perform interaction with 2 vectors. */ for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) { @@ -578,11 +561,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); -#endif + 0); } /* Perform horizontal adds on vector sums and store result in particle pi. @@ -714,38 +693,19 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, } int first_pi, last_pj; - float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE))); - float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE))); + int *max_index_i __attribute__((aligned(sizeof(int) * VEC_SIZE))); + int *max_index_j __attribute__((aligned(sizeof(int) * VEC_SIZE))); - max_di = r->ci_cache.max_d; - max_dj = r->cj_cache.max_d; + max_index_i = r->ci_cache.max_index; + max_index_j = r->cj_cache.max_index; - /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ + /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[]. + */ /* Also find the first pi that interacts with any particle in cj and the last * pj that interacts with any particle in ci. */ - populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max, - hj_max, di_max, dj_min, max_di, max_dj, &first_pi, - &last_pj, e); - - /* Find the maximum index into cj that is required by a particle in ci. */ - /* Find the maximum index into ci that is required by a particle in cj. */ - float di, dj; - int max_ind_j = count_j - 1; - int max_ind_i = 0; - - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[count_i - 1] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[0] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } + populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max, + hj_max, di_max, dj_min, max_index_i, max_index_j, + &first_pi, &last_pj, e); /* Limits of the outer loops. */ int first_pi_loop = first_pi; @@ -753,8 +713,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Take the max/min of both values calculated to work out how many particles * to read into the cache. */ - last_pj = max(last_pj, max_ind_j); - first_pi = min(first_pi, max_ind_i); + last_pj = max(last_pj, max_index_i[count_i - 1]); + first_pi = min(first_pi, max_index_j[0]); /* Read the needed particles into the two caches. */ int first_pi_align = first_pi; @@ -769,7 +729,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (cell_is_active(ci, e)) { /* Loop over the parts in ci until nothing is within range in cj. */ - for (int pid = count_i - 1; pid >= first_pi_loop && max_ind_j >= 0; pid--) { + for (int pid = count_i - 1; pid >= first_pi_loop; pid--) { /* Get a hold of the ith part in ci. */ struct part *restrict pi = &parts_i[sort_i[pid].i]; @@ -785,13 +745,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (di_test < dj_min) continue; /* Determine the exit iteration of the interaction loop. */ - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[pid] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - int exit_iteration = max_ind_j + 1; + int exit_iteration = max_index_i[pid]; const float hig2 = hi * hi * kernel_gamma2; @@ -866,14 +820,14 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - vector v_doi_mask; + mask_t v_doi_mask; int doi_mask; /* Form r2 < hig2 mask. */ - v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v)); /* Form integer mask. */ - doi_mask = vec_cmp_result(v_doi_mask.v); + doi_mask = vec_form_int_mask(v_doi_mask); /* If there are any interactions perform them. */ if (doi_mask) @@ -882,12 +836,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doi_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif + &curlvySum, &curlvzSum, v_doi_mask); } /* loop over the parts in cj. */ @@ -908,7 +857,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (cell_is_active(cj, e)) { /* Loop over the parts in cj until nothing is within range in ci. */ - for (int pjd = 0; pjd <= last_pj_loop && max_ind_i < count_i; pjd++) { + for (int pjd = 0; pjd <= last_pj_loop; pjd++) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i]; @@ -925,13 +874,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (dj_test > di_max) continue; /* Determine the exit iteration of the interaction loop. */ - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[pjd] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } - int exit_iteration = max_ind_i; + int exit_iteration = max_index_j[pjd]; const float hjg2 = hj * hj * kernel_gamma2; @@ -1005,14 +948,14 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - vector v_doj_mask; + mask_t v_doj_mask; int doj_mask; /* Form r2 < hig2 mask. */ - v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v); + vec_create_mask(v_doj_mask, vec_cmp_lt(v_r2.v, v_hjg2.v)); /* Form integer mask. */ - doj_mask = vec_cmp_result(v_doj_mask.v); + doj_mask = vec_form_int_mask(v_doj_mask); /* If there are any interactions perform them. */ if (doj_mask) @@ -1021,12 +964,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doj_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif + &curlvySum, &curlvzSum, v_doj_mask); } /* loop over the parts in ci. */ diff --git a/src/scheduler.h b/src/scheduler.h index cdb3a28f1a3482e934480b0c0ccd18f18b66d437..ac654580b2af2ffb506dc3fd9f0b988b89effbd0 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -52,7 +52,7 @@ /* Flags . */ #define scheduler_flag_none 0 -#define scheduler_flag_steal 1 +#define scheduler_flag_steal (1 << 1) /* Data of a scheduler. */ struct scheduler { diff --git a/src/tools.c b/src/tools.c index 3f866710c387a858c23ed052be1ad463222601d5..3ee55db3d5f5348699372d2620b6d15af38b23d0 100644 --- a/src/tools.c +++ b/src/tools.c @@ -32,6 +32,7 @@ #include "tools.h" /* Local includes. */ +#include "active.h" #include "cell.h" #include "error.h" #include "gravity.h" @@ -183,6 +184,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { float r2, hi, hj, hig2, hjg2, dx[3]; struct part *pi, *pj; const double dim[3] = {r->e->s->dim[0], r->e->s->dim[1], r->e->s->dim[2]}; + const struct engine *e = r->e; /* Implements a double-for loop and checks every interaction */ for (int i = 0; i < ci->count; ++i) { @@ -191,6 +193,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { hi = pi->h; hig2 = hi * hi * kernel_gamma2; + /* Skip inactive particles. */ + if (!part_is_active(pi, e)) continue; + for (int j = 0; j < cj->count; ++j) { pj = &cj->parts[j]; @@ -219,6 +224,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { hj = pj->h; hjg2 = hj * hj * kernel_gamma2; + /* Skip inactive particles. */ + if (!part_is_active(pj, e)) continue; + for (int i = 0; i < ci->count; ++i) { pi = &ci->parts[i]; @@ -311,6 +319,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) { void self_all_density(struct runner *r, struct cell *ci) { float r2, hi, hj, hig2, hjg2, dxi[3]; //, dxj[3]; struct part *pi, *pj; + const struct engine *e = r->e; /* Implements a double-for loop and checks every interaction */ for (int i = 0; i < ci->count; ++i) { @@ -335,14 +344,14 @@ void self_all_density(struct runner *r, struct cell *ci) { } /* Hit or miss? */ - if (r2 < hig2) { + if (r2 < hig2 && part_is_active(pi, e)) { /* Interact */ runner_iact_nonsym_density(r2, dxi, hi, hj, pi, pj); } /* Hit or miss? */ - if (r2 < hjg2) { + if (r2 < hjg2 && part_is_active(pj, e)) { dxi[0] = -dxi[0]; dxi[1] = -dxi[1]; diff --git a/src/vector.h b/src/vector.h index 6d5389039c028e4307d4e559dd6567cce6c25392..6a7c6837989025785c1f9134004f2ebcc226a205 100644 --- a/src/vector.h +++ b/src/vector.h @@ -64,9 +64,13 @@ #define vec_dbl_set(a, b, c, d, e, f, g, h) \ _mm512_set_pd(h, g, f, e, d, c, b, a) #define vec_add(a, b) _mm512_add_ps(a, b) +#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a) #define vec_sub(a, b) _mm512_sub_ps(a, b) +#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b) #define vec_mul(a, b) _mm512_mul_ps(a, b) +#define vec_div(a, b) _mm512_div_ps(a, b) #define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c) +#define vec_fnma(a, b, c) _mm512_fnmadd_ps(a, b, c) #define vec_sqrt(a) _mm512_sqrt_ps(a) #define vec_rcp(a) _mm512_rcp14_ps(a) #define vec_rsqrt(a) _mm512_rsqrt14_ps(a) @@ -79,7 +83,16 @@ #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ) #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ) #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ) +#define vec_cmp_result(a) ({ a; }) +#define vec_form_int_mask(a) ({ a; }) #define vec_and(a, b) _mm512_and_ps(a, b) +#define vec_mask_and(a, b) _mm512_kand(a, b) +#define vec_and_mask(a, mask) _mm512_maskz_mov_ps(mask, a) +#define vec_init_mask_true(mask) ({ mask = 0xFFFF; }) +#define vec_zero_mask(mask) ({ mask = 0; }) +#define vec_create_mask(mask, cond) ({ mask = cond; }) +#define vec_pad_mask(mask, pad) ({ mask = mask >> (pad); }) +#define vec_blend(mask, a, b) _mm512_mask_blend_ps(mask, a, b) #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1) @@ -120,10 +133,13 @@ for (int i = 0; i < VEC_SIZE; i++) b += a.f[i]; \ } #endif -/* Calculates the number of set bits in the mask and adds the result to an int. - */ -#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \ - pack += __builtin_popcount(mask); + +/* Do nothing in the case of AVX-512 as there are already + * instructions for left-packing.*/ +#define VEC_FORM_PACKED_MASK(mask, packed_mask) packed_mask = mask + +/* Finds the horizontal maximum of vector b and returns a float. */ +#define VEC_HMAX(a, b) b = _mm512_reduce_max_ps(a.v) /* Performs a left-pack on a vector based upon a mask and returns the result. */ #define VEC_LEFT_PACK(a, mask, result) \ @@ -146,8 +162,11 @@ #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a) #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a) #define vec_add(a, b) _mm256_add_ps(a, b) +#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b, mask.v)) #define vec_sub(a, b) _mm256_sub_ps(a, b) +#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask.v)) #define vec_mul(a, b) _mm256_mul_ps(a, b) +#define vec_div(a, b) _mm256_div_ps(a, b) #define vec_sqrt(a) _mm256_sqrt_ps(a) #define vec_rcp(a) _mm256_rcp_ps(a) #define vec_rsqrt(a) _mm256_rsqrt_ps(a) @@ -161,7 +180,16 @@ #define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ) #define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) #define vec_cmp_result(a) _mm256_movemask_ps(a) +#define vec_form_int_mask(a) _mm256_movemask_ps(a.v) #define vec_and(a, b) _mm256_and_ps(a, b) +#define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v) +#define vec_and_mask(a, mask) _mm256_and_ps(a, mask.v) +#define vec_init_mask_true(mask) mask.m = vec_setint1(0xFFFFFFFF) +#define vec_create_mask(mask, cond) mask.v = cond +#define vec_zero_mask(mask) mask.v = vec_setzero() +#define vec_pad_mask(mask, pad) \ + for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0 +#define vec_blend(mask, a, b) _mm256_blendv_ps(a, b, mask.v) #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1) @@ -187,6 +215,13 @@ a.v = _mm256_hadd_ps(a.v, a.v); \ b += a.f[0] + a.f[4]; +/* Performs a horizontal maximum on the vector and takes the maximum of the + * result with a float, b. */ +#define VEC_HMAX(a, b) \ + { \ + for (int k = 0; k < VEC_SIZE; k++) b = max(b, a.f[k]); \ + } + /* Returns the lower 128-bits of the 256-bit vector. */ #define VEC_GET_LOW(a) _mm256_castps256_ps128(a) @@ -196,6 +231,7 @@ /* Check if we have AVX2 intrinsics alongside AVX */ #ifdef HAVE_AVX2 #define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c) +#define vec_fnma(a, b, c) _mm256_fnmadd_ps(a, b, c) /* Used in VEC_FORM_PACKED_MASK */ #define identity_indices 0x0706050403020100 @@ -205,19 +241,18 @@ /* Takes an integer mask and forms a left-packed integer vector * containing indices of the set bits in the integer mask. * Also returns the total number of bits set in the mask. */ -#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \ +#define VEC_FORM_PACKED_MASK(mask, packed_mask) \ { \ unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101); \ expanded_mask *= 0xFF; \ unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask); \ __m128i bytevec = _mm_cvtsi64_si128(wanted_indices); \ - v_mask = _mm256_cvtepu8_epi32(bytevec); \ - pack += __builtin_popcount(mask); \ + packed_mask.m = _mm256_cvtepu8_epi32(bytevec); \ } /* Performs a left-pack on a vector based upon a mask and returns the result. */ #define VEC_LEFT_PACK(a, mask, result) \ - vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask), result) + vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask.m), result) #endif /* HAVE_AVX2 */ /* Create an FMA using vec_add and vec_mul if AVX2 is not present. */ @@ -225,6 +260,11 @@ #define vec_fma(a, b, c) vec_add(vec_mul(a, b), c) #endif +/* Create a negated FMA using vec_sub and vec_mul if AVX2 is not present. */ +#ifndef vec_fnma +#define vec_fnma(a, b, c) vec_sub(c, vec_mul(a, b)) +#endif + /* Form a packed mask without intrinsics if AVX2 is not present. */ #ifndef VEC_FORM_PACKED_MASK @@ -288,6 +328,7 @@ #define vec_add(a, b) _mm_add_ps(a, b) #define vec_sub(a, b) _mm_sub_ps(a, b) #define vec_mul(a, b) _mm_mul_ps(a, b) +#define vec_div(a, b) _mm_div_ps(a, b) #define vec_sqrt(a) _mm_sqrt_ps(a) #define vec_rcp(a) _mm_rcp_ps(a) #define vec_rsqrt(a) _mm_rsqrt_ps(a) @@ -340,6 +381,13 @@ typedef union { int i[VEC_SIZE]; } vector; +/* Define the mask type depending on the instruction set used. */ +#ifdef HAVE_AVX512_F +typedef __mmask16 mask_t; +#else +typedef vector mask_t; +#endif + /** * @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a * Newton iteration to obtain the correct level of accuracy. diff --git a/tests/Makefile.am b/tests/Makefile.am index c4ed2512fc49276ad09dd1c4ef7a0f28207a1f77..bf4c205b5ab5000e246693c4d45772d16f67a5e5 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -21,16 +21,16 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(FFTW_LIBS # List of programs and scripts to run in the test suite TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetry \ - testPair.sh testPairPerturbed.sh test27cells.sh test27cellsPerturbed.sh \ + testActivePair.sh test27cells.sh test27cellsPerturbed.sh \ testParser.sh testSPHStep test125cells.sh test125cellsPerturbed.sh testFFT \ testAdiabaticIndex testRiemannExact testRiemannTRRS testRiemannHLLC \ - testMatrixInversion testThreadpool testDump testLogger \ + testMatrixInversion testThreadpool testDump testLogger testInteractions.sh \ testVoronoi1D testVoronoi2D testVoronoi3D testGravityDerivatives \ testPeriodicBC.sh testPeriodicBCPerturbed.sh # List of test programs to compile check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \ - testSPHStep testPair test27cells test125cells testParser \ + testSPHStep testActivePair test27cells test125cells testParser \ testKernel testFFT testInteractions testMaths \ testSymmetry testThreadpool benchmarkInteractions \ testAdiabaticIndex testRiemannExact testRiemannTRRS \ @@ -56,7 +56,7 @@ testSPHStep_SOURCES = testSPHStep.c testSingle_SOURCES = testSingle.c -testPair_SOURCES = testPair.c +testActivePair_SOURCES = testActivePair.c test27cells_SOURCES = test27cells.c @@ -99,10 +99,10 @@ testLogger_SOURCES = testLogger.c testGravityDerivatives_SOURCES = testGravityDerivatives.c # Files necessary for distribution -EXTRA_DIST = testReading.sh makeInput.py testPair.sh testPairPerturbed.sh \ +EXTRA_DIST = testReading.sh makeInput.py testActivePair.sh \ test27cells.sh test27cellsPerturbed.sh testParser.sh testPeriodicBC.sh \ testPeriodicBCPerturbed.sh test125cells.sh test125cellsPerturbed.sh testParserInput.yaml \ difffloat.py tolerance_125_normal.dat tolerance_125_perturbed.dat \ - tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat \ - tolerance_pair_normal.dat tolerance_pair_perturbed.dat \ + tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat tolerance_27_perturbed_h2.dat \ + tolerance_testInteractions.dat tolerance_pair_active.dat \ fft_params.yml tolerance_periodic_BC_normal.dat tolerance_periodic_BC_perturbed.dat diff --git a/tests/benchmarkInteractions.c b/tests/benchmarkInteractions.c index a11862ebaa345623ca65cd7025b4f4e79769d0ac..5dc7a4ae85e62313a84d0c0be88b65a1c0bb59df 100644 --- a/tests/benchmarkInteractions.c +++ b/tests/benchmarkInteractions.c @@ -31,6 +31,7 @@ #define IACT runner_iact_nonsym_density #define IACT_VEC runner_iact_nonsym_2_vec_density #define IACT_NAME "test_nonsym_density" +#define NUM_VEC_PROC_INT 2 #endif #ifdef SYM_DENSITY @@ -53,8 +54,9 @@ #ifndef IACT #define IACT runner_iact_nonsym_density -#define IACT_VEC runner_iact_nonsym_2_vec_density +#define IACT_VEC runner_iact_nonsym_1_vec_density #define IACT_NAME "test_nonsym_density" +#define NUM_VEC_PROC_INT 1 #endif /** @@ -368,7 +370,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, /* Perform vector interaction. */ #ifdef WITH_VECTORIZATION - vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec, mask, mask2; + vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec; vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum; @@ -387,28 +389,35 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, viz_vec.v = vec_load(&vizq[0]); hi_inv_vec = vec_reciprocal(hi_vec); - mask.m = vec_setint1(0xFFFFFFFF); - mask2.m = vec_setint1(0xFFFFFFFF); -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask, knl_mask2; - knl_mask = 0xFFFF; - knl_mask2 = 0xFFFF; + mask_t mask; + vec_init_mask_true(mask); +#if (NUM_VEC_PROC_INT == 2) + mask_t mask2; + vec_init_mask_true(mask2); #endif - const ticks vec_tic = getticks(); - for (size_t i = 0; i < count; i += 2 * VEC_SIZE) { + for (size_t i = 0; i < count; i += NUM_VEC_PROC_INT * VEC_SIZE) { +/* Interleave two vectors for interaction. */ +#if (NUM_VEC_PROC_INT == 2) IACT_VEC(&(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec), (vix_vec), (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, - mask, mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); + mask, mask2, 0); +#else /* Only use one vector for interaction. */ + vector r2, dx, dy, dz; + r2.v = vec_load(&(r2q[i])); + dx.v = vec_load(&(dxq[i])); + dy.v = vec_load(&(dyq[i])); + dz.v = vec_load(&(dzq[i])); + + IACT_VEC(&r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec), + (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), + &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, + &curlvxSum, &curlvySum, &curlvzSum, mask); #endif } diff --git a/tests/test125cells.c b/tests/test125cells.c index 1bddd5dd2d4933e3b2d6c802d1325f3b88ba6ba5..2b35f7b5d509a7f966d07d6da82616869f29ab66 100644 --- a/tests/test125cells.c +++ b/tests/test125cells.c @@ -440,6 +440,7 @@ void runner_doself2_force(struct runner *r, struct cell *ci); /* And go... */ int main(int argc, char *argv[]) { + engine_pin(); size_t runs = 0, particles = 0; double h = 1.23485, size = 1., rho = 2.5; double perturbation = 0.; diff --git a/tests/test27cells.c b/tests/test27cells.c index c4df8356eb6494cc34b91c6ab5038b3866b14056..7ba1eec9ad279f09f63021e332dac1cfd5cc1505 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -122,7 +122,7 @@ struct cell *make_cell(size_t n, double *offset, double size, double h, break; } if (h_pert) - part->h = size * h * random_uniform(1.f, 1.1f) / (float)n; + part->h = size * h * random_uniform(1.f, h_pert) / (float)n; else part->h = size * h / (float)n; h_max = fmaxf(h_max, part->h); diff --git a/tests/test27cells.sh.in b/tests/test27cells.sh.in index a313a594c10c80598852a92e7920ac846e15ba2d..059a7a208aa8e570ad5035fac16ffd201bf3dddd 100755 --- a/tests/test27cells.sh.in +++ b/tests/test27cells.sh.in @@ -56,4 +56,32 @@ do done +# Test for particles with random smoothing lengths +for v in {0..3} +do + echo "" + + rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat + + echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3" + ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3 + + if [ -e brute_force_27_standard.dat ] + then + if python @srcdir@/difffloat.py brute_force_27_standard.dat swift_dopair_27_standard.dat @srcdir@/tolerance_27_perturbed_h2.dat 6 + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi + + echo "------------" + +done + exit $? diff --git a/tests/test27cellsPerturbed.sh.in b/tests/test27cellsPerturbed.sh.in index 4adbe0b9ca6d8e1cfe7df861c2bf88bf35832983..f875504e541588377ca6e40fe55681ebec3466f6 100755 --- a/tests/test27cellsPerturbed.sh.in +++ b/tests/test27cellsPerturbed.sh.in @@ -56,4 +56,31 @@ do done +# Test for particles with random smoothing lengths +for v in {0..3} +do + echo "" + + rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat + + echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3" + ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3 + + if [ -e brute_force_27_perturbed.dat ] + then + if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed_h2.dat 6 + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi + + echo "------------" + +done exit $? diff --git a/tests/testActivePair.c b/tests/testActivePair.c new file mode 100644 index 0000000000000000000000000000000000000000..1e0111b4f0e480d0f66463b4c2264cdd89bd28c8 --- /dev/null +++ b/tests/testActivePair.c @@ -0,0 +1,510 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ +#include "../config.h" + +/* Some standard headers. */ +#include <fenv.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +/* Local headers. */ +#include "swift.h" + +/** + * @brief Constructs a cell and all of its particle in a valid state prior to + * a DOPAIR or DOSELF calcuation. + * + * @param n The cube root of the number of particles. + * @param offset The position of the cell offset from (0,0,0). + * @param size The cell size. + * @param h The smoothing length of the particles in units of the inter-particle + * separation. + * @param density The density of the fluid. + * @param partId The running counter of IDs. + * @param pert The perturbation to apply to the particles in the cell in units + * of the inter-particle separation. + * @param h_pert The perturbation to apply to the smoothing length. + * @param fraction_active The fraction of particles that should be active in the + * cell. + */ +struct cell *make_cell(size_t n, double *offset, double size, double h, + double density, long long *partId, double pert, + double h_pert, double fraction_active) { + const size_t count = n * n * n; + const double volume = size * size * size; + float h_max = 0.f; + struct cell *cell = malloc(sizeof(struct cell)); + bzero(cell, sizeof(struct cell)); + + if (posix_memalign((void **)&cell->parts, part_align, + count * sizeof(struct part)) != 0) { + error("couldn't allocate particles, no. of particles: %d", (int)count); + } + bzero(cell->parts, count * sizeof(struct part)); + + /* Construct the parts */ + struct part *part = cell->parts; + for (size_t x = 0; x < n; ++x) { + for (size_t y = 0; y < n; ++y) { + for (size_t z = 0; z < n; ++z) { + part->x[0] = + offset[0] + + size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + part->x[1] = + offset[1] + + size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + part->x[2] = + offset[2] + + size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + part->v[0] = random_uniform(-0.05, 0.05); + part->v[1] = random_uniform(-0.05, 0.05); + part->v[2] = random_uniform(-0.05, 0.05); + + if (h_pert) + part->h = size * h * random_uniform(1.f, h_pert) / (float)n; + else + part->h = size * h / (float)n; + h_max = fmaxf(h_max, part->h); + part->id = ++(*partId); + +#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) + part->conserved.mass = density * volume / count; + +#ifdef SHADOWFAX_SPH + double anchor[3] = {0., 0., 0.}; + double side[3] = {1., 1., 1.}; + voronoi_cell_init(&part->cell, part->x, anchor, side); +#endif + +#else + part->mass = density * volume / count; +#endif + +#if defined(HOPKINS_PE_SPH) + part->entropy = 1.f; + part->entropy_one_over_gamma = 1.f; +#endif + if (random_uniform(0, 1.f) < fraction_active) + part->time_bin = 1; + else + part->time_bin = num_time_bins + 1; + +#ifdef SWIFT_DEBUG_CHECKS + part->ti_drift = 8; + part->ti_kick = 8; +#endif + + ++part; + } + } + } + + /* Cell properties */ + cell->split = 0; + cell->h_max = h_max; + cell->count = count; + cell->dx_max_part = 0.; + cell->dx_max_sort = 0.; + cell->width[0] = size; + cell->width[1] = size; + cell->width[2] = size; + cell->loc[0] = offset[0]; + cell->loc[1] = offset[1]; + cell->loc[2] = offset[2]; + + cell->ti_old_part = 8; + cell->ti_end_min = 8; + cell->ti_end_max = 8; + + shuffle_particles(cell->parts, cell->count); + + cell->sorted = 0; + for (int k = 0; k < 13; k++) cell->sort[k] = NULL; + + return cell; +} + +void clean_up(struct cell *ci) { + free(ci->parts); + for (int k = 0; k < 13; k++) + if (ci->sort[k] != NULL) free(ci->sort[k]); + free(ci); +} + +/** + * @brief Initializes all particles field to be ready for a density calculation + */ +void zero_particle_fields(struct cell *c) { + for (int pid = 0; pid < c->count; pid++) { + hydro_init_part(&c->parts[pid], NULL); + } +} + +/** + * @brief Ends the loop by adding the appropriate coefficients + */ +void end_calculation(struct cell *c) { + for (int pid = 0; pid < c->count; pid++) { + hydro_end_density(&c->parts[pid]); + + /* Recover the common "Neighbour number" definition */ + c->parts[pid].density.wcount *= pow_dimension(c->parts[pid].h); + c->parts[pid].density.wcount *= kernel_norm; + } +} + +/** + * @brief Dump all the particles to a file + */ +void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) { + FILE *file = fopen(fileName, "a"); + + /* Write header */ + fprintf(file, "# %4s %13s\n", "ID", "wcount"); + + fprintf(file, "# ci --------------------------------------------\n"); + + for (int pid = 0; pid < ci->count; pid++) { + fprintf(file, "%6llu %13e\n", ci->parts[pid].id, + ci->parts[pid].density.wcount); + } + + fprintf(file, "# cj --------------------------------------------\n"); + + for (int pjd = 0; pjd < cj->count; pjd++) { + fprintf(file, "%6llu %13e\n", cj->parts[pjd].id, + cj->parts[pjd].density.wcount); + } + + fclose(file); +} + +/* Just a forward declaration... */ +void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); +void runner_doself1_density_vec(struct runner *r, struct cell *ci); +void runner_dopair1_branch_density(struct runner *r, struct cell *ci, + struct cell *cj); + +/** + * @brief Computes the pair interactions of two cells using SWIFT and a brute + * force implementation. + */ +void test_pair_interactions(struct runner *runner, struct cell **ci, + struct cell **cj, char *swiftOutputFileName, + char *bruteForceOutputFileName) { + + runner_do_sort(runner, *ci, 0x1FFF, 0, 0); + runner_do_sort(runner, *cj, 0x1FFF, 0, 0); + + /* Zero the fields */ + zero_particle_fields(*ci); + zero_particle_fields(*cj); + + /* Run the test */ + runner_dopair1_branch_density(runner, *ci, *cj); + + /* Let's get physical ! */ + end_calculation(*ci); + end_calculation(*cj); + + /* Dump if necessary */ + dump_particle_fields(swiftOutputFileName, *ci, *cj); + + /* Now perform a brute-force version for accuracy tests */ + + /* Zero the fields */ + zero_particle_fields(*ci); + zero_particle_fields(*cj); + + /* Run the brute-force test */ + pairs_all_density(runner, *ci, *cj); + + /* Let's get physical ! */ + end_calculation(*ci); + end_calculation(*cj); + + dump_particle_fields(bruteForceOutputFileName, *ci, *cj); +} + +/** + * @brief Computes the pair interactions of two cells in various configurations. + */ +void test_all_pair_interactions(struct runner *runner, double *offset2, + size_t particles, double size, double h, + double rho, long long *partId, + double perturbation, double h_pert, + char *swiftOutputFileName, + char *bruteForceOutputFileName) { + + double offset1[3] = {0, 0, 0}; + struct cell *ci, *cj; + + /* All active particles. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 1.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 1.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* Half particles are active. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.5); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.5); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* All particles inactive. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* 10% of particles active. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.1); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.1); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One active cell one inactive cell. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 1.0); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One active cell one inactive cell. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 1.0); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* Smaller cells, all active. */ + ci = make_cell(2, offset1, size, h, rho, partId, perturbation, h_pert, 1.0); + cj = make_cell(2, offset2, size, h, rho, partId, perturbation, h_pert, 1.0); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* Different numbers of particles in each cell. */ + ci = make_cell(10, offset1, size, h, rho, partId, perturbation, h_pert, 0.5); + cj = make_cell(3, offset2, size, h, rho, partId, perturbation, h_pert, 0.75); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One cell inactive and the other only half active. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.5); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One cell inactive and the other only half active. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.5); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + /* Clean things to make the sanitizer happy ... */ + clean_up(ci); + clean_up(cj); +} + +int main(int argc, char *argv[]) { + size_t particles = 0, runs = 0, type = 0; + double h = 1.23485, size = 1., rho = 1.; + double perturbation = 0.1, h_pert = 1.1; + struct space space; + struct engine engine; + struct runner *runner; + char c; + static long long partId = 0; + char outputFileNameExtension[200] = ""; + char swiftOutputFileName[200] = ""; + char bruteForceOutputFileName[200] = ""; + + /* Initialize CPU frequency, this also starts time. */ + unsigned long long cpufreq = 0; + clocks_set_cpufreq(cpufreq); + + /* Choke on FP-exceptions */ + feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); + + /* Generate a RNG seed from time. */ + unsigned int seed = time(NULL); + + while ((c = getopt(argc, argv, "h:p:n:r:t:d:s:f:")) != -1) { + switch (c) { + case 'h': + sscanf(optarg, "%lf", &h); + break; + case 'p': + sscanf(optarg, "%lf", &h_pert); + break; + case 'n': + sscanf(optarg, "%zu", &particles); + break; + case 'r': + sscanf(optarg, "%zu", &runs); + break; + case 't': + sscanf(optarg, "%zu", &type); + break; + case 'd': + sscanf(optarg, "%lf", &perturbation); + break; + case 's': + sscanf(optarg, "%u", &seed); + break; + case 'f': + strcpy(outputFileNameExtension, optarg); + break; + case '?': + error("Unknown option."); + break; + } + } + + if (h < 0 || particles == 0 || runs == 0 || type > 2) { + printf( + "\nUsage: %s -n PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n" + "\nGenerates a cell pair, filled with particles on a Cartesian grid." + "\nThese are then interacted using runner_dopair1_density." + "\n\nOptions:" + "\n-t TYPE=0 - cells share face (0), edge (1) or corner (2)" + "\n-h DISTANCE=1.2348 - smoothing length" + "\n-p - Random fractional change in h, h=h*random(1,p)" + "\n-d pert - perturbation to apply to the particles [0,1[" + "\n-s seed - seed for RNG" + "\n-f fileName - part of the file name used to save the dumps\n", + argv[0]); + exit(1); + } + + /* Seed RNG. */ + message("Seed used for RNG: %d", seed); + srand(seed); + + space.periodic = 0; + space.dim[0] = 3.; + space.dim[1] = 3.; + space.dim[2] = 3.; + + engine.s = &space; + engine.time = 0.1f; + engine.ti_current = 8; + engine.max_active_bin = num_time_bins; + + if (posix_memalign((void **)&runner, SWIFT_STRUCT_ALIGNMENT, + sizeof(struct runner)) != 0) { + error("couldn't allocate runner"); + } + + runner->e = &engine; + + /* Create output file names. */ + sprintf(swiftOutputFileName, "swift_dopair_%s.dat", outputFileNameExtension); + sprintf(bruteForceOutputFileName, "brute_force_%s.dat", + outputFileNameExtension); + + /* Delete files if they already exist. */ + remove(swiftOutputFileName); + remove(bruteForceOutputFileName); + +#ifdef WITH_VECTORIZATION + runner->ci_cache.count = 0; + cache_init(&runner->ci_cache, 512); + runner->cj_cache.count = 0; + cache_init(&runner->cj_cache, 512); +#endif + + double offset[3] = {1., 0., 0.}; + + /* Test a pair of cells face-on. */ + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName); + + /* Test a pair of cells edge-on. */ + offset[0] = 1.; + offset[1] = 1.; + offset[2] = 0.; + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName); + + /* Test a pair of cells corner-on. */ + offset[0] = 1.; + offset[1] = 1.; + offset[2] = 1.; + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName); + return 0; +} diff --git a/tests/testActivePair.sh.in b/tests/testActivePair.sh.in new file mode 100755 index 0000000000000000000000000000000000000000..ff8d027a469bd9bc78286b843cf2dffd3ef27ad3 --- /dev/null +++ b/tests/testActivePair.sh.in @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "" + +rm -f brute_force_pair_active.dat swift_dopair_active.dat + +./testActivePair -n 6 -r 1 -d 0 -f active + +python @srcdir@/difffloat.py brute_force_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat + +exit $? diff --git a/tests/testInteractions.c b/tests/testInteractions.c index 4ce7fe40554d24551750629fa47c0bee7acdb6da..54d1f38733a1f1647331166f1a37b40ed3511419 100644 --- a/tests/testInteractions.c +++ b/tests/testInteractions.c @@ -17,12 +17,6 @@ * ******************************************************************************/ -#include "../config.h" - -#ifndef WITH_VECTORIZATION -int main() { return 0; } -#else - #include <fenv.h> #include <stdio.h> #include <stdlib.h> @@ -30,15 +24,17 @@ int main() { return 0; } #include <unistd.h> #include "swift.h" +#ifdef WITH_VECTORIZATION + #define array_align sizeof(float) * VEC_SIZE #define ACC_THRESHOLD 1e-5 -/* Typdef function pointers for serial and vectorised versions of the - * interaction functions. */ -typedef void (*serial_interaction)(float, float *, float, float, struct part *, - struct part *); -typedef void (*vec_interaction)(float *, float *, float *, float *, - struct part **, struct part **); +#ifndef IACT +#define IACT runner_iact_nonsym_density +#define IACT_VEC runner_iact_nonsym_1_vec_density +#define IACT_NAME "test_nonsym_density" +#define NUM_VEC_PROC_INT 1 +#endif /** * @brief Constructs an array of particles in a valid state prior to @@ -74,7 +70,10 @@ struct part *make_particles(size_t count, double *offset, double spacing, p->h = h; p->id = ++(*partId); + +#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) p->mass = 1.0f; +#endif /* Place rest of particles around the test particle * with random position within a unit sphere. */ @@ -93,7 +92,9 @@ struct part *make_particles(size_t count, double *offset, double spacing, p->h = h; p->id = ++(*partId); +#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) p->mass = 1.0f; +#endif } return particles; } @@ -103,6 +104,7 @@ struct part *make_particles(size_t count, double *offset, double spacing, */ void prepare_force(struct part *parts, size_t count) { +#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) && !defined(MINIMAL_SPH) struct part *p; for (size_t i = 0; i < count; ++i) { p = &parts[i]; @@ -113,6 +115,7 @@ void prepare_force(struct part *parts, size_t count) { p->force.v_sig = 0.0f; p->force.h_dt = 0.0f; } +#endif } /** @@ -122,25 +125,26 @@ void dump_indv_particle_fields(char *fileName, struct part *p) { FILE *file = fopen(fileName, "a"); + /* Write header */ fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %10e %10e %10e %13e %13e %13e " - "%13e %13e %13e %13e " - "%13e %13e %13e %10f\n", + "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e " + "%13e %13e %13e\n", p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], - p->a_hydro[0], p->a_hydro[1], p->a_hydro[2], p->rho, - p->density.rho_dh, p->density.wcount, p->density.wcount_dh, - p->force.h_dt, p->force.v_sig, -#if defined(GADGET2_SPH) - p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], - p->density.rot_v[2], p->entropy_dt -#elif defined(DEFAULT_SPH) - p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], - p->density.rot_v[2], 0. + hydro_get_density(p), +#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) + 0.f, #else + p->density.rho_dh, +#endif + p->density.wcount, p->density.wcount_dh, +#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH) p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], p->density.rot_v[2] +#else + 0., 0., 0., 0. #endif ); + fclose(file); } @@ -152,13 +156,10 @@ void write_header(char *fileName) { FILE *file = fopen(fileName, "w"); /* Write header */ fprintf(file, - "# %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s %13s %13s %13s " - "%13s %13s %13s %13s" - "%13s %13s %13s %13s\n", - "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "a_x", "a_y", - "a_z", "rho", "rho_dh", "wcount", "wcount_dh", "dh/dt", "v_sig", - "div_v", "curl_vx", "curl_vy", "curl_vz", "dS/dt"); - fprintf(file, "\n# PARTICLES BEFORE INTERACTION:\n"); + "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s " + "%13s %13s %13s\n", + "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh", + "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz"); fclose(file); } @@ -187,8 +188,8 @@ int check_results(struct part serial_test_part, struct part *serial_parts, } /* - * @brief Calls the serial and vectorised version of an interaction - * function given by the function pointers. + * @brief Calls the serial and vectorised version of the non-symmetrical density + * interaction. * * @param test_part Particle that will be updated * @param parts Particle array to be interacted @@ -196,16 +197,15 @@ int check_results(struct part serial_test_part, struct part *serial_parts, * @param serial_inter_func Serial interaction function to be called * @param vec_inter_func Vectorised interaction function to be called * @param runs No. of times to call interactions + * @param num_vec_proc No. of vectors to use to process interaction * */ void test_interactions(struct part test_part, struct part *parts, size_t count, - serial_interaction serial_inter_func, - vec_interaction vec_inter_func, char *filePrefix, - size_t runs) { + char *filePrefix, int runs, int num_vec_proc) { - ticks serial_time = 0, vec_time = 0; + ticks serial_time = 0; + ticks vec_time = 0; - FILE *file; char serial_filename[200] = ""; char vec_filename[200] = ""; @@ -217,64 +217,68 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, write_header(serial_filename); write_header(vec_filename); - /* Test particle at the center of a unit sphere. */ struct part pi_serial, pi_vec; - - /* Remaining particles in the sphere that will interact with test particle. */ struct part pj_serial[count], pj_vec[count]; - /* Stores the separation, smoothing length and pointers to particles - * needed for the vectorised interaction. */ + float r2[count] __attribute__((aligned(array_align))); + float dx[3 * count] __attribute__((aligned(array_align))); + + struct part *piq[count], *pjq[count]; + for (size_t k = 0; k < count; k++) { + piq[k] = NULL; + pjq[k] = NULL; + } + float r2q[count] __attribute__((aligned(array_align))); float hiq[count] __attribute__((aligned(array_align))); - float hjq[count] __attribute__((aligned(array_align))); - float dxq[3 * count] __attribute__((aligned(array_align))); - struct part *piq[count], *pjq[count]; + float dxq[count] __attribute__((aligned(array_align))); + + float dyq[count] __attribute__((aligned(array_align))); + float dzq[count] __attribute__((aligned(array_align))); + float mjq[count] __attribute__((aligned(array_align))); + float vixq[count] __attribute__((aligned(array_align))); + float viyq[count] __attribute__((aligned(array_align))); + float vizq[count] __attribute__((aligned(array_align))); + float vjxq[count] __attribute__((aligned(array_align))); + float vjyq[count] __attribute__((aligned(array_align))); + float vjzq[count] __attribute__((aligned(array_align))); /* Call serial interaction a set number of times. */ - for (size_t k = 0; k < runs; k++) { + for (int k = 0; k < runs; k++) { /* Reset particle to initial setup */ pi_serial = test_part; for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i]; - /* Only dump data on first run. */ - if (k == 0) { - /* Dump state of particles before serial interaction. */ - dump_indv_particle_fields(serial_filename, &pi_serial); - for (size_t i = 0; i < count; i++) - dump_indv_particle_fields(serial_filename, &pj_serial[i]); - } - /* Perform serial interaction */ for (size_t i = 0; i < count; i++) { /* Compute the pairwise distance. */ - float r2 = 0.0f; - float dx[3]; - for (size_t k = 0; k < 3; k++) { - dx[k] = pi_serial.x[k] - pj_serial[i].x[k]; - r2 += dx[k] * dx[k]; + r2[i] = 0.0f; + for (int k = 0; k < 3; k++) { + int ind = (3 * i) + k; + dx[ind] = pi_serial.x[k] - pj_serial[i].x[k]; + r2[i] += dx[ind] * dx[ind]; } + } - const ticks tic = getticks(); - - serial_inter_func(r2, dx, pi_serial.h, pj_serial[i].h, &pi_serial, - &pj_serial[i]); - - serial_time += getticks() - tic; + const ticks tic = getticks(); +/* Perform serial interaction */ +#ifdef __ICC +#pragma novector +#endif + for (size_t i = 0; i < count; i++) { + IACT(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h, &pi_serial, + &pj_serial[i]); } + serial_time += getticks() - tic; } - file = fopen(serial_filename, "a"); - fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); - fclose(file); - /* Dump result of serial interaction. */ dump_indv_particle_fields(serial_filename, &pi_serial); for (size_t i = 0; i < count; i++) dump_indv_particle_fields(serial_filename, &pj_serial[i]); /* Call vector interaction a set number of times. */ - for (size_t k = 0; k < runs; k++) { + for (int k = 0; k < runs; k++) { /* Reset particle to initial setup */ pi_vec = test_part; for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i]; @@ -284,45 +288,92 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, /* Compute the pairwise distance. */ float r2 = 0.0f; float dx[3]; - for (size_t k = 0; k < 3; k++) { + for (int k = 0; k < 3; k++) { dx[k] = pi_vec.x[k] - pj_vec[i].x[k]; r2 += dx[k] * dx[k]; } r2q[i] = r2; - dxq[3 * i + 0] = dx[0]; - dxq[3 * i + 1] = dx[1]; - dxq[3 * i + 2] = dx[2]; + dxq[i] = dx[0]; hiq[i] = pi_vec.h; - hjq[i] = pj_vec[i].h; piq[i] = &pi_vec; pjq[i] = &pj_vec[i]; - } - /* Only dump data on first run. */ - if (k == 0) { - /* Dump state of particles before vector interaction. */ - dump_indv_particle_fields(vec_filename, piq[0]); - for (size_t i = 0; i < count; i++) - dump_indv_particle_fields(vec_filename, pjq[i]); + dyq[i] = dx[1]; + dzq[i] = dx[2]; + mjq[i] = pj_vec[i].mass; + vixq[i] = pi_vec.v[0]; + viyq[i] = pi_vec.v[1]; + vizq[i] = pi_vec.v[2]; + vjxq[i] = pj_vec[i].v[0]; + vjyq[i] = pj_vec[i].v[1]; + vjzq[i] = pj_vec[i].v[2]; } + /* Perform vector interaction. */ + vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec; + vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, + curlvySum, curlvzSum; + mask_t mask, mask2; + + rhoSum.v = vec_set1(0.f); + rho_dhSum.v = vec_set1(0.f); + wcountSum.v = vec_set1(0.f); + wcount_dhSum.v = vec_set1(0.f); + div_vSum.v = vec_set1(0.f); + curlvxSum.v = vec_set1(0.f); + curlvySum.v = vec_set1(0.f); + curlvzSum.v = vec_set1(0.f); + + hi_vec.v = vec_load(&hiq[0]); + vix_vec.v = vec_load(&vixq[0]); + viy_vec.v = vec_load(&viyq[0]); + viz_vec.v = vec_load(&vizq[0]); + + hi_inv_vec = vec_reciprocal(hi_vec); + vec_init_mask_true(mask); + vec_init_mask_true(mask2); + const ticks vec_tic = getticks(); - /* Perform vector interaction. */ - for (size_t i = 0; i < count; i += VEC_SIZE) { - vec_inter_func(&(r2q[i]), &(dxq[3 * i]), &(hiq[i]), &(hjq[i]), &(piq[i]), - &(pjq[i])); + for (size_t i = 0; i < count; i += num_vec_proc * VEC_SIZE) { + + /* Interleave two vectors for interaction. */ + if (num_vec_proc == 2) { + runner_iact_nonsym_2_vec_density( + &(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec), (vix_vec), + (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), + &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, + &curlvxSum, &curlvySum, &curlvzSum, mask, mask2, 0); + } else { /* Only use one vector for interaction. */ + + vector r2, dx, dy, dz; + r2.v = vec_load(&(r2q[i])); + dx.v = vec_load(&(dxq[i])); + dy.v = vec_load(&(dyq[i])); + dz.v = vec_load(&(dzq[i])); + + runner_iact_nonsym_1_vec_density( + &r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec), (viz_vec), + &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, + &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, + &curlvzSum, mask); + } } + VEC_HADD(rhoSum, piq[0]->rho); + VEC_HADD(rho_dhSum, piq[0]->density.rho_dh); + VEC_HADD(wcountSum, piq[0]->density.wcount); + VEC_HADD(wcount_dhSum, piq[0]->density.wcount_dh); + VEC_HADD(div_vSum, piq[0]->density.div_v); + VEC_HADD(curlvxSum, piq[0]->density.rot_v[0]); + VEC_HADD(curlvySum, piq[0]->density.rot_v[1]); + VEC_HADD(curlvzSum, piq[0]->density.rot_v[2]); + vec_time += getticks() - vec_tic; } - file = fopen(vec_filename, "a"); - fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); - fclose(file); - - /* Dump result of vector interaction. */ + /* Dump result of serial interaction. */ dump_indv_particle_fields(vec_filename, piq[0]); for (size_t i = 0; i < count; i++) dump_indv_particle_fields(vec_filename, pjq[i]); @@ -334,6 +385,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, message("The serial interactions took : %15lli ticks.", serial_time / runs); message("The vectorised interactions took : %15lli ticks.", vec_time / runs); + message("Speed up: %15fx.", (double)(serial_time) / vec_time); } /* And go... */ @@ -386,62 +438,22 @@ int main(int argc, char *argv[]) { /* Build the infrastructure */ static long long partId = 0; - struct part density_test_particle, force_test_particle; - struct part *density_particles = - make_particles(count, offset, spacing, h, &partId); - struct part *force_particles = - make_particles(count, offset, spacing, h, &partId); - prepare_force(force_particles, count); - - /* Define which interactions to call */ - serial_interaction serial_inter_func = &runner_iact_nonsym_density; - vec_interaction vec_inter_func = &runner_iact_nonsym_vec_density; - - density_test_particle = density_particles[0]; + struct part test_particle; + struct part *particles = make_particles(count, offset, spacing, h, &partId); + + test_particle = particles[0]; /* Call the non-sym density test. */ - message("Testing non-symmetrical density interaction..."); - test_interactions(density_test_particle, &density_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_nonsym_density", - runs); - - density_particles = make_particles(count, offset, spacing, h, &partId); - - /* Re-assign function pointers. */ - serial_inter_func = &runner_iact_density; - vec_inter_func = &runner_iact_vec_density; - - density_test_particle = density_particles[0]; - /* Call the symmetrical density test. */ - message("Testing symmetrical density interaction..."); - test_interactions(density_test_particle, &density_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_sym_density", - runs); - - /* Re-assign function pointers. */ - serial_inter_func = &runner_iact_nonsym_force; - vec_inter_func = &runner_iact_nonsym_vec_force; - - force_test_particle = force_particles[0]; - /* Call the test non-sym force test. */ - message("Testing non-symmetrical force interaction..."); - test_interactions(force_test_particle, &force_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_nonsym_force", - runs); - - force_particles = make_particles(count, offset, spacing, h, &partId); - prepare_force(force_particles, count); - - /* Re-assign function pointers. */ - serial_inter_func = &runner_iact_force; - vec_inter_func = &runner_iact_vec_force; - - force_test_particle = force_particles[0]; - /* Call the test symmetrical force test. */ - message("Testing symmetrical force interaction..."); - test_interactions(force_test_particle, &force_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_sym_force", runs); + message("Testing %s interaction...", IACT_NAME); + test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs, + 1); + test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs, + 2); return 0; } -#endif /* WITH_VECTORIZATION */ +#else + +int main() { return 1; } + +#endif diff --git a/tests/testInteractions.sh.in b/tests/testInteractions.sh.in new file mode 100644 index 0000000000000000000000000000000000000000..4b002c56e37eff417c673ddac2e44b3edf17683a --- /dev/null +++ b/tests/testInteractions.sh.in @@ -0,0 +1,29 @@ +#!/bin/bash + +echo "" + +rm -f test_nonsym_density_serial.dat test_nonsym_density_vec.dat + +echo "Running ./testInteractions" + +./testInteractions + +if [ $? != 0 ]; then + echo "testInteractions is redundant when vectorisation is disabled" +else + if [ -e test_nonsym_density_serial.dat ] + then + if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_vec.dat @srcdir@/tolerance_testInteractions.dat + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi +fi + +echo "------------" diff --git a/tests/testKernel.c b/tests/testKernel.c index a2744119a527cc842cdd4711056eee7a7d7b4270..0658639070526f28ce1bceefc54d3f2d7a3ae765 100644 --- a/tests/testKernel.c +++ b/tests/testKernel.c @@ -68,7 +68,7 @@ int main() { vx.f[j] = (i + j) * 2.25f / numPoints; } - vx_h.v = vx.v / vec_set1(h); + vx_h.v = vec_mul(vx.v, vec_set1(1.f / h)); kernel_deval_1_vec(&vx_h, &W_vec, &dW_vec); @@ -106,8 +106,8 @@ int main() { vx_2.f[j] = (i + j) * 2.25f / numPoints; } - vx_h.v = vx.v / vec_set1(h); - vx_h_2.v = vx_2.v / vec_set1(h); + vx_h.v = vec_mul(vx.v, vec_set1(1.f / h)); + vx_h_2.v = vec_mul(vx_2.v, vec_set1(1.f / h)); kernel_deval_2_vec(&vx_h, &W_vec, &dW_vec, &vx_h_2, &W_vec_2, &dW_vec_2); diff --git a/tests/testPair.c b/tests/testPair.c deleted file mode 100644 index 0e0b36a4c30abc5a8d37060c70788669f5b5710d..0000000000000000000000000000000000000000 --- a/tests/testPair.c +++ /dev/null @@ -1,352 +0,0 @@ -/******************************************************************************* - * This file is part of SWIFT. - * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - ******************************************************************************/ -#include "../config.h" - -/* Some standard headers. */ -#include <fenv.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> - -/* Local headers. */ -#include "swift.h" - -#if defined(WITH_VECTORIZATION) -#define DOSELF1 runner_doself1_density_vec -#define DOPAIR1 runner_dopair1_branch_density -#define DOSELF1_NAME "runner_doself1_density_vec" -#define DOPAIR1_NAME "runner_dopair1_density_vec" -#endif - -#ifndef DOSELF1 -#define DOSELF1 runner_doself1_density -#define DOSELF1_NAME "runner_doself1_density" -#endif - -#ifndef DOPAIR1 -#define DOPAIR1 runner_dopair1_branch_density -#define DOPAIR1_NAME "runner_dopair1_density" -#endif - -/* n is both particles per axis and box size: - * particles are generated on a mesh with unit spacing - */ -struct cell *make_cell(size_t n, double *offset, double size, double h, - double density, unsigned long long *partId, - double pert) { - const size_t count = n * n * n; - const double volume = size * size * size; - struct cell *cell = malloc(sizeof(struct cell)); - bzero(cell, sizeof(struct cell)); - - if (posix_memalign((void **)&cell->parts, part_align, - count * sizeof(struct part)) != 0) { - error("couldn't allocate particles, no. of particles: %d", (int)count); - } - bzero(cell->parts, count * sizeof(struct part)); - - /* Construct the parts */ - struct part *part = cell->parts; - for (size_t x = 0; x < n; ++x) { - for (size_t y = 0; y < n; ++y) { - for (size_t z = 0; z < n; ++z) { - part->x[0] = - offset[0] + - size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; - part->x[1] = - offset[1] + - size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; - part->x[2] = - offset[2] + - size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; - // part->v[0] = part->x[0] - 1.5; - // part->v[1] = part->x[1] - 1.5; - // part->v[2] = part->x[2] - 1.5; - part->v[0] = random_uniform(-0.05, 0.05); - part->v[1] = random_uniform(-0.05, 0.05); - part->v[2] = random_uniform(-0.05, 0.05); - part->h = size * h / (float)n; - part->id = ++(*partId); -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - part->conserved.mass = density * volume / count; -#else - part->mass = density * volume / count; -#endif - part->time_bin = 1; - -#ifdef SWIFT_DEBUG_CHECKS - part->ti_drift = 8; - part->ti_kick = 8; -#endif - - ++part; - } - } - } - - /* Cell properties */ - cell->split = 0; - cell->h_max = h; - cell->count = count; - cell->dx_max_part = 0.; - cell->dx_max_sort = 0.; - cell->width[0] = n; - cell->width[1] = n; - cell->width[2] = n; - cell->loc[0] = offset[0]; - cell->loc[1] = offset[1]; - cell->loc[2] = offset[2]; - - cell->ti_old_part = 8; - cell->ti_end_min = 8; - cell->ti_end_max = 8; - - shuffle_particles(cell->parts, cell->count); - - cell->sorted = 0; - for (int k = 0; k < 13; k++) cell->sort[k] = NULL; - - return cell; -} - -void clean_up(struct cell *ci) { - free(ci->parts); - for (int k = 0; k < 13; k++) - if (ci->sort[k] != NULL) free(ci->sort[k]); - free(ci); -} - -/** - * @brief Initializes all particles field to be ready for a density calculation - */ -void zero_particle_fields(struct cell *c) { - for (int pid = 0; pid < c->count; pid++) { - hydro_init_part(&c->parts[pid], NULL); - } -} - -/** - * @brief Dump all the particles to a file - */ -void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) { - FILE *file = fopen(fileName, "w"); - - /* Write header */ - fprintf(file, - "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s " - "%13s %13s %13s\n", - "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh", - "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz"); - - fprintf(file, "# ci --------------------------------------------\n"); - - for (int pid = 0; pid < ci->count; pid++) { - fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e " - "%13e %13e %13e\n", - ci->parts[pid].id, ci->parts[pid].x[0], ci->parts[pid].x[1], - ci->parts[pid].x[2], ci->parts[pid].v[0], ci->parts[pid].v[1], - ci->parts[pid].v[2], hydro_get_density(&ci->parts[pid]), -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - 0.f, -#else - ci->parts[pid].density.rho_dh, -#endif - ci->parts[pid].density.wcount, ci->parts[pid].density.wcount_dh, -#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH) - ci->parts[pid].density.div_v, ci->parts[pid].density.rot_v[0], - ci->parts[pid].density.rot_v[1], ci->parts[pid].density.rot_v[2] -#else - 0., 0., 0., 0. -#endif - ); - } - - fprintf(file, "# cj --------------------------------------------\n"); - - for (int pjd = 0; pjd < cj->count; pjd++) { - fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e " - "%13e %13e %13e\n", - cj->parts[pjd].id, cj->parts[pjd].x[0], cj->parts[pjd].x[1], - cj->parts[pjd].x[2], cj->parts[pjd].v[0], cj->parts[pjd].v[1], - cj->parts[pjd].v[2], hydro_get_density(&cj->parts[pjd]), -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - 0.f, -#else - cj->parts[pjd].density.rho_dh, -#endif - cj->parts[pjd].density.wcount, cj->parts[pjd].density.wcount_dh, -#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH) - cj->parts[pjd].density.div_v, cj->parts[pjd].density.rot_v[0], - cj->parts[pjd].density.rot_v[1], cj->parts[pjd].density.rot_v[2] -#else - 0., 0., 0., 0. -#endif - ); - } - - fclose(file); -} - -/* Just a forward declaration... */ -void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); -void runner_doself1_density_vec(struct runner *r, struct cell *ci); -void runner_dopair1_branch_density(struct runner *r, struct cell *ci, - struct cell *cj); - -int main(int argc, char *argv[]) { - size_t particles = 0, runs = 0, volume, type = 0; - double offset[3] = {0, 0, 0}, h = 1.1255, size = 1., rho = 1.; - double perturbation = 0.; - struct cell *ci, *cj; - struct space space; - struct engine engine; - struct runner runner; - char c; - static unsigned long long partId = 0; - char outputFileNameExtension[200] = ""; - char outputFileName[200] = ""; - ticks tic, toc, time; - - /* Initialize CPU frequency, this also starts time. */ - unsigned long long cpufreq = 0; - clocks_set_cpufreq(cpufreq); - - /* Choke on FP-exceptions */ - feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); - - srand(0); - - while ((c = getopt(argc, argv, "h:p:r:t:d:f:")) != -1) { - switch (c) { - case 'h': - sscanf(optarg, "%lf", &h); - break; - case 'p': - sscanf(optarg, "%zu", &particles); - break; - case 'r': - sscanf(optarg, "%zu", &runs); - break; - case 't': - sscanf(optarg, "%zu", &type); - break; - case 'd': - sscanf(optarg, "%lf", &perturbation); - break; - case 'f': - strcpy(outputFileNameExtension, optarg); - break; - case '?': - error("Unknown option."); - break; - } - } - - if (h < 0 || particles == 0 || runs == 0 || type > 2) { - printf( - "\nUsage: %s -p PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n" - "\nGenerates a cell pair, filled with particles on a Cartesian grid." - "\nThese are then interacted using runner_dopair1_density." - "\n\nOptions:" - "\n-t TYPE=0 - cells share face (0), edge (1) or corner (2)" - "\n-h DISTANCE=1.1255 - smoothing length" - "\n-d pert - perturbation to apply to the particles [0,1[" - "\n-f fileName - part of the file name used to save the dumps\n", - argv[0]); - exit(1); - } - - space.periodic = 0; - space.dim[0] = 3.; - space.dim[1] = 3.; - space.dim[2] = 3.; - - engine.s = &space; - engine.time = 0.1f; - engine.ti_current = 8; - engine.max_active_bin = num_time_bins; - runner.e = &engine; - - volume = particles * particles * particles; - message("particles: %zu B\npositions: 0 B", 2 * volume * sizeof(struct part)); - - ci = make_cell(particles, offset, size, h, rho, &partId, perturbation); - for (size_t i = 0; i < type + 1; ++i) offset[i] = 1.; - cj = make_cell(particles, offset, size, h, rho, &partId, perturbation); - - runner_do_sort(&runner, ci, 0x1FFF, 0, 0); - runner_do_sort(&runner, cj, 0x1FFF, 0, 0); - - time = 0; - for (size_t i = 0; i < runs; ++i) { - /* Zero the fields */ - zero_particle_fields(ci); - zero_particle_fields(cj); - - tic = getticks(); - -#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION) - /* Run the test */ - DOPAIR1(&runner, ci, cj); -#endif - - toc = getticks(); - time += toc - tic; - - /* Dump if necessary */ - if (i % 50 == 0) { - sprintf(outputFileName, "swift_dopair_%s.dat", outputFileNameExtension); - dump_particle_fields(outputFileName, ci, cj); - } - } - - /* Output timing */ - message("SWIFT calculation took %lli ticks.", time / runs); - - /* Now perform a brute-force version for accuracy tests */ - - /* Zero the fields */ - zero_particle_fields(ci); - zero_particle_fields(cj); - - tic = getticks(); - -#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION) - /* Run the brute-force test */ - pairs_all_density(&runner, ci, cj); -#endif - - toc = getticks(); - - /* Dump */ - sprintf(outputFileName, "brute_force_%s.dat", outputFileNameExtension); - dump_particle_fields(outputFileName, ci, cj); - - /* Output timing */ - message("Brute force calculation took %lli ticks.", toc - tic); - - /* Clean things to make the sanitizer happy ... */ - clean_up(ci); - clean_up(cj); - - return 0; -} diff --git a/tests/testPair.sh.in b/tests/testPair.sh.in deleted file mode 100755 index bd7051b060c4acab6cf5a164af1914715856849b..0000000000000000000000000000000000000000 --- a/tests/testPair.sh.in +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo "" - -rm -f brute_force_standard.dat swift_dopair_standard.dat - -./testPair -p 6 -r 1 -d 0 -f standard - -python @srcdir@/difffloat.py brute_force_standard.dat swift_dopair_standard.dat @srcdir@/tolerance_pair_normal.dat - -exit $? diff --git a/tests/testPairPerturbed.sh.in b/tests/testPairPerturbed.sh.in deleted file mode 100755 index 9f214e25a098448a906f9da307ea569e327cfdea..0000000000000000000000000000000000000000 --- a/tests/testPairPerturbed.sh.in +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo "" - -rm -f brute_force_perturbed.dat swift_dopair_perturbed.dat - -./testPair -p 6 -r 1 -d 0.1 -f perturbed - -python @srcdir@/difffloat.py brute_force_perturbed.dat swift_dopair_perturbed.dat @srcdir@/tolerance_pair_perturbed.dat - -exit $? diff --git a/tests/testPeriodicBC.c b/tests/testPeriodicBC.c index 495442efac853517891c1efdec54f1f5e8e14e75..6fa2dc607b996b9e8508338a9806633c5a4d1a89 100644 --- a/tests/testPeriodicBC.c +++ b/tests/testPeriodicBC.c @@ -517,7 +517,8 @@ int main(int argc, char *argv[]) { const int half_dim = (dim - 1) / 2; - /* Test the periodic boundary conditions for each of the 8 corners. */ + /* Test the periodic boundary conditions for each of the 8 corners. Interact + * each corner with all of its 26 neighbours.*/ test_boundary_conditions(cells, runner, 0, 0, 0, dim, swiftOutputFileName, bruteForceOutputFileName); test_boundary_conditions(cells, runner, dim - 1, 0, 0, dim, diff --git a/tests/tolerance_27_perturbed_h.dat b/tests/tolerance_27_perturbed_h.dat index 495c865783ae4fc6cab76f11e945efe9ac21870f..5142c2a2090e15381a19b2bc71e253a482973b11 100644 --- a/tests/tolerance_27_perturbed_h.dat +++ b/tests/tolerance_27_perturbed_h.dat @@ -1,4 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-6 1e-4 5e-4 1.2e-2 1e-5 3e-6 3e-6 7e-6 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.4e-2 1e-5 2e-3 2.5e-4 3e-3 3e-3 3e-3 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2.4e-6 1e-4 5e-4 1.2e-2 1e-5 3e-6 3e-6 8e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-6 1.4e-2 1e-5 2e-3 2.5e-4 3e-3 3e-3 3e-3 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6 diff --git a/tests/tolerance_27_perturbed_h2.dat b/tests/tolerance_27_perturbed_h2.dat new file mode 100644 index 0000000000000000000000000000000000000000..23f6a5006124f6233aebd111005760a5dcc5b6a3 --- /dev/null +++ b/tests/tolerance_27_perturbed_h2.dat @@ -0,0 +1,4 @@ +# ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 3e-6 1e-4 5e-4 1.5e-2 1.4e-5 3e-6 3e-6 9e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.5e-6 1.57e-2 1e-5 4.74e-3 3.89e-4 3e-3 3e-3 3e-3 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6 diff --git a/tests/tolerance_pair_active.dat b/tests/tolerance_pair_active.dat new file mode 100644 index 0000000000000000000000000000000000000000..b07697a686eb7801326ceaf77cf93fb3a1491c2e --- /dev/null +++ b/tests/tolerance_pair_active.dat @@ -0,0 +1,4 @@ +# ID wcount + 0 1e-2 + 0 1e-2 + 0 1e-2 diff --git a/tests/tolerance_pair_perturbed.dat b/tests/tolerance_pair_perturbed.dat deleted file mode 100644 index 59a80a7b1fa0db373b7cb43b9793146330b17d53..0000000000000000000000000000000000000000 --- a/tests/tolerance_pair_perturbed.dat +++ /dev/null @@ -1,4 +0,0 @@ -# ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 1e-5 2e-5 3e-2 1e-5 1e-5 1e-5 1e-5 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-3 4e-4 8e-3 2e-2 1e-4 1.6e-4 1.6e-4 1.6e-4 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 diff --git a/tests/tolerance_pair_normal.dat b/tests/tolerance_testInteractions.dat similarity index 75% rename from tests/tolerance_pair_normal.dat rename to tests/tolerance_testInteractions.dat index 3fca591dce8156a08c970e4a6579b56b2de12553..ebb376bf26bfdc0fb2107ab720bbf9eca5a35bce 100644 --- a/tests/tolerance_pair_normal.dat +++ b/tests/tolerance_testInteractions.dat @@ -1,4 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 1e-5 2e-5 3e-2 1e-5 1e-5 1e-5 1e-5 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 1.2e-5 1e-5 1e-2 1e-4 1e-4 1e-4 1e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 4e-5 4e-4 1e-2 1e-5 6e-6 6e-6 6e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-4 1e-4 2e-4 2e-4 1e-4 1e-4 1e-4 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6