diff --git a/src/Makefile.am b/src/Makefile.am index 3aab888c705fffc6e63179928c81c005b2ed9820..f53797f6a4e506a5aa0dc2945fc143a7a4a18d3d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -44,7 +44,8 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \ common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h \ physical_constants.h physical_constants_cgs.h potential.h version.h \ hydro_properties.h riemann.h threadpool.h cooling.h cooling_struct.h sourceterms.h \ - sourceterms_struct.h statistics.h memswap.h profiler.h dump.h logger.h + sourceterms_struct.h statistics.h memswap.h cache.h runner_doiact_vec.h profiler.h \ + dump.h logger.h # Common source files AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \ @@ -53,11 +54,11 @@ AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \ kernel_hydro.c tools.c part.c partition.c clocks.c parser.c \ physical_constants.c potential.c hydro_properties.c \ runner_doiact_fft.c threadpool.c cooling.c sourceterms.c \ - statistics.c profiler.c dump.c logger.c + statistics.c runner_doiact_vec.c profiler.c dump.c logger.c # Include files for distribution, not installation. nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \ - kernel_long_gravity.h vector.h runner_doiact.h runner_doiact_grav.h runner_doiact_fft.h \ + kernel_long_gravity.h vector.h cache.h runner_doiact.h runner_doiact_vec.h runner_doiact_grav.h runner_doiact_fft.h \ units.h intrinsics.h minmax.h kick.h timestep.h drift.h adiabatic_index.h io_properties.h \ dimension.h equation_of_state.h active.h \ gravity.h gravity_io.h \ @@ -82,7 +83,7 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h cooling/none/cooling.h cooling/none/cooling_struct.h \ cooling/const_du/cooling.h cooling/const_du/cooling_struct.h \ cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h \ - memswap.h dump.h logger.h + memswap.h dump.h logger.h # Sources and flags for regular library diff --git a/src/cache.h b/src/cache.h new file mode 100644 index 0000000000000000000000000000000000000000..a6a7090806f27c038eef562fa654b19a24103560 --- /dev/null +++ b/src/cache.h @@ -0,0 +1,161 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2016 James Willis (jame.s.willis@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ +#ifndef SWIFT_CACHE_H +#define SWIFT_CACHE_H + +/* Config parameters. */ +#include "../config.h" + +/* Local headers */ +#include "cell.h" +#include "error.h" +#include "part.h" +#include "vector.h" + +#define NUM_VEC_PROC 2 +#define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE) +#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE + +/* Cache struct to hold a local copy of a cells' particle + * properties required for density/force calculations.*/ +struct cache { + + /* Particle x position. */ + float *restrict x __attribute__((aligned(sizeof(float) * VEC_SIZE))); + + /* Particle y position. */ + float *restrict y __attribute__((aligned(sizeof(float) * VEC_SIZE))); + + /* Particle z position. */ + float *restrict z __attribute__((aligned(sizeof(float) * VEC_SIZE))); + + /* Particle smoothing length. */ + float *restrict h __attribute__((aligned(sizeof(float) * VEC_SIZE))); + + /* Particle mass. */ + float *restrict m __attribute__((aligned(sizeof(float) * VEC_SIZE))); + + /* Particle x velocity. */ + float *restrict vx __attribute__((aligned(sizeof(float) * VEC_SIZE))); + + /* Particle y velocity. */ + float *restrict vy __attribute__((aligned(sizeof(float) * VEC_SIZE))); + + /* Particle z velocity. */ + float *restrict vz __attribute__((aligned(sizeof(float) * VEC_SIZE))); + + /* Cache size. */ + int count; +}; + +/* Secondary cache struct to hold a list of interactions between two + * particles.*/ +struct c2_cache { + + /* Separation between two particles squared. */ + float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + + /* x separation between two particles. */ + float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + + /* y separation between two particles. */ + float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + + /* z separation between two particles. */ + float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + + /* Mass of particle pj. */ + float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + + /* x velocity of particle pj. */ + float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + + /* y velocity of particle pj. */ + float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + + /* z velocity of particle pj. */ + float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); +}; + +/** + * @brief Allocate memory and initialise cache. + * + * @param c The cache. + * @param count Number of particles to allocate space for. + */ +__attribute__((always_inline)) INLINE void cache_init(struct cache *c, + size_t count) { + + /* Align cache on correct byte boundary and pad cache size to include 2 vector + * lengths for remainder operations. */ + unsigned long alignment = sizeof(float) * VEC_SIZE; + unsigned int sizeBytes = (count + (2 * VEC_SIZE)) * sizeof(float); + int error = 0; + + /* Free memory if cache has already been allocated. */ + if (c->count > 0) { + free(c->x); + free(c->y); + free(c->z); + free(c->m); + free(c->vx); + free(c->vy); + free(c->vz); + free(c->h); + } + + error += posix_memalign((void **)&c->x, alignment, sizeBytes); + error += posix_memalign((void **)&c->y, alignment, sizeBytes); + error += posix_memalign((void **)&c->z, alignment, sizeBytes); + error += posix_memalign((void **)&c->m, alignment, sizeBytes); + error += posix_memalign((void **)&c->vx, alignment, sizeBytes); + error += posix_memalign((void **)&c->vy, alignment, sizeBytes); + error += posix_memalign((void **)&c->vz, alignment, sizeBytes); + error += posix_memalign((void **)&c->h, alignment, sizeBytes); + + if (error != 0) + error("Couldn't allocate cache, no. of particles: %d", (int)count); + c->count = count; +} + +/** + * @brief Populate cache by reading in the particles in unsorted order. + * + * @param ci The #cell. + * @param ci_cache The cache. + */ +__attribute__((always_inline)) INLINE void cache_read_particles( + const struct cell *const ci, struct cache *const ci_cache) { + + /* Shift the particles positions to a local frame so single precision can be + * used instead of double precision. */ + for (int i = 0; i < ci->count; i++) { + ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0]; + ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1]; + ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2]; + ci_cache->h[i] = ci->parts[i].h; + + ci_cache->m[i] = ci->parts[i].mass; + ci_cache->vx[i] = ci->parts[i].v[0]; + ci_cache->vy[i] = ci->parts[i].v[1]; + ci_cache->vz[i] = ci->parts[i].v[2]; + } +} + +#endif /* SWIFT_CACHE_H */ diff --git a/src/engine.c b/src/engine.c index 6b3f5650f070a075aed2696444538b3bc4e10b4d..6f571f9ece9014e447b8afc38a79da7a614a81ee 100644 --- a/src/engine.c +++ b/src/engine.c @@ -70,6 +70,9 @@ #include "units.h" #include "version.h" +/* Particle cache size. */ +#define CACHE_SIZE 512 + const char *engine_policy_names[16] = {"none", "rand", "steal", @@ -3412,6 +3415,11 @@ void engine_init(struct engine *e, struct space *s, e->runners[k].cpuid = k; e->runners[k].qid = k * nr_queues / e->nr_threads; } + + /* Allocate particle cache. */ + e->runners[k].par_cache.count = 0; + cache_init(&e->runners[k].par_cache, CACHE_SIZE); + if (verbose) { if (with_aff) message("runner %i on cpuid=%i with qid=%i.", e->runners[k].id, diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 08fb2b37db566e191bd74d82488b5d68e764573b..3fef18b4f487f1734a5f93c4bad46cf4e6968240 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -155,20 +155,15 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density( /* Get the radius and inverse radius. */ r2.v = vec_load(R2); - ri.v = vec_rsqrt(r2.v); - /*vec_rsqrt does not have the level of accuracy we need, so an extra term is - * added below.*/ - ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + ri = vec_reciprocal_sqrt(r2); r.v = r2.v * ri.v; hi.v = vec_load(Hi); - hi_inv.v = vec_rcp(hi.v); - hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f)); + hi_inv = vec_reciprocal(hi); xi.v = r.v * hi_inv.v; hj.v = vec_load(Hj); - hj_inv.v = vec_rcp(hj.v); - hj_inv.v = hj_inv.v - hj_inv.v * (hj_inv.v * hj.v - vec_set1(1.0f)); + hj_inv = vec_reciprocal(hj); xj.v = r.v * hj_inv.v; /* Compute the kernel function. */ @@ -327,15 +322,11 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, /* Get the radius and inverse radius. */ r2.v = vec_load(R2); - ri.v = vec_rsqrt(r2.v); - /*vec_rsqrt does not have the level of accuracy we need, so an extra term is - * added below.*/ - ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + ri = vec_reciprocal_sqrt(r2); r.v = r2.v * ri.v; hi.v = vec_load(Hi); - hi_inv.v = vec_rcp(hi.v); - hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f)); + hi_inv = vec_reciprocal(hi); xi.v = r.v * hi_inv.v; kernel_deval_vec(&xi, &wi, &wi_dx); @@ -382,6 +373,176 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, #endif } +#ifdef WITH_VECTORIZATION +/** + * @brief Density interaction computed using 2 interleaved vectors + * (non-symmetric vectorized version). + */ +__attribute__((always_inline)) INLINE static void +runner_iact_nonsym_2_vec_density( + float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix, + vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, + vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, + vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, + vector mask, vector mask2, int knlMask, int knlMask2) { + + vector r, ri, r2, xi, wi, wi_dx; + vector mj; + vector dx, dy, dz, dvx, dvy, dvz; + vector vjx, vjy, vjz; + vector dvdr; + vector curlvrx, curlvry, curlvrz; + vector r_2, ri2, r2_2, xi2, wi2, wi_dx2; + vector mj2; + vector dx2, dy2, dz2, dvx2, dvy2, dvz2; + vector vjx2, vjy2, vjz2; + vector dvdr2; + vector curlvrx2, curlvry2, curlvrz2; + + /* Fill the vectors. */ + mj.v = vec_load(Mj); + mj2.v = vec_load(&Mj[VEC_SIZE]); + vjx.v = vec_load(Vjx); + vjx2.v = vec_load(&Vjx[VEC_SIZE]); + vjy.v = vec_load(Vjy); + vjy2.v = vec_load(&Vjy[VEC_SIZE]); + vjz.v = vec_load(Vjz); + vjz2.v = vec_load(&Vjz[VEC_SIZE]); + dx.v = vec_load(Dx); + dx2.v = vec_load(&Dx[VEC_SIZE]); + dy.v = vec_load(Dy); + dy2.v = vec_load(&Dy[VEC_SIZE]); + dz.v = vec_load(Dz); + dz2.v = vec_load(&Dz[VEC_SIZE]); + + /* Get the radius and inverse radius. */ + r2.v = vec_load(R2); + r2_2.v = vec_load(&R2[VEC_SIZE]); + ri = vec_reciprocal_sqrt(r2); + ri2 = vec_reciprocal_sqrt(r2_2); + r.v = vec_mul(r2.v, ri.v); + r_2.v = vec_mul(r2_2.v, ri2.v); + + xi.v = vec_mul(r.v, hi_inv.v); + xi2.v = vec_mul(r_2.v, hi_inv.v); + + /* Calculate the kernel for two particles. */ + kernel_deval_2_vec(&xi, &wi, &wi_dx, &xi2, &wi2, &wi_dx2); + + /* Compute dv. */ + dvx.v = vec_sub(vix.v, vjx.v); + dvx2.v = vec_sub(vix.v, vjx2.v); + dvy.v = vec_sub(viy.v, vjy.v); + dvy2.v = vec_sub(viy.v, vjy2.v); + dvz.v = vec_sub(viz.v, vjz.v); + dvz2.v = vec_sub(viz.v, vjz2.v); + + /* Compute dv dot r */ + dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v))); + dvdr2.v = + vec_fma(dvx2.v, dx2.v, vec_fma(dvy2.v, dy2.v, vec_mul(dvz2.v, dz2.v))); + dvdr.v = vec_mul(dvdr.v, ri.v); + dvdr2.v = vec_mul(dvdr2.v, ri2.v); + + /* Compute dv cross r */ + curlvrx.v = + vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy.v))); + curlvrx2.v = + vec_fma(dvy2.v, dz2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz2.v, dy2.v))); + curlvry.v = + vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v))); + curlvry2.v = + vec_fma(dvz2.v, dx2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx2.v, dz2.v))); + curlvrz.v = + vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v))); + curlvrz2.v = + vec_fma(dvx2.v, dy2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy2.v, dx2.v))); + curlvrx.v = vec_mul(curlvrx.v, ri.v); + curlvrx2.v = vec_mul(curlvrx2.v, ri2.v); + curlvry.v = vec_mul(curlvry.v, ri.v); + curlvry2.v = vec_mul(curlvry2.v, ri2.v); + curlvrz.v = vec_mul(curlvrz.v, ri.v); + curlvrz2.v = vec_mul(curlvrz2.v, ri2.v); + +/* Mask updates to intermediate vector sums for particle pi. */ +#ifdef HAVE_AVX512_F + rhoSum->v = + _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v); + rhoSum->v = + _mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v); + + rho_dhSum->v = + _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, + vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, + vec_mul(xi.v, wi_dx.v)))); + rho_dhSum->v = _mm512_mask_sub_ps( + rho_dhSum->v, knlMask2, rho_dhSum->v, + vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, + vec_mul(xi2.v, wi_dx2.v)))); + + wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v); + wcountSum->v = + _mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v); + + wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask, + wcount_dhSum->v, vec_mul(xi.v, wi_dx.v)); + wcount_dhSum->v = _mm512_mask_sub_ps( + wcount_dhSum->v, knlMask2, wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v)); + + div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, + vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); + div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v, + vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); + + curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask, + vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), + curlvxSum->v); + curlvxSum->v = _mm512_mask_add_ps( + curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), + curlvxSum->v); + + curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask, + vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), + curlvySum->v); + curlvySum->v = _mm512_mask_add_ps( + curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), + curlvySum->v); + + curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask, + vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), + curlvzSum->v); + curlvzSum->v = _mm512_mask_add_ps( + curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), + curlvzSum->v); +#else + rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v); + rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v); + rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, + vec_mul(xi.v, wi_dx.v))), + mask.v); + rho_dhSum->v -= + vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, + vec_mul(xi2.v, wi_dx2.v))), + mask2.v); + wcountSum->v += vec_and(wi.v, mask.v); + wcountSum->v += vec_and(wi2.v, mask2.v); + wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v); + wcount_dhSum->v -= vec_and(vec_mul(xi2.v, wi_dx2.v), mask2.v); + div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v); + div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v); + curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v); + curlvxSum->v += + vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v); + curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v); + curlvySum->v += + vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v); + curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v); + curlvzSum->v += + vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v); +#endif +} +#endif + /** * @brief Force loop */ @@ -492,9 +653,10 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( vector hi, hj, hi_inv, hj_inv; vector hid_inv, hjd_inv; vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; - vector piPOrho, pjPOrho, pirho, pjrho; + vector piPOrho2, pjPOrho2, pirho, pjrho; vector mi, mj; vector f; + vector grad_hi, grad_hj; vector dx[3]; vector vi[3], vj[3]; vector pia[3], pja[3]; @@ -512,14 +674,20 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass); mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); - piPOrho.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, - pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2, - pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2, - pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2); - pjPOrho.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, - pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2, - pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2, - pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2); + piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, + pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2, + pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2, + pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2); + pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, + pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2, + pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2, + pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2); + grad_hi.v = + vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f, + pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f); + grad_hj.v = + vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f, + pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f); pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho, pi[5]->rho, pi[6]->rho, pi[7]->rho); pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho, @@ -551,10 +719,14 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( #elif VEC_SIZE == 4 mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass); mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); - piPOrho.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, - pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2); - pjPOrho.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, - pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2); + piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, + pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2); + pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, + pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2); + grad_hi.v = + vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f); + grad_hj.v = + vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f); pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho); pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho); ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, @@ -577,14 +749,12 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( /* Get the radius and inverse radius. */ r2.v = vec_load(R2); - ri.v = vec_rsqrt(r2.v); - ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + ri = vec_reciprocal_sqrt(r2); r.v = r2.v * ri.v; /* Get the kernel for hi. */ hi.v = vec_load(Hi); - hi_inv.v = vec_rcp(hi.v); - hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f)); + hi_inv = vec_reciprocal(hi); hid_inv = pow_dimension_plus_one_vec(hi_inv); /* 1/h^(d+1) */ xi.v = r.v * hi_inv.v; kernel_deval_vec(&xi, &wi, &wi_dx); @@ -592,8 +762,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( /* Get the kernel for hj. */ hj.v = vec_load(Hj); - hj_inv.v = vec_rcp(hj.v); - hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f)); + hj_inv = vec_reciprocal(hj); hjd_inv = pow_dimension_plus_one_vec(hj_inv); /* 1/h^(d+1) */ xj.v = r.v * hj_inv.v; kernel_deval_vec(&xj, &wj, &wj_dx); @@ -619,7 +788,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( /* Now, convolve with the kernel */ visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v; - sph_term.v = (piPOrho.v * wi_dr.v + pjPOrho.v * wj_dr.v) * ri.v; + sph_term.v = + (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * + ri.v; /* Eventually get the acceleration */ acc.v = visc_term.v + sph_term.v; @@ -764,9 +935,10 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( vector hi, hj, hi_inv, hj_inv; vector hid_inv, hjd_inv; vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; - vector piPOrho, pjPOrho, pirho, pjrho; + vector piPOrho2, pjPOrho2, pirho, pjrho; vector mj; vector f; + vector grad_hi, grad_hj; vector dx[3]; vector vi[3], vj[3]; vector pia[3]; @@ -782,14 +954,20 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( #if VEC_SIZE == 8 mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); - piPOrho.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, - pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2, - pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2, - pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2); - pjPOrho.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, - pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2, - pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2, - pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2); + piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, + pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2, + pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2, + pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2); + pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, + pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2, + pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2, + pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2); + grad_hi.v = + vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f, + pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f); + grad_hj.v = + vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f, + pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f); pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho, pi[5]->rho, pi[6]->rho, pi[7]->rho); pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho, @@ -820,10 +998,14 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( pj[6]->force.balsara, pj[7]->force.balsara); #elif VEC_SIZE == 4 mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); - piPOrho.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, - pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2); - pjPOrho.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, - pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2); + piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, + pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2); + pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, + pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2); + grad_hi.v = + vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f); + grad_hj.v = + vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f); pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho); pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho); ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, @@ -846,14 +1028,12 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( /* Get the radius and inverse radius. */ r2.v = vec_load(R2); - ri.v = vec_rsqrt(r2.v); - ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + ri = vec_reciprocal_sqrt(r2); r.v = r2.v * ri.v; /* Get the kernel for hi. */ hi.v = vec_load(Hi); - hi_inv.v = vec_rcp(hi.v); - hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f)); + hi_inv = vec_reciprocal(hi); hid_inv = pow_dimension_plus_one_vec(hi_inv); xi.v = r.v * hi_inv.v; kernel_deval_vec(&xi, &wi, &wi_dx); @@ -861,8 +1041,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( /* Get the kernel for hj. */ hj.v = vec_load(Hj); - hj_inv.v = vec_rcp(hj.v); - hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f)); + hj_inv = vec_reciprocal(hj); hjd_inv = pow_dimension_plus_one_vec(hj_inv); xj.v = r.v * hj_inv.v; kernel_deval_vec(&xj, &wj, &wj_dx); @@ -888,7 +1067,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( /* Now, convolve with the kernel */ visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v; - sph_term.v = (piPOrho.v * wi_dr.v + pjPOrho.v * wj_dr.v) * ri.v; + sph_term.v = + (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * + ri.v; /* Eventually get the acceleration */ acc.v = visc_term.v + sph_term.v; diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index 8f38fc0d2b98988a48fe36edcbd2f9419d237d41..7bf2e01a719a29b731bb437096093b13ca086e37 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -362,6 +362,117 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v; } +/* Define constant vectors for the Wendland C2 kernel coefficients. */ +#ifdef WENDLAND_C2_KERNEL +static const vector wendland_const_c0 = FILL_VEC(4.f); +static const vector wendland_const_c1 = FILL_VEC(-15.f); +static const vector wendland_const_c2 = FILL_VEC(20.f); +static const vector wendland_const_c3 = FILL_VEC(-10.f); +static const vector wendland_const_c4 = FILL_VEC(0.f); +static const vector wendland_const_c5 = FILL_VEC(1.f); +#endif + +/** + * @brief Computes the kernel function and its derivative for two particles + * using interleaved vectors. + * + * Return 0 if $u > \\gamma = H/h$ + * + * @param u The ratio of the distance to the smoothing length $u = x/h$. + * @param w (return) The value of the kernel function $W(x,h)$. + * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$. + * @param u2 The ratio of the distance to the smoothing length $u = x/h$ for + * second particle. + * @param w2 (return) The value of the kernel function $W(x,h)$ for second + * particle. + * @param dw_dx2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for + * second particle. + */ +__attribute__((always_inline)) INLINE static void kernel_deval_2_vec( + vector *u, vector *w, vector *dw_dx, vector *u2, vector *w2, + vector *dw_dx2) { + + /* Go to the range [0,1[ from [0,H[ */ + vector x, x2; + x.v = vec_mul(u->v, kernel_gamma_inv_vec.v); + x2.v = vec_mul(u2->v, kernel_gamma_inv_vec.v); + +#ifdef WENDLAND_C2_KERNEL + /* Init the iteration for Horner's scheme. */ + w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v); + w2->v = vec_fma(wendland_const_c0.v, x2.v, wendland_const_c1.v); + dw_dx->v = wendland_const_c0.v; + dw_dx2->v = wendland_const_c0.v; + + /* Calculate the polynomial interleaving vector operations */ + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); + w->v = vec_fma(x.v, w->v, wendland_const_c2.v); + w2->v = vec_fma(x2.v, w2->v, wendland_const_c2.v); + + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); + w->v = vec_fma(x.v, w->v, wendland_const_c3.v); + w2->v = vec_fma(x2.v, w2->v, wendland_const_c3.v); + + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); + w->v = vec_fma(x.v, w->v, wendland_const_c4.v); + w2->v = vec_fma(x2.v, w2->v, wendland_const_c4.v); + + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); + w->v = vec_fma(x.v, w->v, wendland_const_c5.v); + w2->v = vec_fma(x2.v, w2->v, wendland_const_c5.v); + + /* Return everything */ + w->v = + vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + w2->v = vec_mul(w2->v, + vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); + dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); +#else + + /* Load x and get the interval id. */ + vector ind, ind2; + ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v)); + ind2.m = vec_ftoi(vec_fmin(x2.v * kernel_ivals_vec.v, kernel_ivals_vec.v)); + + /* load the coefficients. */ + vector c[kernel_degree + 1], c2[kernel_degree + 1]; + for (int k = 0; k < VEC_SIZE; k++) + for (int j = 0; j < kernel_degree + 1; j++) { + c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j]; + c2[j].f[k] = kernel_coeffs[ind2.i[k] * (kernel_degree + 1) + j]; + } + + /* Init the iteration for Horner's scheme. */ + w->v = (c[0].v * x.v) + c[1].v; + w2->v = (c2[0].v * x2.v) + c2[1].v; + dw_dx->v = c[0].v; + dw_dx2->v = c2[0].v; + + /* And we're off! */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx->v = (dw_dx->v * x.v) + w->v; + dw_dx2->v = (dw_dx2->v * x2.v) + w2->v; + w->v = (x.v * w->v) + c[k].v; + w2->v = (x2.v * w2->v) + c2[k].v; + } + /* Return everything */ + w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; + w2->v = w2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; + dw_dx->v = + dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v; + dw_dx2->v = + dw_dx2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v; + +#endif +} + #endif /* Some cross-check functions */ diff --git a/src/runner.c b/src/runner.c index 25ff384ecaabfdad7231b8bcd5f9a5b7e90ac732..23e5c8d2d920023394f466c60deec821bca4d332 100644 --- a/src/runner.c +++ b/src/runner.c @@ -53,6 +53,7 @@ #include "hydro_properties.h" #include "kick.h" #include "minmax.h" +#include "runner_doiact_vec.h" #include "scheduler.h" #include "sourceterms.h" #include "space.h" @@ -1093,7 +1094,13 @@ void *runner_main(void *data) { /* Different types of tasks... */ switch (t->type) { case task_type_self: - if (t->subtype == task_subtype_density) runner_doself1_density(r, ci); + if (t->subtype == task_subtype_density) { +#ifdef WITH_VECTORIZATION + runner_doself1_density_vec(r, ci); +#else + runner_doself1_density(r, ci); +#endif + } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) runner_doself1_gradient(r, ci); diff --git a/src/runner.h b/src/runner.h index 6aab8469a1eb9aba22dac31553424bbcce8f183f..2ee4b3e4c4d5c93a1180af245e18b4b68a2db188 100644 --- a/src/runner.h +++ b/src/runner.h @@ -23,6 +23,8 @@ #ifndef SWIFT_RUNNER_H #define SWIFT_RUNNER_H +#include "cache.h" + extern const double runner_shift[13][3]; extern const char runner_flip[27]; @@ -45,6 +47,9 @@ struct runner { /*! The engine owing this runner. */ struct engine *e; + + /*! The particle cache of this runner. */ + struct cache par_cache; }; /* Function prototypes. */ diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..5db0ec6f89168f2d8cc9f4da574aeaab9ba54ca1 --- /dev/null +++ b/src/runner_doiact_vec.c @@ -0,0 +1,868 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + +/* Config parameters. */ +#include "../config.h" + +/* This object's header. */ +#include "runner_doiact_vec.h" + +#ifdef WITH_VECTORIZATION +/** + * @brief Compute the vector remainder interactions from the secondary cache. + * + * @param int_cache (return) secondary #cache of interactions between two + * particles. + * @param icount Interaction count. + * @param rhoSum (return) #vector holding the cumulative sum of the density + * update on pi. + * @param rho_dhSum (return) #vector holding the cumulative sum of the density + * gradient update on pi. + * @param wcountSum (return) #vector holding the cumulative sum of the wcount + * update on pi. + * @param wcount_dhSum (return) #vector holding the cumulative sum of the wcount + * gradient update on pi. + * @param div_vSum (return) #vector holding the cumulative sum of the divergence + * update on pi. + * @param curlvxSum (return) #vector holding the cumulative sum of the curl of + * vx update on pi. + * @param curlvySum (return) #vector holding the cumulative sum of the curl of + * vy update on pi. + * @param curlvzSum (return) #vector holding the cumulative sum of the curl of + * vz update on pi. + * @param v_hi_inv #vector of 1/h for pi. + * @param v_vix #vector of x velocity of pi. + * @param v_viy #vector of y velocity of pi. + * @param v_viz #vector of z velocity of pi. + * @param icount_align (return) Interaction count after the remainder + * interactions have been performed, should be a multiple of the vector length. + */ +__attribute__((always_inline)) INLINE static void calcRemInteractions( + struct c2_cache *const int_cache, const int icount, vector *rhoSum, + vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, + vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, + vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz, + int *icount_align) { + +#ifdef HAVE_AVX512_F + KNL_MASK_16 knl_mask, knl_mask2; +#endif + vector int_mask, int_mask2; + + /* Work out the number of remainder interactions and pad secondary cache. */ + *icount_align = icount; + int rem = icount % (NUM_VEC_PROC * VEC_SIZE); + if (rem != 0) { + int pad = (NUM_VEC_PROC * VEC_SIZE) - rem; + *icount_align += pad; + +/* Initialise masks to true. */ +#ifdef HAVE_AVX512_F + knl_mask = 0xFFFF; + knl_mask2 = 0xFFFF; + int_mask.m = vec_setint1(0xFFFFFFFF); + int_mask2.m = vec_setint1(0xFFFFFFFF); +#else + int_mask.m = vec_setint1(0xFFFFFFFF); + int_mask2.m = vec_setint1(0xFFFFFFFF); +#endif + /* Pad secondary cache so that there are no contributions in the interaction + * function. */ + for (int i = icount; i < *icount_align; i++) { + int_cache->mq[i] = 0.f; + int_cache->r2q[i] = 1.f; + int_cache->dxq[i] = 0.f; + int_cache->dyq[i] = 0.f; + int_cache->dzq[i] = 0.f; + int_cache->vxq[i] = 0.f; + int_cache->vyq[i] = 0.f; + int_cache->vzq[i] = 0.f; + } + + /* Zero parts of mask that represent the padded values.*/ + if (pad < VEC_SIZE) { +#ifdef HAVE_AVX512_F + knl_mask2 = knl_mask2 >> pad; +#else + for (int i = VEC_SIZE - pad; i < VEC_SIZE; i++) int_mask2.i[i] = 0; +#endif + } else { +#ifdef HAVE_AVX512_F + knl_mask = knl_mask >> (VEC_SIZE - rem); + knl_mask2 = 0; +#else + for (int i = rem; i < VEC_SIZE; i++) int_mask.i[i] = 0; + int_mask2.v = vec_setzero(); +#endif + } + + /* Perform remainder interaction and remove remainder from aligned + * interaction count. */ + *icount_align = icount - rem; + runner_iact_nonsym_2_vec_density( + &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align], + &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], + v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align], + &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align], + &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum, + wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, + int_mask2, +#ifdef HAVE_AVX512_F + knl_mask, knl_mask2); +#else + 0, 0); +#endif + } +} + +/** + * @brief Left-packs the values needed by an interaction into the secondary + * cache (Supports AVX, AVX2 and AVX512 instruction sets). + * + * @param mask Contains which particles need to interact. + * @param pjd Index of the particle to store into. + * @param v_r2 #vector of the separation between two particles squared. + * @param v_dx #vector of the x separation between two particles. + * @param v_dy #vector of the y separation between two particles. + * @param v_dz #vector of the z separation between two particles. + * @param v_mj #vector of the mass of particle pj. + * @param v_vjx #vector of x velocity of pj. + * @param v_vjy #vector of y velocity of pj. + * @param v_vjz #vector of z velocity of pj. + * @param cell_cache #cache of all particles in the cell. + * @param int_cache (return) secondary #cache of interactions between two + * particles. + * @param icount Interaction count. + * @param rhoSum #vector holding the cumulative sum of the density update on pi. + * @param rho_dhSum #vector holding the cumulative sum of the density gradient + * update on pi. + * @param wcountSum #vector holding the cumulative sum of the wcount update on + * pi. + * @param wcount_dhSum #vector holding the cumulative sum of the wcount gradient + * update on pi. + * @param div_vSum #vector holding the cumulative sum of the divergence update + * on pi. + * @param curlvxSum #vector holding the cumulative sum of the curl of vx update + * on pi. + * @param curlvySum #vector holding the cumulative sum of the curl of vy update + * on pi. + * @param curlvzSum #vector holding the cumulative sum of the curl of vz update + * on pi. + * @param v_hi_inv #vector of 1/h for pi. + * @param v_vix #vector of x velocity of pi. + * @param v_viy #vector of y velocity of pi. + * @param v_viz #vector of z velocity of pi. + */ +__attribute__((always_inline)) INLINE static void storeInteractions( + const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, + vector *v_dz, vector *v_mj, vector *v_vjx, vector *v_vjy, vector *v_vjz, + const struct cache *const cell_cache, struct c2_cache *const int_cache, + int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, + vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, + vector *curlvySum, vector *curlvzSum, vector v_hi_inv, vector v_vix, + vector v_viy, vector v_viz) { + +/* Left-pack values needed into the secondary cache using the interaction mask. + */ +#if defined(HAVE_AVX2) || defined(HAVE_AVX512_F) + int pack = 0; + +#ifdef HAVE_AVX512_F + pack += __builtin_popcount(mask); + VEC_LEFT_PACK(v_r2->v, mask, &int_cache->r2q[*icount]); + VEC_LEFT_PACK(v_dx->v, mask, &int_cache->dxq[*icount]); + VEC_LEFT_PACK(v_dy->v, mask, &int_cache->dyq[*icount]); + VEC_LEFT_PACK(v_dz->v, mask, &int_cache->dzq[*icount]); + VEC_LEFT_PACK(v_mj->v, mask, &int_cache->mq[*icount]); + VEC_LEFT_PACK(v_vjx->v, mask, &int_cache->vxq[*icount]); + VEC_LEFT_PACK(v_vjy->v, mask, &int_cache->vyq[*icount]); + VEC_LEFT_PACK(v_vjz->v, mask, &int_cache->vzq[*icount]); +#else + vector v_mask; + VEC_FORM_PACKED_MASK(mask, v_mask.m, pack); + + VEC_LEFT_PACK(v_r2->v, v_mask.m, &int_cache->r2q[*icount]); + VEC_LEFT_PACK(v_dx->v, v_mask.m, &int_cache->dxq[*icount]); + VEC_LEFT_PACK(v_dy->v, v_mask.m, &int_cache->dyq[*icount]); + VEC_LEFT_PACK(v_dz->v, v_mask.m, &int_cache->dzq[*icount]); + VEC_LEFT_PACK(v_mj->v, v_mask.m, &int_cache->mq[*icount]); + VEC_LEFT_PACK(v_vjx->v, v_mask.m, &int_cache->vxq[*icount]); + VEC_LEFT_PACK(v_vjy->v, v_mask.m, &int_cache->vyq[*icount]); + VEC_LEFT_PACK(v_vjz->v, v_mask.m, &int_cache->vzq[*icount]); + +#endif /* HAVE_AVX512_F */ + + (*icount) += pack; +#else + /* Quicker to do it serially in AVX rather than use intrinsics. */ + for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) { + if (mask & (1 << bit_index)) { + /* Add this interaction to the queue. */ + int_cache->r2q[*icount] = v_r2->f[bit_index]; + int_cache->dxq[*icount] = v_dx->f[bit_index]; + int_cache->dyq[*icount] = v_dy->f[bit_index]; + int_cache->dzq[*icount] = v_dz->f[bit_index]; + int_cache->mq[*icount] = cell_cache->m[pjd + bit_index]; + int_cache->vxq[*icount] = cell_cache->vx[pjd + bit_index]; + int_cache->vyq[*icount] = cell_cache->vy[pjd + bit_index]; + int_cache->vzq[*icount] = cell_cache->vz[pjd + bit_index]; + + (*icount)++; + } + } + +#endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */ + + /* Flush the c2 cache if it has reached capacity. */ + if (*icount >= (C2_CACHE_SIZE - (NUM_VEC_PROC * VEC_SIZE))) { + + int icount_align = *icount; + + /* Peform remainder interactions. */ + calcRemInteractions(int_cache, *icount, rhoSum, rho_dhSum, wcountSum, + wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, + v_hi_inv, v_vix, v_viy, v_viz, &icount_align); + + vector int_mask, int_mask2; + int_mask.m = vec_setint1(0xFFFFFFFF); + int_mask2.m = vec_setint1(0xFFFFFFFF); + + /* Perform interactions. */ + for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { + runner_iact_nonsym_2_vec_density( + &int_cache->r2q[pjd], &int_cache->dxq[pjd], &int_cache->dyq[pjd], + &int_cache->dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, + &int_cache->vxq[pjd], &int_cache->vyq[pjd], &int_cache->vzq[pjd], + &int_cache->mq[pjd], rhoSum, rho_dhSum, wcountSum, wcount_dhSum, + div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0, 0); + } + + /* Reset interaction count. */ + *icount = 0; + } +} +#endif /* WITH_VECTORIZATION */ + +/** + * @brief Compute the cell self-interaction (non-symmetric) using vector + * intrinsics with one particle pi at a time. + * + * @param r The #runner. + * @param c The #cell. + */ +__attribute__((always_inline)) INLINE void runner_doself1_density_vec( + struct runner *r, struct cell *restrict c) { + +#ifdef WITH_VECTORIZATION + const int ti_current = r->e->ti_current; + int doi_mask; + struct part *restrict pi; + int count_align; + int num_vec_proc = NUM_VEC_PROC; + + struct part *restrict parts = c->parts; + const int count = c->count; + + vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2; + + TIMER_TIC + + if (c->ti_end_min > ti_current) return; + if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone"); + + /* Get the particle cache from the runner and re-allocate + * the cache if it is not big enough for the cell. */ + struct cache *restrict cell_cache = &r->par_cache; + + if (cell_cache->count < count) { + cache_init(cell_cache, count); + } + + /* Read the particles from the cell and store them locally in the cache. */ + cache_read_particles(c, cell_cache); + + /* Create secondary cache to store particle interactions. */ + struct c2_cache int_cache; + int icount = 0, icount_align = 0; + + /* Loop over the particles in the cell. */ + for (int pid = 0; pid < count; pid++) { + + /* Get a pointer to the ith particle. */ + pi = &parts[pid]; + + /* Is the ith particle active? */ + if (pi->ti_end > ti_current) continue; + + vector pix, piy, piz; + + const float hi = cell_cache->h[pid]; + + /* Fill particle pi vectors. */ + pix.v = vec_set1(cell_cache->x[pid]); + piy.v = vec_set1(cell_cache->y[pid]); + piz.v = vec_set1(cell_cache->z[pid]); + v_hi.v = vec_set1(hi); + v_vix.v = vec_set1(cell_cache->vx[pid]); + v_viy.v = vec_set1(cell_cache->vy[pid]); + v_viz.v = vec_set1(cell_cache->vz[pid]); + + const float hig2 = hi * hi * kernel_gamma2; + v_hig2.v = vec_set1(hig2); + + /* Reset cumulative sums of update vectors. */ + vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, + curlvySum, curlvzSum; + + /* Get the inverse of hi. */ + vector v_hi_inv; + + v_hi_inv = vec_reciprocal(v_hi); + + rhoSum.v = vec_setzero(); + rho_dhSum.v = vec_setzero(); + wcountSum.v = vec_setzero(); + wcount_dhSum.v = vec_setzero(); + div_vSum.v = vec_setzero(); + curlvxSum.v = vec_setzero(); + curlvySum.v = vec_setzero(); + curlvzSum.v = vec_setzero(); + + /* Pad cache if there is a serial remainder. */ + count_align = count; + int rem = count % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; + + count_align += pad; + /* Set positions to the same as particle pi so when the r2 > 0 mask is + * applied these extra contributions are masked out.*/ + for (int i = count; i < count_align; i++) { + cell_cache->x[i] = pix.f[0]; + cell_cache->y[i] = piy.f[0]; + cell_cache->z[i] = piz.f[0]; + } + } + + vector pjx, pjy, pjz; + vector pjvx, pjvy, pjvz, mj; + vector pjx2, pjy2, pjz2; + vector pjvx2, pjvy2, pjvz2, mj2; + + /* Find all of particle pi's interacions and store needed values in the + * secondary cache.*/ + for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { + + /* Load 2 sets of vectors from the particle cache. */ + pjx.v = vec_load(&cell_cache->x[pjd]); + pjy.v = vec_load(&cell_cache->y[pjd]); + pjz.v = vec_load(&cell_cache->z[pjd]); + pjvx.v = vec_load(&cell_cache->vx[pjd]); + pjvy.v = vec_load(&cell_cache->vy[pjd]); + pjvz.v = vec_load(&cell_cache->vz[pjd]); + mj.v = vec_load(&cell_cache->m[pjd]); + + pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); + pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); + pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); + pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]); + pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]); + pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]); + mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]); + + /* Compute the pairwise distance. */ + vector v_dx_tmp, v_dy_tmp, v_dz_tmp; + vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2; + + v_dx_tmp.v = vec_sub(pix.v, pjx.v); + v_dy_tmp.v = vec_sub(piy.v, pjy.v); + v_dz_tmp.v = vec_sub(piz.v, pjz.v); + v_dx_tmp2.v = vec_sub(pix.v, pjx2.v); + v_dy_tmp2.v = vec_sub(piy.v, pjy2.v); + v_dz_tmp2.v = vec_sub(piz.v, pjz2.v); + + v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); + v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); + v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); + v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v); + v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v); + v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v); + +/* Form a mask from r2 < hig2 and r2 > 0.*/ +#ifdef HAVE_AVX512_F + // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check; + KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check; + + doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero()); + doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v); + + doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero()); + doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v); + + doi_mask = doi_mask & doi_mask_check; + doi_mask2 = doi_mask2 & doi_mask2_check; + +#else + vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check; + int doi_mask2; + + /* Form r2 > 0 mask and r2 < hig2 mask. */ + v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero()); + v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); + + /* Form r2 > 0 mask and r2 < hig2 mask. */ + v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero()); + v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); + + /* Combine two masks and form integer mask. */ + doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v)); + doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v)); +#endif /* HAVE_AVX512_F */ + + /* If there are any interactions left pack interaction values into c2 + * cache. */ + if (doi_mask) { + storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, + &mj, &pjvx, &pjvy, &pjvz, cell_cache, &int_cache, + &icount, &rhoSum, &rho_dhSum, &wcountSum, + &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, + &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); + } + if (doi_mask2) { + storeInteractions( + doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, &v_dy_tmp2, + &v_dz_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2, cell_cache, &int_cache, + &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, + &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); + } + } + + /* Perform padded vector remainder interactions if any are present. */ + calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum, + &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, + &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, + &icount_align); + + /* Initialise masks to true in case remainder interactions have been + * performed. */ + vector int_mask, int_mask2; +#ifdef HAVE_AVX512_F + KNL_MASK_16 knl_mask = 0xFFFF; + KNL_MASK_16 knl_mask2 = 0xFFFF; + int_mask.m = vec_setint1(0xFFFFFFFF); + int_mask2.m = vec_setint1(0xFFFFFFFF); +#else + int_mask.m = vec_setint1(0xFFFFFFFF); + int_mask2.m = vec_setint1(0xFFFFFFFF); +#endif + + /* Perform interaction with 2 vectors. */ + for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) { + runner_iact_nonsym_2_vec_density( + &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], + &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, + &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], + &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, +#ifdef HAVE_AVX512_F + knl_mask, knl_mask2); +#else + 0, 0); +#endif + } + + /* Perform horizontal adds on vector sums and store result in particle pi. + */ + VEC_HADD(rhoSum, pi->rho); + VEC_HADD(rho_dhSum, pi->density.rho_dh); + VEC_HADD(wcountSum, pi->density.wcount); + VEC_HADD(wcount_dhSum, pi->density.wcount_dh); + VEC_HADD(div_vSum, pi->density.div_v); + VEC_HADD(curlvxSum, pi->density.rot_v[0]); + VEC_HADD(curlvySum, pi->density.rot_v[1]); + VEC_HADD(curlvzSum, pi->density.rot_v[2]); + + /* Reset interaction count. */ + icount = 0; + } /* loop over all particles. */ + + TIMER_TOC(timer_doself_density); +#endif /* WITH_VECTORIZATION */ +} + +/** + * @brief Compute the cell self-interaction (non-symmetric) using vector + * intrinsics with two particle pis at a time. + * CURRENTLY BROKEN DO NOT USE. + * + * @param r The #runner. + * @param c The #cell. + */ +__attribute__((always_inline)) INLINE void runner_doself1_density_vec_2( + struct runner *r, struct cell *restrict c) { + +#ifdef WITH_VECTORIZATION + const int ti_current = r->e->ti_current; + int doi_mask; + int doi2_mask; + struct part *restrict pi; + struct part *restrict pi2; + int count_align; + + vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2; + vector v_hi2, v_vix2, v_viy2, v_viz2, v_hig2_2, v2_r2; + + TIMER_TIC + + /* TODO: Need to find two active particles, not just one. */ + if (c->ti_end_min > ti_current) return; + if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone"); + + struct part *restrict parts = c->parts; + const int count = c->count; + + /* Get the particle cache from the runner and re-allocate + * the cache if it is not big enough for the cell. */ + struct cache *restrict cell_cache = &r->par_cache; + + if (cell_cache->count < count) { + cache_init(cell_cache, count); + } + + /* Read the particles from the cell and store them locally in the cache. */ + cache_read_particles(c, &r->par_cache); + + /* Create two secondary caches. */ + int icount = 0, icount_align = 0; + struct c2_cache int_cache; + + int icount2 = 0, icount_align2 = 0; + struct c2_cache int_cache2; + + /* Loop over the particles in the cell. */ + for (int pid = 0; pid < count; pid += 2) { + + /* Get a pointer to the ith particle and next i particle. */ + pi = &parts[pid]; + pi2 = &parts[pid + 1]; + + /* Is the ith particle active? */ + if (pi->ti_end > ti_current) continue; + + vector pix, piy, piz; + vector pix2, piy2, piz2; + + const float hi = cell_cache->h[pid]; + const float hi2 = cell_cache->h[pid + 1]; + + /* Fill pi position vector. */ + pix.v = vec_set1(cell_cache->x[pid]); + piy.v = vec_set1(cell_cache->y[pid]); + piz.v = vec_set1(cell_cache->z[pid]); + v_hi.v = vec_set1(hi); + v_vix.v = vec_set1(cell_cache->vx[pid]); + v_viy.v = vec_set1(cell_cache->vy[pid]); + v_viz.v = vec_set1(cell_cache->vz[pid]); + + pix2.v = vec_set1(cell_cache->x[pid + 1]); + piy2.v = vec_set1(cell_cache->y[pid + 1]); + piz2.v = vec_set1(cell_cache->z[pid + 1]); + v_hi2.v = vec_set1(hi2); + v_vix2.v = vec_set1(cell_cache->vx[pid + 1]); + v_viy2.v = vec_set1(cell_cache->vy[pid + 1]); + v_viz2.v = vec_set1(cell_cache->vz[pid + 1]); + + const float hig2 = hi * hi * kernel_gamma2; + const float hig2_2 = hi2 * hi2 * kernel_gamma2; + v_hig2.v = vec_set1(hig2); + v_hig2_2.v = vec_set1(hig2_2); + + vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, + curlvySum, curlvzSum; + vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2, + curlvxSum2, curlvySum2, curlvzSum2; + + vector v_hi_inv, v_hi_inv2; + + v_hi_inv = vec_reciprocal(v_hi); + v_hi_inv2 = vec_reciprocal(v_hi2); + + rhoSum.v = vec_setzero(); + rho_dhSum.v = vec_setzero(); + wcountSum.v = vec_setzero(); + wcount_dhSum.v = vec_setzero(); + div_vSum.v = vec_setzero(); + curlvxSum.v = vec_setzero(); + curlvySum.v = vec_setzero(); + curlvzSum.v = vec_setzero(); + + rhoSum2.v = vec_setzero(); + rho_dhSum2.v = vec_setzero(); + wcountSum2.v = vec_setzero(); + wcount_dhSum2.v = vec_setzero(); + div_vSum2.v = vec_setzero(); + curlvxSum2.v = vec_setzero(); + curlvySum2.v = vec_setzero(); + curlvzSum2.v = vec_setzero(); + + /* Pad cache if there is a serial remainder. */ + count_align = count; + int rem = count % (NUM_VEC_PROC * VEC_SIZE); + if (rem != 0) { + int pad = (NUM_VEC_PROC * VEC_SIZE) - rem; + + count_align += pad; + /* Set positions to the same as particle pi so when the r2 > 0 mask is + * applied these extra contributions are masked out.*/ + for (int i = count; i < count_align; i++) { + cell_cache->x[i] = pix.f[0]; + cell_cache->y[i] = piy.f[0]; + cell_cache->z[i] = piz.f[0]; + } + } + + vector pjx, pjy, pjz; + vector pjvx, pjvy, pjvz, mj; + vector pjx2, pjy2, pjz2; + vector pjvx2, pjvy2, pjvz2, mj2; + + /* Find all of particle pi's interacions and store needed values in + * secondary cache.*/ + for (int pjd = 0; pjd < count_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { + + /* Load 2 sets of vectors from the particle cache. */ + pjx.v = vec_load(&cell_cache->x[pjd]); + pjy.v = vec_load(&cell_cache->y[pjd]); + pjz.v = vec_load(&cell_cache->z[pjd]); + pjvx.v = vec_load(&cell_cache->vx[pjd]); + pjvy.v = vec_load(&cell_cache->vy[pjd]); + pjvz.v = vec_load(&cell_cache->vz[pjd]); + mj.v = vec_load(&cell_cache->m[pjd]); + + pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); + pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); + pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); + pjvx2.v = vec_load(&cell_cache->vx[pjd + VEC_SIZE]); + pjvy2.v = vec_load(&cell_cache->vy[pjd + VEC_SIZE]); + pjvz2.v = vec_load(&cell_cache->vz[pjd + VEC_SIZE]); + mj2.v = vec_load(&cell_cache->m[pjd + VEC_SIZE]); + + /* Compute the pairwise distance. */ + vector v_dx_tmp, v_dy_tmp, v_dz_tmp; + vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2; + vector v_dx2_tmp, v_dy2_tmp, v_dz2_tmp; + vector v_dx2_tmp2, v_dy2_tmp2, v_dz2_tmp2, v2_r2_2; + + v_dx_tmp.v = vec_sub(pix.v, pjx.v); + v_dy_tmp.v = vec_sub(piy.v, pjy.v); + v_dz_tmp.v = vec_sub(piz.v, pjz.v); + v_dx_tmp2.v = vec_sub(pix.v, pjx2.v); + v_dy_tmp2.v = vec_sub(piy.v, pjy2.v); + v_dz_tmp2.v = vec_sub(piz.v, pjz2.v); + + v_dx2_tmp.v = vec_sub(pix2.v, pjx.v); + v_dy2_tmp.v = vec_sub(piy2.v, pjy.v); + v_dz2_tmp.v = vec_sub(piz2.v, pjz.v); + v_dx2_tmp2.v = vec_sub(pix2.v, pjx2.v); + v_dy2_tmp2.v = vec_sub(piy2.v, pjy2.v); + v_dz2_tmp2.v = vec_sub(piz2.v, pjz2.v); + + v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); + v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); + v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); + v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v); + v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v); + v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v); + + v2_r2.v = vec_mul(v_dx2_tmp.v, v_dx2_tmp.v); + v2_r2.v = vec_fma(v_dy2_tmp.v, v_dy2_tmp.v, v2_r2.v); + v2_r2.v = vec_fma(v_dz2_tmp.v, v_dz2_tmp.v, v2_r2.v); + v2_r2_2.v = vec_mul(v_dx2_tmp2.v, v_dx2_tmp2.v); + v2_r2_2.v = vec_fma(v_dy2_tmp2.v, v_dy2_tmp2.v, v2_r2_2.v); + v2_r2_2.v = vec_fma(v_dz2_tmp2.v, v_dz2_tmp2.v, v2_r2_2.v); + +/* Form a mask from r2 < hig2 and r2 > 0.*/ +#ifdef HAVE_AVX512_F + // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check; + KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check; + KNL_MASK_16 doi2_mask_check, doi2_mask2, doi2_mask2_check; + + doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero()); + doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v); + + doi2_mask_check = vec_cmp_gt(v2_r2.v, vec_setzero()); + doi2_mask = vec_cmp_lt(v2_r2.v, v_hig2_2.v); + + doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero()); + doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v); + + doi2_mask2_check = vec_cmp_gt(v2_r2_2.v, vec_setzero()); + doi2_mask2 = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v); + + doi_mask = doi_mask & doi_mask_check; + doi_mask2 = doi_mask2 & doi_mask2_check; + + doi2_mask = doi2_mask & doi2_mask_check; + doi2_mask2 = doi2_mask2 & doi2_mask2_check; +#else + vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check; + int doi_mask2; + + vector v_doi2_mask, v_doi2_mask_check, v_doi2_mask2, v_doi2_mask2_check; + int doi2_mask2; + + v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero()); + v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); + + v_doi2_mask_check.v = vec_cmp_gt(v2_r2.v, vec_setzero()); + v_doi2_mask.v = vec_cmp_lt(v2_r2.v, v_hig2_2.v); + + v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero()); + v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); + + v_doi2_mask2_check.v = vec_cmp_gt(v2_r2_2.v, vec_setzero()); + v_doi2_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v); + + doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v)); + doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v)); + doi2_mask = vec_cmp_result(vec_and(v_doi2_mask.v, v_doi2_mask_check.v)); + doi2_mask2 = + vec_cmp_result(vec_and(v_doi2_mask2.v, v_doi2_mask2_check.v)); +#endif /* HAVE_AVX512_F */ + + /* Hit or miss? */ + // if (doi_mask) { + storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, + &mj, &pjvx, &pjvy, &pjvz, cell_cache, &int_cache, + &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, + v_vix, v_viy, v_viz); + //} + // if (doi2_mask) { + storeInteractions( + doi2_mask, pjd, &v2_r2, &v_dx2_tmp, &v_dy2_tmp, &v_dz2_tmp, &mj, + &pjvx, &pjvy, &pjvz, cell_cache, &int_cache2, &icount2, &rhoSum2, + &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, + &curlvySum2, &curlvzSum2, v_hi_inv2, v_vix2, v_viy2, v_viz2); + //} + /* Hit or miss? */ + // if (doi_mask2) { + storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, + &v_dy_tmp2, &v_dz_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2, + cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum, + &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, + &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); + //} + // if (doi2_mask2) { + storeInteractions(doi2_mask2, pjd + VEC_SIZE, &v2_r2_2, &v_dx2_tmp2, + &v_dy2_tmp2, &v_dz2_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2, + cell_cache, &int_cache2, &icount2, &rhoSum2, + &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, + &curlvxSum2, &curlvySum2, &curlvzSum2, v_hi_inv2, + v_vix2, v_viy2, v_viz2); + //} + } + + /* Perform padded vector remainder interactions if any are present. */ + calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum, + &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, + &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, + &icount_align); + + calcRemInteractions(&int_cache2, icount2, &rhoSum2, &rho_dhSum2, + &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, + &curlvySum2, &curlvzSum2, v_hi_inv2, v_vix2, v_viy2, + v_viz2, &icount_align2); + + /* Initialise masks to true incase remainder interactions have been + * performed. */ + vector int_mask, int_mask2; + vector int2_mask, int2_mask2; +#ifdef HAVE_AVX512_F + KNL_MASK_16 knl_mask = 0xFFFF; + KNL_MASK_16 knl_mask2 = 0xFFFF; + int_mask.m = vec_setint1(0xFFFFFFFF); + int_mask2.m = vec_setint1(0xFFFFFFFF); + int2_mask.m = vec_setint1(0xFFFFFFFF); + int2_mask2.m = vec_setint1(0xFFFFFFFF); +#else + int_mask.m = vec_setint1(0xFFFFFFFF); + int_mask2.m = vec_setint1(0xFFFFFFFF); + + int2_mask.m = vec_setint1(0xFFFFFFFF); + int2_mask2.m = vec_setint1(0xFFFFFFFF); +#endif + + /* Perform interaction with 2 vectors. */ + for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { + runner_iact_nonsym_2_vec_density( + &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], + &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, + &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], + &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, +#ifdef HAVE_AVX512_F + knl_mask, knl_mask2); +#else + 0, 0); +#endif + } + + for (int pjd = 0; pjd < icount_align2; pjd += (NUM_VEC_PROC * VEC_SIZE)) { + runner_iact_nonsym_2_vec_density( + &int_cache2.r2q[pjd], &int_cache2.dxq[pjd], &int_cache2.dyq[pjd], + &int_cache2.dzq[pjd], v_hi_inv2, v_vix2, v_viy2, v_viz2, + &int_cache2.vxq[pjd], &int_cache2.vyq[pjd], &int_cache2.vzq[pjd], + &int_cache2.mq[pjd], &rhoSum2, &rho_dhSum2, &wcountSum2, + &wcount_dhSum2, &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2, + int2_mask, int2_mask2, +#ifdef HAVE_AVX512_F + knl_mask, knl_mask2); +#else + 0, 0); +#endif + } + /* Perform horizontal adds on vector sums and store result in particle pi. + */ + VEC_HADD(rhoSum, pi->rho); + VEC_HADD(rho_dhSum, pi->density.rho_dh); + VEC_HADD(wcountSum, pi->density.wcount); + VEC_HADD(wcount_dhSum, pi->density.wcount_dh); + VEC_HADD(div_vSum, pi->density.div_v); + VEC_HADD(curlvxSum, pi->density.rot_v[0]); + VEC_HADD(curlvySum, pi->density.rot_v[1]); + VEC_HADD(curlvzSum, pi->density.rot_v[2]); + + VEC_HADD(rhoSum2, pi2->rho); + VEC_HADD(rho_dhSum2, pi2->density.rho_dh); + VEC_HADD(wcountSum2, pi2->density.wcount); + VEC_HADD(wcount_dhSum2, pi2->density.wcount_dh); + VEC_HADD(div_vSum2, pi2->density.div_v); + VEC_HADD(curlvxSum2, pi2->density.rot_v[0]); + VEC_HADD(curlvySum2, pi2->density.rot_v[1]); + VEC_HADD(curlvzSum2, pi2->density.rot_v[2]); + + /* Reset interaction count. */ + icount = 0; + icount2 = 0; + } /* loop over all particles. */ + + TIMER_TOC(timer_doself_density); +#endif /* WITH_VECTORIZATION */ +} diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h new file mode 100644 index 0000000000000000000000000000000000000000..9bb24f12cedf03ec49a5a03f92d308f92d49aa54 --- /dev/null +++ b/src/runner_doiact_vec.h @@ -0,0 +1,39 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + +#ifndef SWIFT_RUNNER_VEC_H +#define SWIFT_RUNNER_VEC_H + +/* Config parameters. */ +#include "../config.h" + +/* Local headers */ +#include "cell.h" +#include "engine.h" +#include "hydro.h" +#include "part.h" +#include "runner.h" +#include "timers.h" +#include "vector.h" + +/* Function prototypes. */ +void runner_doself1_density_vec(struct runner *r, struct cell *restrict c); +void runner_doself1_density_vec_2(struct runner *r, struct cell *restrict c); + +#endif /* SWIFT_RUNNER_VEC_H */ diff --git a/src/swift.h b/src/swift.h index 2928c263525f57a7ee999b50547aa374b456f556..6c2bcf1811c336b7d5dd2b838b4a4f518ba34ebb 100644 --- a/src/swift.h +++ b/src/swift.h @@ -24,6 +24,7 @@ /* Local headers. */ #include "atomic.h" +#include "cache.h" #include "cell.h" #include "clocks.h" #include "const.h" diff --git a/src/tools.c b/src/tools.c index e526bb1b838f6d97b72eadb4070f3f2a94938c04..f4cb86744332dd211ac07e366cd048049b2068bd 100644 --- a/src/tools.c +++ b/src/tools.c @@ -558,7 +558,171 @@ void shuffle_particles(struct part *parts, const int count) { } /** - * @brief Computes the forces between all g-particles using the N^2 algorithm + * @brief Compares two values based on their relative difference: |a - b|/|a + + * b| + * + * @param a Value a + * @param b Value b + * @param threshold The limit on the relative difference between the two values + * @param absDiff Absolute difference: |a - b| + * @param absSum Absolute sum: |a + b| + * @param relDiff Relative difference: |a - b|/|a + b| + * + * @return 1 if difference found, 0 otherwise + */ +int compare_values(double a, double b, double threshold, double *absDiff, + double *absSum, double *relDiff) { + + int result = 0; + *absDiff = 0.0, *absSum = 0.0, *relDiff = 0.0; + + *absDiff = fabs(a - b); + *absSum = fabs(a + b); + if (*absSum > 0.f) { + *relDiff = *absDiff / *absSum; + } + + if (*relDiff > threshold) { + result = 1; + } + + return result; +} + +/** + * @brief Compares two particles' properties using the relative difference and a + * threshold. + * + * @param a Particle A + * @param b Particle B + * @param threshold The limit on the relative difference between the two values + * + * @return 1 if difference found, 0 otherwise + */ +int compare_particles(struct part a, struct part b, double threshold) { + int result = 0; + double absDiff = 0.0, absSum = 0.0, relDiff = 0.0; + + for (int k = 0; k < 3; k++) { + if (compare_values(a.x[k], b.x[k], threshold, &absDiff, &absSum, + &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for x[%d] of " + "particle %lld.", + relDiff, threshold, k, a.id); + message("a = %e, b = %e", a.x[k], b.x[k]); + result = 1; + } + } + for (int k = 0; k < 3; k++) { + if (compare_values(a.v[k], b.v[k], threshold, &absDiff, &absSum, + &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for v[%d] of " + "particle %lld.", + relDiff, threshold, k, a.id); + message("a = %e, b = %e", a.v[k], b.v[k]); + result = 1; + } + } + for (int k = 0; k < 3; k++) { + if (compare_values(a.a_hydro[k], b.a_hydro[k], threshold, &absDiff, &absSum, + &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for a_hydro[%d] " + "of particle %lld.", + relDiff, threshold, k, a.id); + message("a = %e, b = %e", a.a_hydro[k], b.a_hydro[k]); + result = 1; + } + } + if (compare_values(a.rho, b.rho, threshold, &absDiff, &absSum, &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for rho of " + "particle %lld.", + relDiff, threshold, a.id); + message("a = %e, b = %e", a.rho, b.rho); + result = 1; + } + if (compare_values(a.density.rho_dh, b.density.rho_dh, threshold, &absDiff, + &absSum, &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for rho_dh of " + "particle %lld.", + relDiff, threshold, a.id); + message("a = %e, b = %e", a.density.rho_dh, b.density.rho_dh); + result = 1; + } + if (compare_values(a.density.wcount, b.density.wcount, threshold, &absDiff, + &absSum, &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for wcount of " + "particle %lld.", + relDiff, threshold, a.id); + message("a = %e, b = %e", a.density.wcount, b.density.wcount); + result = 1; + } + if (compare_values(a.density.wcount_dh, b.density.wcount_dh, threshold, + &absDiff, &absSum, &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for wcount_dh of " + "particle %lld.", + relDiff, threshold, a.id); + message("a = %e, b = %e", a.density.wcount_dh, b.density.wcount_dh); + result = 1; + } + if (compare_values(a.force.h_dt, b.force.h_dt, threshold, &absDiff, &absSum, + &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for h_dt of " + "particle %lld.", + relDiff, threshold, a.id); + message("a = %e, b = %e", a.force.h_dt, b.force.h_dt); + result = 1; + } + if (compare_values(a.force.v_sig, b.force.v_sig, threshold, &absDiff, &absSum, + &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for v_sig of " + "particle %lld.", + relDiff, threshold, a.id); + message("a = %e, b = %e", a.force.v_sig, b.force.v_sig); + result = 1; + } + if (compare_values(a.entropy_dt, b.entropy_dt, threshold, &absDiff, &absSum, + &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for entropy_dt of " + "particle %lld.", + relDiff, threshold, a.id); + message("a = %e, b = %e", a.entropy_dt, b.entropy_dt); + result = 1; + } + if (compare_values(a.density.div_v, b.density.div_v, threshold, &absDiff, + &absSum, &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for div_v of " + "particle %lld.", + relDiff, threshold, a.id); + message("a = %e, b = %e", a.density.div_v, b.density.div_v); + result = 1; + } + for (int k = 0; k < 3; k++) { + if (compare_values(a.density.rot_v[k], b.density.rot_v[k], threshold, + &absDiff, &absSum, &relDiff)) { + message( + "Relative difference (%e) larger than tolerance (%e) for rot_v[%d] " + "of particle %lld.", + relDiff, threshold, k, a.id); + message("a = %e, b = %e", a.density.rot_v[k], b.density.rot_v[k]); + result = 1; + } + } + + return result; +} + +/** @brief Computes the forces between all g-particles using the N^2 algorithm * * Overwrites the accelerations of the gparts with the values. * Do not use for actual runs. diff --git a/src/tools.h b/src/tools.h index 43ddd946c3e8cdf53139bb917135dffd8a8acd12..ece3078dce7cc8ab4b15538a1e5d9a990d81b36d 100644 --- a/src/tools.h +++ b/src/tools.h @@ -47,4 +47,8 @@ void shuffle_particles(struct part *parts, const int count); void gravity_n2(struct gpart *gparts, const int gcount, const struct phys_const *constants, float rlr); +int compare_values(double a, double b, double threshold, double *absDiff, + double *absSum, double *relDiff); +int compare_particles(struct part a, struct part b, double threshold); + #endif /* SWIFT_TOOL_H */ diff --git a/src/vector.h b/src/vector.h index 53869fd2594227d3332d7435f47cdff7cded224b..b2b3049f6abdf2a8464cb516147b050a6f4fc3e1 100644 --- a/src/vector.h +++ b/src/vector.h @@ -46,19 +46,33 @@ #define VEC_FLOAT __m512 #define VEC_DBL __m512d #define VEC_INT __m512i +#define KNL_MASK_16 __mmask16 #define vec_load(a) _mm512_load_ps(a) +#define vec_store(a, addr) _mm512_store_ps(addr, a) +#define vec_setzero() _mm512_setzero_ps() +#define vec_setintzero() _mm512_setzero_epi32() #define vec_set1(a) _mm512_set1_ps(a) +#define vec_setint1(a) _mm512_set1_epi32(a) #define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a) #define vec_dbl_set(a, b, c, d, e, f, g, h) \ _mm512_set_pd(h, g, f, e, d, c, b, a) +#define vec_add(a, b) _mm512_add_ps(a, b) +#define vec_sub(a, b) _mm512_sub_ps(a, b) +#define vec_mul(a, b) _mm512_mul_ps(a, b) +#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c) #define vec_sqrt(a) _mm512_sqrt_ps(a) -#define vec_rcp(a) _mm512_rcp_ps(a) -#define vec_rsqrt(a) _mm512_rsqrt_ps(a) +#define vec_rcp(a) _mm512_rcp14_ps(a) +#define vec_rsqrt(a) _mm512_rsqrt14_ps(a) #define vec_ftoi(a) _mm512_cvttps_epi32(a) #define vec_fmin(a, b) _mm512_min_ps(a, b) #define vec_fmax(a, b) _mm512_max_ps(a, b) #define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a) +#define vec_floor(a) _mm512_floor_ps(a) +#define vec_cmp_gt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ) +#define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ) +#define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ) +#define vec_and(a, b) _mm512_and_ps(a, b) #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1) @@ -86,15 +100,28 @@ .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \ .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a \ } +#define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v) +#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \ + pack += __builtin_popcount(mask); +#define VEC_LEFT_PACK(a, mask, result) \ + _mm512_mask_compressstoreu_ps(result, mask, a) #elif defined(HAVE_AVX) #define VEC_SIZE 8 #define VEC_FLOAT __m256 #define VEC_DBL __m256d #define VEC_INT __m256i #define vec_load(a) _mm256_load_ps(a) +#define vec_store(a, addr) _mm256_store_ps(addr, a) +#define vec_unaligned_store(a, addr) _mm256_storeu_ps(addr, a) +#define vec_setzero() _mm256_setzero_ps() +#define vec_setintzero() _mm256_setzero_si256() #define vec_set1(a) _mm256_set1_ps(a) +#define vec_setint1(a) _mm256_set1_epi32(a) #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a) #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a) +#define vec_add(a, b) _mm256_add_ps(a, b) +#define vec_sub(a, b) _mm256_sub_ps(a, b) +#define vec_mul(a, b) _mm256_mul_ps(a, b) #define vec_sqrt(a) _mm256_sqrt_ps(a) #define vec_rcp(a) _mm256_rcp_ps(a) #define vec_rsqrt(a) _mm256_rsqrt_ps(a) @@ -102,6 +129,12 @@ #define vec_fmin(a, b) _mm256_min_ps(a, b) #define vec_fmax(a, b) _mm256_max_ps(a, b) #define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a) +#define vec_floor(a) _mm256_floor_ps(a) +#define vec_cmp_lt(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ) +#define vec_cmp_gt(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ) +#define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ) +#define vec_cmp_result(a) _mm256_movemask_ps(a) +#define vec_and(a, b) _mm256_and_ps(a, b) #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1) @@ -118,9 +151,63 @@ .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \ .f[6] = a, .f[7] = a \ } +#define VEC_HADD(a, b) \ + a.v = _mm256_hadd_ps(a.v, a.v); \ + a.v = _mm256_hadd_ps(a.v, a.v); \ + b += a.f[0] + a.f[4]; +#define VEC_GET_LOW(a) _mm256_castps256_ps128(a) +#define VEC_GET_HIGH(a) _mm256_extractf128_ps(a, 1) #ifdef HAVE_AVX2 +#define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c) +#define identity_indices 0x0706050403020100 #define VEC_HAVE_GATHER #define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1) +#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \ + { \ + unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101); \ + expanded_mask *= 0xFF; \ + unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask); \ + __m128i bytevec = _mm_cvtsi64_si128(wanted_indices); \ + v_mask = _mm256_cvtepu8_epi32(bytevec); \ + pack += __builtin_popcount(mask); \ + } +#define VEC_LEFT_PACK(a, mask, result) \ + vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask), result) +#endif +#ifndef vec_fma +#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c) +#endif +#ifndef VEC_FORM_PACKED_MASK +#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \ + { \ + for (int i = 0; i < VEC_SIZE; i++) \ + if ((mask & (1 << i))) v_mask.i[pack++] = i; \ + } +#define VEC_FORM_PACKED_MASK_2(mask, v_mask, pack, mask2, v_mask2, pack2) \ + { \ + for (int i = 0; i < VEC_SIZE; i++) { \ + if ((mask & (1 << i))) v_mask.i[pack++] = i; \ + if ((mask2 & (1 << i))) v_mask2.i[pack2++] = i; \ + } \ + } +#endif +#ifndef VEC_LEFT_PACK +#define VEC_LEFT_PACK(a, mask, result) \ + { \ + __m256 t1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1)); \ + __m256 t2 = _mm256_insertf128_ps(t1, _mm256_castps256_ps128(a), 1); \ + __m256 r0 = _mm256_permutevar_ps(a, mask); \ + __m256 r1 = _mm256_permutevar_ps(t2, mask); \ + __m128i k1 = _mm_slli_epi32( \ + (__m128i)(_mm_xor_si128((__m128i)VEC_GET_HIGH((__m256)mask), \ + (__m128i)_mm_set1_epi32(4))), \ + 29); \ + __m128i k0 = _mm_slli_epi32((__m128i)(VEC_GET_LOW((__m256)mask)), 29); \ + __m256 kk = \ + _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(k0)), \ + _mm_castsi128_ps(k1), 1); \ + *((__m256 *)(result)) = _mm256_blendv_ps(r0, r1, kk); \ + } #endif #elif defined(HAVE_SSE2) #define VEC_SIZE 4 @@ -128,6 +215,7 @@ #define VEC_DBL __m128d #define VEC_INT __m128i #define vec_load(a) _mm_load_ps(a) +#define vec_store(a, addr) _mm_store_ps(addr, a) #define vec_set1(a) _mm_set1_ps(a) #define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a) #define vec_dbl_set(a, b) _mm_set_pd(b, a) @@ -138,6 +226,10 @@ #define vec_fmin(a, b) _mm_min_ps(a, b) #define vec_fmax(a, b) _mm_max_ps(a, b) #define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a) +#define vec_floor(a) _mm_floor_ps(a) +#define vec_cmp_lt(a, b) _mm_cmplt_ps(a, b) +#define vec_cmp_lte(a, b) _mm_cmp_ps(a, b, _CMP_LE_OQ) +#define vec_cmp_result(a) _mm_movemask_ps(a) #define vec_todbl_lo(a) _mm_cvtps_pd(a) #define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a)) #define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)) @@ -165,6 +257,45 @@ typedef union { int i[VEC_SIZE]; } vector; +/** + * @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a + * Newton iteration to obtain the correct level of accuracy. + * + * @param x #vector to be inverted. + * @return x_inv #vector inverted x. + */ +__attribute__((always_inline)) INLINE vector vec_reciprocal(vector x) { + + vector x_inv; + + x_inv.v = vec_rcp(x.v); + x_inv.v = vec_sub(x_inv.v, + vec_mul(x_inv.v, (vec_fma(x.v, x_inv.v, vec_set1(-1.0f))))); + + return x_inv; +} + +/** + * @brief Calculates the inverse and square root (\f$1/\sqrt{x}\f$) of a vector + * using intrinsics and a Newton iteration to obtain the correct level of + * accuracy. + * + * @param x #vector to be inverted. + * @return x_inv #vector inverted x. + */ +__attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) { + + vector x_inv; + + x_inv.v = vec_rsqrt(x.v); + x_inv.v = vec_sub( + x_inv.v, + vec_mul(vec_mul(vec_set1(0.5f), x_inv.v), + (vec_fma(x.v, vec_mul(x_inv.v, x_inv.v), vec_set1(-1.0f))))); + + return x_inv; +} + #else /* Needed for cache alignment. */ #define VEC_SIZE 16 diff --git a/tests/test27cells.c b/tests/test27cells.c index ec9325322d9cb7531e3c1916e81323fa86f6bb60..f645d0e2097990f8e3e3de13d5b108d3cf921bc9 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -30,6 +30,23 @@ /* Local headers. */ #include "swift.h" +#define ACC_THRESHOLD 1e-5 + +#if defined(WITH_VECTORIZATION) && defined(DOSELF1_VEC) +#define DOSELF1 runner_doself1_density_vec +#define DOSELF1_NAME "runner_doself1_density_vec" +#endif + +#if defined(WITH_VECTORIZATION) && defined(DOSELF1_VEC_2) +#define DOSELF1 runner_doself1_density_vec_2 +#define DOSELF1_NAME "runner_doself1_density_vec_2" +#endif + +#ifndef DOSELF1 +#define DOSELF1 runner_doself1_density +#define DOSELF1_NAME "runner_doself1_density" +#endif + enum velocity_types { velocity_zero, velocity_random, @@ -255,15 +272,41 @@ void dump_particle_fields(char *fileName, struct cell *main_cell, fclose(file); } +/** + * @brief Compares the vectorised result against + * the serial result of the interaction. + * + * @param serial_parts Particle array that has been interacted serially + * @param vec_parts Particle array to be interacted using vectors + * @param count No. of particles that have been interacted + * @param threshold Level of accuracy needed + * + * @return Non-zero value if difference found, 0 otherwise + */ +int check_results(struct part *serial_parts, struct part *vec_parts, int count, + double threshold) { + int result = 0; + + for (int i = 0; i < count; i++) + result += compare_particles(serial_parts[i], vec_parts[i], threshold); + + return result; +} + /* Just a forward declaration... */ void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); void runner_doself1_density(struct runner *r, struct cell *ci); +void runner_doself1_density_vec(struct runner *r, struct cell *ci); +void runner_doself1_density_vec_2(struct runner *r, struct cell *ci); /* And go... */ int main(int argc, char *argv[]) { + + engine_pin(); size_t runs = 0, particles = 0; double h = 1.23485, size = 1., rho = 1.; double perturbation = 0.; + double threshold = ACC_THRESHOLD; char outputFileNameExtension[200] = ""; char outputFileName[200] = ""; enum velocity_types vel = velocity_zero; @@ -279,7 +322,7 @@ int main(int argc, char *argv[]) { srand(0); char c; - while ((c = getopt(argc, argv, "m:s:h:n:r:t:d:f:v:")) != -1) { + while ((c = getopt(argc, argv, "m:s:h:n:r:t:d:f:v:a:")) != -1) { switch (c) { case 'h': sscanf(optarg, "%lf", &h); @@ -305,6 +348,9 @@ int main(int argc, char *argv[]) { case 'v': sscanf(optarg, "%d", (int *)&vel); break; + case 'a': + sscanf(optarg, "%lf", &threshold); + break; case '?': error("Unknown option."); break; @@ -330,6 +376,8 @@ int main(int argc, char *argv[]) { } /* Help users... */ + message("Function called: %s", DOSELF1_NAME); + message("Vector size: %d", VEC_SIZE); message("Adiabatic index: ga = %f", hydro_gamma); message("Hydro implementation: %s", SPH_IMPLEMENTATION); message("Smoothing length: h = %f", h * size); @@ -372,6 +420,9 @@ int main(int argc, char *argv[]) { /* Store the main cell for future use */ main_cell = cells[13]; + ticks timings[27]; + for (int i = 0; i < 27; i++) timings[i] = 0; + ticks time = 0; for (size_t i = 0; i < runs; ++i) { /* Zero the fields */ @@ -382,12 +433,30 @@ int main(int argc, char *argv[]) { #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION)) /* Run all the pairs */ - for (int j = 0; j < 27; ++j) - if (cells[j] != main_cell) + for (int j = 0; j < 27; ++j) { + if (cells[j] != main_cell) { + const ticks sub_tic = getticks(); + runner_dopair1_density(&runner, main_cell, cells[j]); - /* And now the self-interaction */ - runner_doself1_density(&runner, main_cell); + const ticks sub_toc = getticks(); + timings[j] += sub_toc - sub_tic; + } + } + +/* And now the self-interaction */ +#ifdef WITH_VECTORIZATION + runner.par_cache.count = 0; + cache_init(&runner.par_cache, 512); +#endif + + const ticks self_tic = getticks(); + + DOSELF1(&runner, main_cell); + + const ticks self_toc = getticks(); + + timings[13] += self_toc - self_tic; #endif @@ -405,8 +474,26 @@ int main(int argc, char *argv[]) { } } + /* Store the vectorised particle results. */ + struct part vec_parts[main_cell->count]; + for (int i = 0; i < main_cell->count; i++) vec_parts[i] = main_cell->parts[i]; + /* Output timing */ - message("SWIFT calculation took : %15lli ticks.", time / runs); + ticks corner_time = timings[0] + timings[2] + timings[6] + timings[8] + + timings[18] + timings[20] + timings[24] + timings[26]; + + ticks edge_time = timings[1] + timings[3] + timings[5] + timings[7] + + timings[9] + timings[11] + timings[15] + timings[17] + + timings[19] + timings[21] + timings[23] + timings[25]; + + ticks face_time = timings[4] + timings[10] + timings[12] + timings[14] + + timings[16] + timings[22]; + + message("Corner calculations took : %15lli ticks.", corner_time / runs); + message("Edge calculations took : %15lli ticks.", edge_time / runs); + message("Face calculations took : %15lli ticks.", face_time / runs); + message("Self calculations took : %15lli ticks.", timings[13] / runs); + message("SWIFT calculation took : %15lli ticks.", time / runs); /* Now perform a brute-force version for accuracy tests */ @@ -435,6 +522,10 @@ int main(int argc, char *argv[]) { sprintf(outputFileName, "brute_force_27_%s.dat", outputFileNameExtension); dump_particle_fields(outputFileName, main_cell, cells); + /* Check serial results against the vectorised results. */ + if (check_results(main_cell->parts, vec_parts, main_cell->count, threshold)) + message("Differences found..."); + /* Output timing */ message("Brute force calculation took : %15lli ticks.", toc - tic); diff --git a/tests/test27cells.sh.in b/tests/test27cells.sh.in index bf9cfeaf9a70790a321fa7ec4c63983d8cfd866c..07b6b92a82cee2bbe9c593f8f62e750d4406f84e 100755 --- a/tests/test27cells.sh.in +++ b/tests/test27cells.sh.in @@ -6,7 +6,7 @@ do rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat - ./test27cells -n 6 -r 1 -d 0 -f standard -v $v + ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -a 1e-4 if [ -e brute_force_27_standard.dat ] then diff --git a/tests/test27cellsPerturbed.sh.in b/tests/test27cellsPerturbed.sh.in index 3cdaf79ab17e705ec69a0b646949cc5a71109796..30498594b659101216b51dfea2346fa9230dbc97 100755 --- a/tests/test27cellsPerturbed.sh.in +++ b/tests/test27cellsPerturbed.sh.in @@ -6,7 +6,7 @@ do rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat - ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v + ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -a 5e-4 if [ -e brute_force_27_perturbed.dat ] then diff --git a/tests/testInteractions.c b/tests/testInteractions.c index 4faebbbdb35c4bc36cca16cb207517f05629f8c1..4ce7fe40554d24551750629fa47c0bee7acdb6da 100644 --- a/tests/testInteractions.c +++ b/tests/testInteractions.c @@ -30,6 +30,9 @@ int main() { return 0; } #include <unistd.h> #include "swift.h" +#define array_align sizeof(float) * VEC_SIZE +#define ACC_THRESHOLD 1e-5 + /* Typdef function pointers for serial and vectorised versions of the * interaction functions. */ typedef void (*serial_interaction)(float, float *, float, float, struct part *, @@ -48,8 +51,8 @@ typedef void (*vec_interaction)(float *, float *, float *, float *, *separation. * @param partId The running counter of IDs. */ -struct part *make_particles(int count, double *offset, double spacing, double h, - long long *partId) { +struct part *make_particles(size_t count, double *offset, double spacing, + double h, long long *partId) { struct part *particles; if (posix_memalign((void **)&particles, part_align, @@ -60,11 +63,28 @@ struct part *make_particles(int count, double *offset, double spacing, double h, /* Construct the particles */ struct part *p; - for (size_t i = 0; i < VEC_SIZE + 1; ++i) { + + /* Set test particle at centre of unit sphere. */ + p = &particles[0]; + + /* Place the test particle at the centre of a unit sphere. */ + p->x[0] = 0.0f; + p->x[1] = 0.0f; + p->x[2] = 0.0f; + + p->h = h; + p->id = ++(*partId); + p->mass = 1.0f; + + /* Place rest of particles around the test particle + * with random position within a unit sphere. */ + for (size_t i = 1; i < count; ++i) { p = &particles[i]; - p->x[0] = offset[0] + spacing * i; - p->x[1] = offset[1] + spacing * i; - p->x[2] = offset[2] + spacing * i; + + /* Randomise positions within a unit sphere. */ + p->x[0] = random_uniform(-1.0, 1.0); + p->x[1] = random_uniform(-1.0, 1.0); + p->x[2] = random_uniform(-1.0, 1.0); /* Randomise velocities. */ p->v[0] = random_uniform(-0.05, 0.05); @@ -81,20 +101,17 @@ struct part *make_particles(int count, double *offset, double spacing, double h, /** * @brief Populates particle properties needed for the force calculation. */ -void prepare_force(struct part *parts) { +void prepare_force(struct part *parts, size_t count) { struct part *p; - for (size_t i = 0; i < VEC_SIZE + 1; ++i) { + for (size_t i = 0; i < count; ++i) { p = &parts[i]; p->rho = i + 1; -#if defined(GADGET2_SPH) - p->force.balsara = i + 1; - p->force.P_over_rho2 = i + 1; -#elif defined(DEFAULT_SPH) - p->force.balsara = i + 1; + p->force.balsara = random_uniform(0.0, 1.0); p->force.P_over_rho2 = i + 1; -#else -#endif + p->force.soundspeed = random_uniform(2.0, 3.0); + p->force.v_sig = 0.0f; + p->force.h_dt = 0.0f; } } @@ -106,7 +123,7 @@ void dump_indv_particle_fields(char *fileName, struct part *p) { FILE *file = fopen(fileName, "a"); fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %10f %10f %10f %13e %13e %13e " + "%6llu %10f %10f %10f %10f %10f %10f %10e %10e %10e %13e %13e %13e " "%13e %13e %13e %13e " "%13e %13e %13e %10f\n", p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], @@ -120,7 +137,8 @@ void dump_indv_particle_fields(char *fileName, struct part *p) { p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], p->density.rot_v[2], 0. #else - 0., 0., 0., 0., 0. + p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], + p->density.rot_v[2] #endif ); fclose(file); @@ -140,27 +158,52 @@ void write_header(char *fileName) { "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "a_x", "a_y", "a_z", "rho", "rho_dh", "wcount", "wcount_dh", "dh/dt", "v_sig", "div_v", "curl_vx", "curl_vy", "curl_vz", "dS/dt"); - fprintf(file, "\nPARTICLES BEFORE INTERACTION:\n"); + fprintf(file, "\n# PARTICLES BEFORE INTERACTION:\n"); fclose(file); } /** - * @brief Calls the serial and vectorised version of the non-symmetrical density - * interaction. + * @brief Compares the vectorised result against + * the serial result of the interaction. + * + * @param serial_test_part Particle that has been updated serially + * @param serial_parts Particle array that has been interacted serially + * @param vec_test_part Particle that has been updated using vectors + * @param vec_parts Particle array to be interacted using vectors + * @param count No. of particles that have been interacted + * + * @return Non-zero value if difference found, 0 otherwise + */ +int check_results(struct part serial_test_part, struct part *serial_parts, + struct part vec_test_part, struct part *vec_parts, + int count) { + int result = 0; + result += compare_particles(serial_test_part, vec_test_part, ACC_THRESHOLD); + + for (int i = 0; i < count; i++) + result += compare_particles(serial_parts[i], vec_parts[i], ACC_THRESHOLD); + + return result; +} + +/* + * @brief Calls the serial and vectorised version of an interaction + * function given by the function pointers. * + * @param test_part Particle that will be updated * @param parts Particle array to be interacted * @param count No. of particles to be interacted - * @param serial_inter_func Function pointer to serial interaction function - * @param vec_inter_func Function pointer to vectorised interaction function - * @param filePrefix Prefix of output files + * @param serial_inter_func Serial interaction function to be called + * @param vec_inter_func Vectorised interaction function to be called + * @param runs No. of times to call interactions * */ -void test_interactions(struct part *parts, int count, +void test_interactions(struct part test_part, struct part *parts, size_t count, serial_interaction serial_inter_func, - vec_interaction vec_inter_func, char *filePrefix) { + vec_interaction vec_inter_func, char *filePrefix, + size_t runs) { - /* Use the first particle in the array as the one that gets updated. */ - struct part pi = parts[0]; + ticks serial_time = 0, vec_time = 0; FILE *file; char serial_filename[200] = ""; @@ -174,98 +217,148 @@ void test_interactions(struct part *parts, int count, write_header(serial_filename); write_header(vec_filename); - /* Dump state of particles before serial interaction. */ - dump_indv_particle_fields(serial_filename, &pi); - for (int i = 1; i < count; i++) - dump_indv_particle_fields(serial_filename, &parts[i]); - - /* Make copy of pi to be used in vectorised version. */ - struct part pi_vec = pi; - struct part pj_vec[VEC_SIZE]; - for (int i = 0; i < VEC_SIZE; i++) pj_vec[i] = parts[i + 1]; - - float r2q[VEC_SIZE] __attribute__((aligned(sizeof(float) * VEC_SIZE))); - float hiq[VEC_SIZE] __attribute__((aligned(sizeof(float) * VEC_SIZE))); - float hjq[VEC_SIZE] __attribute__((aligned(sizeof(float) * VEC_SIZE))); - float dxq[3 * VEC_SIZE] __attribute__((aligned(sizeof(float) * VEC_SIZE))); - struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; - - /* Perform serial interaction */ - for (int i = 1; i < count; i++) { - /* Compute the pairwise distance. */ - float r2 = 0.0f; - float dx[3]; - for (int k = 0; k < 3; k++) { - dx[k] = pi.x[k] - parts[i].x[k]; - r2 += dx[k] * dx[k]; + /* Test particle at the center of a unit sphere. */ + struct part pi_serial, pi_vec; + + /* Remaining particles in the sphere that will interact with test particle. */ + struct part pj_serial[count], pj_vec[count]; + + /* Stores the separation, smoothing length and pointers to particles + * needed for the vectorised interaction. */ + float r2q[count] __attribute__((aligned(array_align))); + float hiq[count] __attribute__((aligned(array_align))); + float hjq[count] __attribute__((aligned(array_align))); + float dxq[3 * count] __attribute__((aligned(array_align))); + struct part *piq[count], *pjq[count]; + + /* Call serial interaction a set number of times. */ + for (size_t k = 0; k < runs; k++) { + /* Reset particle to initial setup */ + pi_serial = test_part; + for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i]; + + /* Only dump data on first run. */ + if (k == 0) { + /* Dump state of particles before serial interaction. */ + dump_indv_particle_fields(serial_filename, &pi_serial); + for (size_t i = 0; i < count; i++) + dump_indv_particle_fields(serial_filename, &pj_serial[i]); } - serial_inter_func(r2, dx, pi.h, parts[i].h, &pi, &parts[i]); + /* Perform serial interaction */ + for (size_t i = 0; i < count; i++) { + /* Compute the pairwise distance. */ + float r2 = 0.0f; + float dx[3]; + for (size_t k = 0; k < 3; k++) { + dx[k] = pi_serial.x[k] - pj_serial[i].x[k]; + r2 += dx[k] * dx[k]; + } + + const ticks tic = getticks(); + + serial_inter_func(r2, dx, pi_serial.h, pj_serial[i].h, &pi_serial, + &pj_serial[i]); + + serial_time += getticks() - tic; + } } file = fopen(serial_filename, "a"); - fprintf(file, "\nPARTICLES AFTER INTERACTION:\n"); + fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); fclose(file); /* Dump result of serial interaction. */ - dump_indv_particle_fields(serial_filename, &pi); - for (int i = 1; i < count; i++) - dump_indv_particle_fields(serial_filename, &parts[i]); - - /* Setup arrays for vector interaction. */ - for (int i = 0; i < VEC_SIZE; i++) { - /* Compute the pairwise distance. */ - float r2 = 0.0f; - float dx[3]; - for (int k = 0; k < 3; k++) { - dx[k] = pi_vec.x[k] - pj_vec[i].x[k]; - r2 += dx[k] * dx[k]; + dump_indv_particle_fields(serial_filename, &pi_serial); + for (size_t i = 0; i < count; i++) + dump_indv_particle_fields(serial_filename, &pj_serial[i]); + + /* Call vector interaction a set number of times. */ + for (size_t k = 0; k < runs; k++) { + /* Reset particle to initial setup */ + pi_vec = test_part; + for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i]; + + /* Setup arrays for vector interaction. */ + for (size_t i = 0; i < count; i++) { + /* Compute the pairwise distance. */ + float r2 = 0.0f; + float dx[3]; + for (size_t k = 0; k < 3; k++) { + dx[k] = pi_vec.x[k] - pj_vec[i].x[k]; + r2 += dx[k] * dx[k]; + } + + r2q[i] = r2; + dxq[3 * i + 0] = dx[0]; + dxq[3 * i + 1] = dx[1]; + dxq[3 * i + 2] = dx[2]; + hiq[i] = pi_vec.h; + hjq[i] = pj_vec[i].h; + piq[i] = &pi_vec; + pjq[i] = &pj_vec[i]; } - r2q[i] = r2; - dxq[3 * i + 0] = dx[0]; - dxq[3 * i + 1] = dx[1]; - dxq[3 * i + 2] = dx[2]; - hiq[i] = pi_vec.h; - hjq[i] = pj_vec[i].h; - piq[i] = &pi_vec; - pjq[i] = &pj_vec[i]; - } - /* Dump state of particles before vector interaction. */ - dump_indv_particle_fields(vec_filename, piq[0]); - for (size_t i = 0; i < VEC_SIZE; i++) - dump_indv_particle_fields(vec_filename, pjq[i]); + /* Only dump data on first run. */ + if (k == 0) { + /* Dump state of particles before vector interaction. */ + dump_indv_particle_fields(vec_filename, piq[0]); + for (size_t i = 0; i < count; i++) + dump_indv_particle_fields(vec_filename, pjq[i]); + } - /* Perform vector interaction. */ - vec_inter_func(r2q, dxq, hiq, hjq, piq, pjq); + const ticks vec_tic = getticks(); + + /* Perform vector interaction. */ + for (size_t i = 0; i < count; i += VEC_SIZE) { + vec_inter_func(&(r2q[i]), &(dxq[3 * i]), &(hiq[i]), &(hjq[i]), &(piq[i]), + &(pjq[i])); + } + + vec_time += getticks() - vec_tic; + } file = fopen(vec_filename, "a"); - fprintf(file, "\nPARTICLES AFTER INTERACTION:\n"); + fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); fclose(file); - /* Dump result of serial interaction. */ + /* Dump result of vector interaction. */ dump_indv_particle_fields(vec_filename, piq[0]); - for (size_t i = 0; i < VEC_SIZE; i++) + for (size_t i = 0; i < count; i++) dump_indv_particle_fields(vec_filename, pjq[i]); + + /* Check serial results against the vectorised results. */ + if (check_results(pi_serial, pj_serial, pi_vec, pj_vec, count)) + message("Differences found..."); + + message("The serial interactions took : %15lli ticks.", + serial_time / runs); + message("The vectorised interactions took : %15lli ticks.", vec_time / runs); } /* And go... */ int main(int argc, char *argv[]) { - double h = 1.2348, spacing = 0.5; + size_t runs = 10000; + double h = 1.0, spacing = 0.5; double offset[3] = {0.0, 0.0, 0.0}; - int count = VEC_SIZE + 1; + size_t count = 256; /* Get some randomness going */ srand(0); char c; - while ((c = getopt(argc, argv, "s:h:")) != -1) { + while ((c = getopt(argc, argv, "h:s:n:r:")) != -1) { switch (c) { case 'h': sscanf(optarg, "%lf", &h); break; case 's': sscanf(optarg, "%lf", &spacing); + case 'n': + sscanf(optarg, "%zu", &count); + break; + case 'r': + sscanf(optarg, "%zu", &runs); break; case '?': error("Unknown option."); @@ -281,26 +374,35 @@ int main(int argc, char *argv[]) { "runner_iact_vec_density." "\n\nOptions:" "\n-h DISTANCE=1.2348 - Smoothing length in units of <x>" - "\n-s spacing - Spacing between particles", + "\n-s SPACING=0.5 - Spacing between particles" + "\n-n NUMBER=9 - No. of particles", argv[0]); exit(1); } + /* Correct count so that VEC_SIZE of particles interact with the test + * particle. */ + count = count - (count % VEC_SIZE) + 1; + /* Build the infrastructure */ static long long partId = 0; + struct part density_test_particle, force_test_particle; struct part *density_particles = make_particles(count, offset, spacing, h, &partId); struct part *force_particles = make_particles(count, offset, spacing, h, &partId); - prepare_force(force_particles); + prepare_force(force_particles, count); /* Define which interactions to call */ serial_interaction serial_inter_func = &runner_iact_nonsym_density; vec_interaction vec_inter_func = &runner_iact_nonsym_vec_density; + density_test_particle = density_particles[0]; /* Call the non-sym density test. */ - test_interactions(density_particles, count, serial_inter_func, vec_inter_func, - "test_nonsym_density"); + message("Testing non-symmetrical density interaction..."); + test_interactions(density_test_particle, &density_particles[1], count - 1, + serial_inter_func, vec_inter_func, "test_nonsym_density", + runs); density_particles = make_particles(count, offset, spacing, h, &partId); @@ -308,28 +410,36 @@ int main(int argc, char *argv[]) { serial_inter_func = &runner_iact_density; vec_inter_func = &runner_iact_vec_density; + density_test_particle = density_particles[0]; /* Call the symmetrical density test. */ - test_interactions(density_particles, count, serial_inter_func, vec_inter_func, - "test_sym_density"); + message("Testing symmetrical density interaction..."); + test_interactions(density_test_particle, &density_particles[1], count - 1, + serial_inter_func, vec_inter_func, "test_sym_density", + runs); /* Re-assign function pointers. */ serial_inter_func = &runner_iact_nonsym_force; vec_inter_func = &runner_iact_nonsym_vec_force; + force_test_particle = force_particles[0]; /* Call the test non-sym force test. */ - test_interactions(force_particles, count, serial_inter_func, vec_inter_func, - "test_nonsym_force"); + message("Testing non-symmetrical force interaction..."); + test_interactions(force_test_particle, &force_particles[1], count - 1, + serial_inter_func, vec_inter_func, "test_nonsym_force", + runs); force_particles = make_particles(count, offset, spacing, h, &partId); - prepare_force(force_particles); + prepare_force(force_particles, count); /* Re-assign function pointers. */ serial_inter_func = &runner_iact_force; vec_inter_func = &runner_iact_vec_force; + force_test_particle = force_particles[0]; /* Call the test symmetrical force test. */ - test_interactions(force_particles, count, serial_inter_func, vec_inter_func, - "test_sym_force"); + message("Testing symmetrical force interaction..."); + test_interactions(force_test_particle, &force_particles[1], count - 1, + serial_inter_func, vec_inter_func, "test_sym_force", runs); return 0; } diff --git a/tests/tolerance_27_normal.dat b/tests/tolerance_27_normal.dat index 71acaa89be231d02fc33e47c96a7bacf623bbf48..e2053261ed6675e4f0ce92866db896819eaff0c2 100644 --- a/tests/tolerance_27_normal.dat +++ b/tests/tolerance_27_normal.dat @@ -1,3 +1,3 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-6 2e-5 2e-3 2e-6 2e-6 2e-6 2e-6 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-6 1e-5 1e-4 2e-5 2e-5 2e-5 2e-5 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-5 2e-4 2e-3 2e-6 2e-6 2e-6 2e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 6e-5 1e-4 1e-4 2e-5 2e-5 2e-5 2e-5 diff --git a/tests/tolerance_27_perturbed.dat b/tests/tolerance_27_perturbed.dat index 45293cbaa223b5887f3b0ce05cd9430d0db7440b..226a3d1423685656b37fa8acfe1741c859099352 100644 --- a/tests/tolerance_27_perturbed.dat +++ b/tests/tolerance_27_perturbed.dat @@ -1,3 +1,3 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-6 1e-5 2.1e-5 2e-3 2.1e-6 2e-6 2e-6 2e-6 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 3e-3 1e-5 1e-4 2e-5 4e-4 4e-4 4e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-6 1e-4 4e-5 2e-3 3.1e-6 2e-6 2e-6 2e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 3e-3 1e-5 1e-4 2e-5 2e-3 2e-3 2e-3