diff --git a/src/cache.h b/src/cache.h index 19d61b657b3aa1fe8675ee413fcde146071381e9..52534ecdc81af9707bb98cec913972967a8a1ce2 100644 --- a/src/cache.h +++ b/src/cache.h @@ -26,9 +26,11 @@ #include "cell.h" #include "error.h" #include "part.h" +#include "sort_part.h" #include "vector.h" #define NUM_VEC_PROC 2 +#define CACHE_ALIGN 64 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE) #define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE @@ -37,28 +39,31 @@ struct cache { /* Particle x position. */ - float *restrict x __attribute__((aligned(sizeof(float) * VEC_SIZE))); + float *restrict x __attribute__((aligned(CACHE_ALIGN))); /* Particle y position. */ - float *restrict y __attribute__((aligned(sizeof(float) * VEC_SIZE))); + float *restrict y __attribute__((aligned(CACHE_ALIGN))); /* Particle z position. */ - float *restrict z __attribute__((aligned(sizeof(float) * VEC_SIZE))); + float *restrict z __attribute__((aligned(CACHE_ALIGN))); /* Particle smoothing length. */ - float *restrict h __attribute__((aligned(sizeof(float) * VEC_SIZE))); + float *restrict h __attribute__((aligned(CACHE_ALIGN))); /* Particle mass. */ - float *restrict m __attribute__((aligned(sizeof(float) * VEC_SIZE))); + float *restrict m __attribute__((aligned(CACHE_ALIGN))); /* Particle x velocity. */ - float *restrict vx __attribute__((aligned(sizeof(float) * VEC_SIZE))); + float *restrict vx __attribute__((aligned(CACHE_ALIGN))); /* Particle y velocity. */ - float *restrict vy __attribute__((aligned(sizeof(float) * VEC_SIZE))); + float *restrict vy __attribute__((aligned(CACHE_ALIGN))); /* Particle z velocity. */ - float *restrict vz __attribute__((aligned(sizeof(float) * VEC_SIZE))); + float *restrict vz __attribute__((aligned(CACHE_ALIGN))); + + /* Maximum distance of particles into neighbouring cell. */ + float *restrict max_d __attribute__((aligned(CACHE_ALIGN))); /* Cache size. */ int count; @@ -102,10 +107,12 @@ struct c2_cache { __attribute__((always_inline)) INLINE void cache_init(struct cache *c, size_t count) { - /* Align cache on correct byte boundary and pad cache size to include 2 vector - * lengths for remainder operations. */ - unsigned long alignment = sizeof(float) * VEC_SIZE; - unsigned int sizeBytes = (count + (2 * VEC_SIZE)) * sizeof(float); + /* Align cache on correct byte boundary and pad cache size to be a multiple of + * the vector size + * and include 2 vector lengths for remainder operations. */ + unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE; + if (rem > 0) pad += VEC_SIZE - rem; + unsigned int sizeBytes = (count + pad) * sizeof(float); int error = 0; /* Free memory if cache has already been allocated. */ @@ -118,16 +125,18 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, free(c->vy); free(c->vz); free(c->h); + free(c->max_d); } - error += posix_memalign((void **)&c->x, alignment, sizeBytes); - error += posix_memalign((void **)&c->y, alignment, sizeBytes); - error += posix_memalign((void **)&c->z, alignment, sizeBytes); - error += posix_memalign((void **)&c->m, alignment, sizeBytes); - error += posix_memalign((void **)&c->vx, alignment, sizeBytes); - error += posix_memalign((void **)&c->vy, alignment, sizeBytes); - error += posix_memalign((void **)&c->vz, alignment, sizeBytes); - error += posix_memalign((void **)&c->h, alignment, sizeBytes); + error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes); + error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes); + error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes); + error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes); + error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes); + error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes); + error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes); + error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes); + error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes); if (error != 0) error("Couldn't allocate cache, no. of particles: %d", (int)count); @@ -145,8 +154,11 @@ __attribute__((always_inline)) INLINE void cache_read_particles( #if defined(GADGET2_SPH) - /* Shift the particles positions to a local frame so single precision can be - * used instead of double precision. */ +/* Shift the particles positions to a local frame so single precision can be + * used instead of double precision. */ +#if defined(WITH_VECTORIZATION) && defined(__ICC) +#pragma vector aligned +#endif for (int i = 0; i < ci->count; i++) { ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0]; ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1]; @@ -163,7 +175,247 @@ __attribute__((always_inline)) INLINE void cache_read_particles( } /** - * @brief Clean the memory allocated by a #cache object. + * @brief Populate cache by reading in the particles from two cells in unsorted + * order. + * + * @param ci The i #cell. + * @param cj The j #cell. + * @param ci_cache The cache for cell ci. + * @param cj_cache The cache for cell cj. + * @param shift The amount to shift the particle positions to account for BCs + */ +__attribute__((always_inline)) INLINE void cache_read_two_cells( + const struct cell *const ci, const struct cell *const cj, + struct cache *const ci_cache, struct cache *const cj_cache, + const double *const shift) { + + /* Shift the particles positions to a local frame (ci frame) so single + * precision can be + * used instead of double precision. Also shift the cell ci, particles + * positions due to BCs but leave cell cj. */ + for (int i = 0; i < ci->count; i++) { + ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0] - shift[0]; + ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1] - shift[1]; + ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2] - shift[2]; + ci_cache->h[i] = ci->parts[i].h; + + ci_cache->m[i] = ci->parts[i].mass; + ci_cache->vx[i] = ci->parts[i].v[0]; + ci_cache->vy[i] = ci->parts[i].v[1]; + ci_cache->vz[i] = ci->parts[i].v[2]; + } + + for (int i = 0; i < cj->count; i++) { + cj_cache->x[i] = cj->parts[i].x[0] - ci->loc[0]; + cj_cache->y[i] = cj->parts[i].x[1] - ci->loc[1]; + cj_cache->z[i] = cj->parts[i].x[2] - ci->loc[2]; + cj_cache->h[i] = cj->parts[i].h; + + cj_cache->m[i] = cj->parts[i].mass; + cj_cache->vx[i] = cj->parts[i].v[0]; + cj_cache->vy[i] = cj->parts[i].v[1]; + cj_cache->vz[i] = cj->parts[i].v[2]; + } +} + +__attribute__((always_inline)) INLINE void cache_read_cell_sorted( + const struct cell *const ci, struct cache *const ci_cache, + const struct entry *restrict sort_i, double *const loc, + double *const shift) { + + int idx; +/* Shift the particles positions to a local frame (ci frame) so single precision + * can be + * used instead of double precision. Also shift the cell ci, particles positions + * due to BCs but leave cell cj. */ +#if defined(WITH_VECTORIZATION) && defined(__ICC) +#pragma simd +#endif + for (int i = 0; i < ci->count; i++) { + idx = sort_i[i].i; + + ci_cache->x[i] = ci->parts[idx].x[0] - loc[0] - shift[0]; + ci_cache->y[i] = ci->parts[idx].x[1] - loc[1] - shift[1]; + ci_cache->z[i] = ci->parts[idx].x[2] - loc[2] - shift[2]; + ci_cache->h[i] = ci->parts[idx].h; + + ci_cache->m[i] = ci->parts[idx].mass; + ci_cache->vx[i] = ci->parts[idx].v[0]; + ci_cache->vy[i] = ci->parts[idx].v[1]; + ci_cache->vz[i] = ci->parts[idx].v[2]; + } +} + +/** + * @brief Populate cache by reading in the particles from two cells in sorted + * order. + * + * @param ci The i #cell. + * @param cj The j #cell. + * @param ci_cache The #cache for cell ci. + * @param cj_cache The #cache for cell cj. + * @param sort_i The array of sorted particle indices for cell ci. + * @param sort_j The array of sorted particle indices for cell ci. + * @param shift The amount to shift the particle positions to account for BCs + */ +__attribute__((always_inline)) INLINE void cache_read_two_cells_sorted( + const struct cell *const ci, const struct cell *const cj, + struct cache *const ci_cache, struct cache *const cj_cache, + const struct entry *restrict sort_i, const struct entry *restrict sort_j, + const double *const shift) { + + int idx; +/* Shift the particles positions to a local frame (ci frame) so single precision + * can be + * used instead of double precision. Also shift the cell ci, particles positions + * due to BCs but leave cell cj. */ +#if defined(WITH_VECTORIZATION) && defined(__ICC) +#pragma simd +#endif + for (int i = 0; i < ci->count; i++) { + idx = sort_i[i].i; + ci_cache->x[i] = ci->parts[idx].x[0] - ci->loc[0] - shift[0]; + ci_cache->y[i] = ci->parts[idx].x[1] - ci->loc[1] - shift[1]; + ci_cache->z[i] = ci->parts[idx].x[2] - ci->loc[2] - shift[2]; + ci_cache->h[i] = ci->parts[idx].h; + + ci_cache->m[i] = ci->parts[idx].mass; + ci_cache->vx[i] = ci->parts[idx].v[0]; + ci_cache->vy[i] = ci->parts[idx].v[1]; + ci_cache->vz[i] = ci->parts[idx].v[2]; + } + +#if defined(WITH_VECTORIZATION) && defined(__ICC) +#pragma simd +#endif + for (int i = 0; i < cj->count; i++) { + idx = sort_j[i].i; + cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0]; + cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1]; + cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2]; + cj_cache->h[i] = cj->parts[idx].h; + + cj_cache->m[i] = cj->parts[idx].mass; + cj_cache->vx[i] = cj->parts[idx].v[0]; + cj_cache->vy[i] = cj->parts[idx].v[1]; + cj_cache->vz[i] = cj->parts[idx].v[2]; + } +} + +/** + * @brief Populate caches by only reading particles that are within range of + * each other within the adjoining cell.Also read the particles into the cache + * in sorted order. + * + * @param ci The i #cell. + * @param cj The j #cell. + * @param ci_cache The #cache for cell ci. + * @param cj_cache The #cache for cell cj. + * @param sort_i The array of sorted particle indices for cell ci. + * @param sort_j The array of sorted particle indices for cell ci. + * @param shift The amount to shift the particle positions to account for BCs + * @param first_pi The first particle in cell ci that is in range. + * @param last_pj The last particle in cell cj that is in range. + * @param num_vec_proc Number of vectors that will be used to process the + * interaction. + */ +__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( + const struct cell *const ci, const struct cell *const cj, + struct cache *const ci_cache, struct cache *const cj_cache, + const struct entry *restrict sort_i, const struct entry *restrict sort_j, + const double *const shift, int *first_pi, int *last_pj, + const int num_vec_proc) { + + int idx, ci_cache_idx; + /* Pad number of particles read to the vector size. */ + int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; + + if (*first_pi - pad >= 0) *first_pi -= pad; + } + + rem = *last_pj % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; + + if (*last_pj + pad < cj->count) *last_pj += pad; + } + + int first_pi_align = *first_pi; + int last_pj_align = *last_pj; + +/* Shift the particles positions to a local frame (ci frame) so single precision + * can be + * used instead of double precision. Also shift the cell ci, particles positions + * due to BCs but leave cell cj. */ +#if defined(WITH_VECTORIZATION) && defined(__ICC) +#pragma vector aligned +#endif + for (int i = first_pi_align; i < ci->count; i++) { + /* Make sure ci_cache is filled from the first element. */ + ci_cache_idx = i - first_pi_align; + idx = sort_i[i].i; + ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0]; + ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1]; + ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2]; + ci_cache->h[ci_cache_idx] = ci->parts[idx].h; + + ci_cache->m[ci_cache_idx] = ci->parts[idx].mass; + ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0]; + ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1]; + ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2]; + } + + /* Pad cache with fake particles that exist outside the cell so will not + * interact.*/ + float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0]; + for (int i = ci->count - first_pi_align; + i < ci->count - first_pi_align + VEC_SIZE; i++) { + ci_cache->x[i] = fake_pix; + ci_cache->y[i] = 1.f; + ci_cache->z[i] = 1.f; + ci_cache->h[i] = 1.f; + + ci_cache->m[i] = 1.f; + ci_cache->vx[i] = 1.f; + ci_cache->vy[i] = 1.f; + ci_cache->vz[i] = 1.f; + } + +#if defined(WITH_VECTORIZATION) && defined(__ICC) +#pragma vector aligned +#endif + for (int i = 0; i <= last_pj_align; i++) { + idx = sort_j[i].i; + cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0]; + cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1]; + cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2]; + cj_cache->h[i] = cj->parts[idx].h; + + cj_cache->m[i] = cj->parts[idx].mass; + cj_cache->vx[i] = cj->parts[idx].v[0]; + cj_cache->vy[i] = cj->parts[idx].v[1]; + cj_cache->vz[i] = cj->parts[idx].v[2]; + } + + /* Pad cache with fake particles that exist outside the cell so will not + * interact.*/ + float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0]; + for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) { + cj_cache->x[i] = fake_pjx; + cj_cache->y[i] = 1.f; + cj_cache->z[i] = 1.f; + cj_cache->h[i] = 1.f; + + cj_cache->m[i] = 1.f; + cj_cache->vx[i] = 1.f; + cj_cache->vy[i] = 1.f; + cj_cache->vz[i] = 1.f; + } +} + +/* @brief Clean the memory allocated by a #cache object. * * @param c The #cache to clean. */ @@ -177,6 +429,7 @@ static INLINE void cache_clean(struct cache *c) { free(c->vy); free(c->vz); free(c->h); + free(c->max_d); } } diff --git a/src/engine.c b/src/engine.c index 3c8ec68160c0aca9fd8c009cc0a8fdd5df88f28e..ab14af5c2e3fee19fcf2ec5963788aa8338d0568 100644 --- a/src/engine.c +++ b/src/engine.c @@ -4264,9 +4264,11 @@ void engine_init(struct engine *e, struct space *s, e->runners[k].qid = k * nr_queues / e->nr_threads; } - /* Allocate particle cache. */ - e->runners[k].par_cache.count = 0; - cache_init(&e->runners[k].par_cache, CACHE_SIZE); + /* Allocate particle caches. */ + e->runners[k].ci_cache.count = 0; + e->runners[k].cj_cache.count = 0; + cache_init(&e->runners[k].ci_cache, CACHE_SIZE); + cache_init(&e->runners[k].cj_cache, CACHE_SIZE); if (verbose) { if (with_aff) @@ -4353,7 +4355,8 @@ void engine_compute_next_snapshot_time(struct engine *e) { */ void engine_clean(struct engine *e) { - for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].par_cache); + for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].ci_cache); + for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].cj_cache); free(e->runners); free(e->snapshotUnits); free(e->links); diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 3fef18b4f487f1734a5f93c4bad46cf4e6968240..ae4ee27f5ed83350aaf1a0bfd0069d7b85701854 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -32,6 +32,7 @@ * Gadget-2 tree-code neighbours search. */ +#include "cache.h" #include "minmax.h" /** @@ -150,7 +151,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density( for (k = 0; k < 3; k++) dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); #else - error("Unknown vector size.") + error("Unknown vector size."); #endif /* Get the radius and inverse radius. */ @@ -317,7 +318,7 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, for (k = 0; k < 3; k++) dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); #else - error("Unknown vector size.") + error("Unknown vector size."); #endif /* Get the radius and inverse radius. */ @@ -374,6 +375,106 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, } #ifdef WITH_VECTORIZATION + +/** + * @brief Density interaction computed using 1 vector + * (non-symmetric vectorized version). + */ +__attribute__((always_inline)) INLINE static void +runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, + vector hi_inv, vector vix, vector viy, + vector viz, float *Vjx, float *Vjy, float *Vjz, + float *Mj, vector *rhoSum, vector *rho_dhSum, + vector *wcountSum, vector *wcount_dhSum, + vector *div_vSum, vector *curlvxSum, + vector *curlvySum, vector *curlvzSum, + vector mask, int knlMask) { + + vector r, ri, xi, wi, wi_dx; + vector mj; + vector dvx, dvy, dvz; + vector vjx, vjy, vjz; + vector dvdr; + vector curlvrx, curlvry, curlvrz; + + /* Fill the vectors. */ + mj.v = vec_load(Mj); + vjx.v = vec_load(Vjx); + vjy.v = vec_load(Vjy); + vjz.v = vec_load(Vjz); + + /* Get the radius and inverse radius. */ + ri = vec_reciprocal_sqrt(*r2); + r.v = vec_mul(r2->v, ri.v); + + xi.v = vec_mul(r.v, hi_inv.v); + + /* Calculate the kernel for two particles. */ + kernel_deval_1_vec(&xi, &wi, &wi_dx); + + /* Compute dv. */ + dvx.v = vec_sub(vix.v, vjx.v); + dvy.v = vec_sub(viy.v, vjy.v); + dvz.v = vec_sub(viz.v, vjz.v); + + /* Compute dv dot r */ + dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v))); + dvdr.v = vec_mul(dvdr.v, ri.v); + + /* Compute dv cross r */ + curlvrx.v = + vec_fma(dvy.v, dz->v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy->v))); + curlvry.v = + vec_fma(dvz.v, dx->v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz->v))); + curlvrz.v = + vec_fma(dvx.v, dy->v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx->v))); + curlvrx.v = vec_mul(curlvrx.v, ri.v); + curlvry.v = vec_mul(curlvry.v, ri.v); + curlvrz.v = vec_mul(curlvrz.v, ri.v); + +/* Mask updates to intermediate vector sums for particle pi. */ +#ifdef HAVE_AVX512_F + rhoSum->v = + _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v); + + rho_dhSum->v = + _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, + vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, + vec_mul(xi.v, wi_dx.v)))); + + wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v); + + wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask, + wcount_dhSum->v, vec_mul(xi.v, wi_dx.v)); + + div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, + vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); + + curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask, + vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), + curlvxSum->v); + + curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask, + vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), + curlvySum->v); + + curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask, + vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), + curlvzSum->v); +#else + rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v); + rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, + vec_mul(xi.v, wi_dx.v))), + mask.v); + wcountSum->v += vec_and(wi.v, mask.v); + wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v); + div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v); + curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v); + curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v); + curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v); +#endif +} + /** * @brief Density interaction computed using 2 interleaved vectors * (non-symmetric vectorized version). @@ -744,7 +845,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara, pj[3]->force.balsara); #else - error("Unknown vector size.") + error("Unknown vector size."); #endif /* Get the radius and inverse radius. */ @@ -1023,7 +1124,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara, pj[3]->force.balsara); #else - error("Unknown vector size.") + error("Unknown vector size."); #endif /* Get the radius and inverse radius. */ diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index ceaa3b7b46279e3dfe063bbda34d97233f33acbb..f634a59d7ee769951e6560d46a92053c144cc766 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -304,6 +304,41 @@ __attribute__((always_inline)) INLINE static void kernel_eval( *W = w * kernel_constant * kernel_gamma_inv_dim; } +/** + * @brief Computes the kernel function derivative. + * + * The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient + * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem. + * + * Returns 0 if \f$u > \gamma = H/h\f$. + * + * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$. + * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$. + */ +__attribute__((always_inline)) INLINE static void kernel_eval_dWdx( + float u, float *restrict dW_dx) { + + /* Go to the range [0,1[ from [0,H[ */ + const float x = u * kernel_gamma_inv; + + /* Pick the correct branch of the kernel */ + const int temp = (int)(x * kernel_ivals_f); + const int ind = temp > kernel_ivals ? kernel_ivals : temp; + const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + + /* First two terms of the polynomial ... */ + float dw_dx = ((float)kernel_degree * coeffs[0] * x) + + (float)(kernel_degree - 1) * coeffs[1]; + + /* ... and the rest of them */ + for (int k = 2; k < kernel_degree; k++) { + dw_dx = dw_dx * x + (float)(kernel_degree - k) * coeffs[k]; + } + + /* Return everything */ + *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one; +} + /* ------------------------------------------------------------------------- */ #ifdef WITH_VECTORIZATION @@ -362,7 +397,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v; } -/* Define constant vectors for the Wendland C2 kernel coefficients. */ +/* Define constant vectors for the Wendland C2 and Cubic Spline kernel + * coefficients. */ #ifdef WENDLAND_C2_KERNEL static const vector wendland_const_c0 = FILL_VEC(4.f); static const vector wendland_const_c1 = FILL_VEC(-15.f); @@ -370,8 +406,115 @@ static const vector wendland_const_c2 = FILL_VEC(20.f); static const vector wendland_const_c3 = FILL_VEC(-10.f); static const vector wendland_const_c4 = FILL_VEC(0.f); static const vector wendland_const_c5 = FILL_VEC(1.f); + +static const vector wendland_dwdx_const_c0 = FILL_VEC(20.f); +static const vector wendland_dwdx_const_c1 = FILL_VEC(-60.f); +static const vector wendland_dwdx_const_c2 = FILL_VEC(60.f); +static const vector wendland_dwdx_const_c3 = FILL_VEC(-20.f); +#elif defined(CUBIC_SPLINE_KERNEL) +/* First region 0 < u < 0.5 */ +static const vector cubic_1_const_c0 = FILL_VEC(3.f); +static const vector cubic_1_const_c1 = FILL_VEC(-3.f); +static const vector cubic_1_const_c2 = FILL_VEC(0.f); +static const vector cubic_1_const_c3 = FILL_VEC(0.5f); +static const vector cubic_1_dwdx_const_c0 = FILL_VEC(9.f); +static const vector cubic_1_dwdx_const_c1 = FILL_VEC(-6.f); +static const vector cubic_1_dwdx_const_c2 = FILL_VEC(0.f); + +/* Second region 0.5 <= u < 1 */ +static const vector cubic_2_const_c0 = FILL_VEC(-1.f); +static const vector cubic_2_const_c1 = FILL_VEC(3.f); +static const vector cubic_2_const_c2 = FILL_VEC(-3.f); +static const vector cubic_2_const_c3 = FILL_VEC(1.f); +static const vector cubic_2_dwdx_const_c0 = FILL_VEC(-3.f); +static const vector cubic_2_dwdx_const_c1 = FILL_VEC(6.f); +static const vector cubic_2_dwdx_const_c2 = FILL_VEC(-3.f); +static const vector cond = FILL_VEC(0.5f); +#endif + +/** + * @brief Computes the kernel function and its derivative for two particles + * using vectors. + * + * Return 0 if $u > \\gamma = H/h$ + * + * @param u The ratio of the distance to the smoothing length $u = x/h$. + * @param w (return) The value of the kernel function $W(x,h)$. + * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$. + */ +__attribute__((always_inline)) INLINE static void kernel_deval_1_vec( + vector *u, vector *w, vector *dw_dx) { + + /* Go to the range [0,1[ from [0,H[ */ + vector x; + x.v = vec_mul(u->v, kernel_gamma_inv_vec.v); + +#ifdef WENDLAND_C2_KERNEL + /* Init the iteration for Horner's scheme. */ + w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v); + dw_dx->v = wendland_const_c0.v; + + /* Calculate the polynomial interleaving vector operations */ + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + w->v = vec_fma(x.v, w->v, wendland_const_c2.v); + + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + w->v = vec_fma(x.v, w->v, wendland_const_c3.v); + + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */ + + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + w->v = vec_fma(x.v, w->v, wendland_const_c5.v); +#elif defined(CUBIC_SPLINE_KERNEL) + vector w2, dw_dx2; + vector mask_reg1, mask_reg2; + + /* Form a mask for each part of the kernel. */ + mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ + mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ + ; + + /* Work out w for both regions of the kernel and combine the results together + * using masks. */ + + /* Init the iteration for Horner's scheme. */ + w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v); + w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v); + dw_dx->v = cubic_1_const_c0.v; + dw_dx2.v = cubic_2_const_c0.v; + + /* Calculate the polynomial interleaving vector operations. */ + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v); + w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */ + w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v); + + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v); + w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v); + w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v); + + /* Mask out unneeded values. */ + w->v = vec_and(w->v, mask_reg1.v); + w2.v = vec_and(w2.v, mask_reg2.v); + dw_dx->v = vec_and(dw_dx->v, mask_reg1.v); + dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v); + + /* Added both w and w2 together to form complete result. */ + w->v = vec_add(w->v, w2.v); + dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); +#else +#error #endif + /* Return everything */ + w->v = + vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); +} + /** * @brief Computes the kernel function and its derivative for two particles * using interleaved vectors. @@ -417,8 +560,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); - w->v = vec_fma(x.v, w->v, wendland_const_c4.v); - w2->v = vec_fma(x2.v, w2->v, wendland_const_c4.v); + w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */ + w2->v = vec_mul(x2.v, w2->v); /* wendland_const_c4 is zero. */ dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); @@ -434,34 +577,67 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( kernel_gamma_inv_dim_plus_one_vec.v)); dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); -#else - - /* Load x and get the interval id. */ - vector ind, ind2; - ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v)); - ind2.m = vec_ftoi(vec_fmin(x2.v * kernel_ivals_vec.v, kernel_ivals_vec.v)); - - /* load the coefficients. */ - vector c[kernel_degree + 1], c2[kernel_degree + 1]; - for (int k = 0; k < VEC_SIZE; k++) - for (int j = 0; j < kernel_degree + 1; j++) { - c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j]; - c2[j].f[k] = kernel_coeffs[ind2.i[k] * (kernel_degree + 1) + j]; - } +#elif defined(CUBIC_SPLINE_KERNEL) + vector w_2, dw_dx_2; + vector w2_2, dw_dx2_2; + vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2; + + /* Form a mask for each part of the kernel. */ + mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ + mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */ + mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ + ; + mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */ + ; + + /* Work out w for both regions of the kernel and combine the results together + * using masks. */ /* Init the iteration for Horner's scheme. */ - w->v = (c[0].v * x.v) + c[1].v; - w2->v = (c2[0].v * x2.v) + c2[1].v; - dw_dx->v = c[0].v; - dw_dx2->v = c2[0].v; + w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v); + w2->v = vec_fma(cubic_1_const_c0.v, x2.v, cubic_1_const_c1.v); + w_2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v); + w2_2.v = vec_fma(cubic_2_const_c0.v, x2.v, cubic_2_const_c1.v); + dw_dx->v = cubic_1_const_c0.v; + dw_dx2->v = cubic_1_const_c0.v; + dw_dx_2.v = cubic_2_const_c0.v; + dw_dx2_2.v = cubic_2_const_c0.v; + + /* Calculate the polynomial interleaving vector operations. */ + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); + dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v); + dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v); + w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */ + w2->v = vec_mul(x2.v, w2->v); /* cubic_1_const_c2 is zero. */ + w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c2.v); + w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c2.v); + + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); + dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); + dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v); + dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v); + w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v); + w2->v = vec_fma(x2.v, w2->v, cubic_1_const_c3.v); + w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c3.v); + w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v); + + /* Mask out unneeded values. */ + w->v = vec_and(w->v, mask_reg1.v); + w2->v = vec_and(w2->v, mask_reg1_v2.v); + w_2.v = vec_and(w_2.v, mask_reg2.v); + w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v); + dw_dx->v = vec_and(dw_dx->v, mask_reg1.v); + dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v); + dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v); + dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v); + + /* Added both w and w2 together to form complete result. */ + w->v = vec_add(w->v, w_2.v); + w2->v = vec_add(w2->v, w2_2.v); + dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v); + dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v); - /* And we're off! */ - for (int k = 2; k <= kernel_degree; k++) { - dw_dx->v = (dw_dx->v * x.v) + w->v; - dw_dx2->v = (dw_dx2->v * x2.v) + w2->v; - w->v = (x.v * w->v) + c[k].v; - w2->v = (x2.v * w2->v) + c2[k].v; - } /* Return everything */ w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; w2->v = w2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; @@ -473,8 +649,133 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( #endif } +/** + * @brief Computes the kernel function for two particles + * using vectors. + * + * Return 0 if $u > \\gamma = H/h$ + * + * @param u The ratio of the distance to the smoothing length $u = x/h$. + * @param w (return) The value of the kernel function $W(x,h)$. + */ +__attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, + vector *w) { + + /* Go to the range [0,1[ from [0,H[ */ + vector x; + x.v = vec_mul(u->v, kernel_gamma_inv_vec.v); + +#ifdef WENDLAND_C2_KERNEL + /* Init the iteration for Horner's scheme. */ + w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v); + + /* Calculate the polynomial interleaving vector operations */ + w->v = vec_fma(x.v, w->v, wendland_const_c2.v); + w->v = vec_fma(x.v, w->v, wendland_const_c3.v); + w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero.*/ + w->v = vec_fma(x.v, w->v, wendland_const_c5.v); +#elif defined(CUBIC_SPLINE_KERNEL) + vector w2; + vector mask1, mask2; + + /* Form a mask for each part of the kernel. */ + mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ + mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ + ; + + /* Work out w for both regions of the kernel and combine the results together + * using masks. */ + + /* Init the iteration for Horner's scheme. */ + w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v); + w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v); + + /* Calculate the polynomial interleaving vector operations. */ + w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero */ + w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v); + + w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v); + w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v); + + /* Mask out unneeded values. */ + w->v = vec_and(w->v, mask1.v); + w2.v = vec_and(w2.v, mask2.v); + + /* Added both w and w2 together to form complete result. */ + w->v = vec_add(w->v, w2.v); +#else +#error #endif + /* Return everything */ + w->v = + vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); +} + +/** + * @brief Computes the kernel function derivative for two particles + * using vectors. + * + * Return 0 if $u > \\gamma = H/h$ + * + * @param u The ratio of the distance to the smoothing length $u = x/h$. + * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$. + */ +__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec( + vector *u, vector *dw_dx) { + + /* Go to the range [0,1[ from [0,H[ */ + vector x; + x.v = vec_mul(u->v, kernel_gamma_inv_vec.v); + +#ifdef WENDLAND_C2_KERNEL + /* Init the iteration for Horner's scheme. */ + dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v); + + /* Calculate the polynomial interleaving vector operations */ + dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v); + + dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v); + + dw_dx->v = vec_mul(dw_dx->v, x.v); + +#elif defined(CUBIC_SPLINE_KERNEL) + vector dw_dx2; + vector mask1, mask2; + + /* Form a mask for each part of the kernel. */ + mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ + mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ + ; + + /* Work out w for both regions of the kernel and combine the results together + * using masks. */ + + /* Init the iteration for Horner's scheme. */ + dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v); + dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v); + + /* Calculate the polynomial interleaving vector operations. */ + dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */ + dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v); + + /* Mask out unneeded values. */ + dw_dx->v = vec_and(dw_dx->v, mask1.v); + dw_dx2.v = vec_and(dw_dx2.v, mask2.v); + + /* Added both dwdx and dwdx2 together to form complete result. */ + dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); +#else +#error +#endif + + /* Return everything */ + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); +} + +#endif /* WITH_VECTORIZATION */ + /* Some cross-check functions */ void hydro_kernel_dump(int N); diff --git a/src/runner.c b/src/runner.c index 778eb0c7c6a11770dbe60a858ec9bcbc2c78c2d0..dba98e69373761b8ee52261a0b692d7109e183db 100644 --- a/src/runner.c +++ b/src/runner.c @@ -1790,8 +1790,13 @@ void *runner_main(void *data) { break; case task_type_pair: - if (t->subtype == task_subtype_density) + if (t->subtype == task_subtype_density) { +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) + runner_dopair1_density_vec(r, ci, cj); +#else runner_dopair1_density(r, ci, cj); +#endif + } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) runner_dopair1_gradient(r, ci, cj); diff --git a/src/runner.h b/src/runner.h index 51de9827406b014debeefba03ad09f636e3fa45f..49f7cd88a0b345cac1b29b7fd38cf59b21268012 100644 --- a/src/runner.h +++ b/src/runner.h @@ -49,8 +49,11 @@ struct runner { /*! The engine owing this runner. */ struct engine *e; - /*! The particle cache of this runner. */ - struct cache par_cache; + /*! The particle cache of cell ci. */ + struct cache ci_cache; + + /*! The particle cache of cell cj. */ + struct cache cj_cache; }; /* Function prototypes. */ diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 36d287c32798f749eeab90db73289deb8b21f906..839e492956140bd2b633d1e7c99c2b2a43f89629 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -26,6 +26,9 @@ #define PASTE(x, y) x##_##y +#define _DOPAIR1_BRANCH(f) PASTE(runner_dopair1_branch, f) +#define DOPAIR1_BRANCH _DOPAIR1_BRANCH(FUNCTION) + #define _DOPAIR1(f) PASTE(runner_dopair1, f) #define DOPAIR1 _DOPAIR1(FUNCTION) @@ -2287,8 +2290,13 @@ void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid, cj->dx_max_sort > cj->dmin * space_maxreldx) runner_do_sort(r, cj, (1 << sid), 1); - /* Compute the interactions. */ +/* Compute the interactions. */ +#if (DOPAIR1 == runner_dopair1_density) && defined(WITH_VECTORIZATION) && \ + defined(GADGET2_SPH) + runner_dopair1_density_vec(r, ci, cj); +#else DOPAIR1(r, ci, cj); +#endif } if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR); diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index d78de2cd3c508fd17a816316dc931010f2febf80..90b612684d7a8296f0d3802ec539b3b769a0e877 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -20,6 +20,8 @@ /* Config parameters. */ #include "../config.h" +#include "swift.h" + #include "active.h" /* This object's header. */ @@ -259,6 +261,98 @@ __attribute__((always_inline)) INLINE static void storeInteractions( *icount = 0; } } + +/* @brief Populates the arrays max_di and max_dj with the maximum distances of + * particles into their neighbouring cells. Also finds the first pi that + * interacts with any particle in cj and the last pj that interacts with any + * particle in ci. + * @param ci #cell pointer to ci + * @param cj #cell pointer to cj + * @param sort_i #entry array for particle distance in ci + * @param sort_j #entry array for particle distance in cj + * @param ci_cache #cache for cell ci + * @param cj_cache #cache for cell cj + * @param dx_max maximum particle movement allowed in cell + * @param rshift cutoff shift + * @param max_di array to hold the maximum distances of pi particles into cell + * cj + * @param max_dj array to hold the maximum distances of pj particles into cell + * cj + * @param init_pi first pi to interact with a pj particle + * @param init_pj last pj to interact with a pi particle + */ +__attribute__((always_inline)) INLINE static void populate_max_d_no_cache( + const struct cell *ci, const struct cell *cj, + const struct entry *restrict sort_i, const struct entry *restrict sort_j, + const float dx_max, const float rshift, float *max_di, float *max_dj, + int *init_pi, int *init_pj, const struct engine *e) { + + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + struct part *p = &parts_i[sort_i[0].i]; + + float h, d; + + /* Get the distance of the last pi and the first pj on the sorted axis.*/ + const float di_max = sort_i[ci->count - 1].d - rshift; + const float dj_min = sort_j[0].d; + + int first_pi = 0, last_pj = cj->count - 1; + + /* Find the first active particle in ci to interact with any particle in cj. + */ + /* Populate max_di with distances. */ + int active_id = ci->count - 1; + for (int k = ci->count - 1; k >= 0; k--) { + p = &parts_i[sort_i[k].i]; + h = p->h; + d = sort_i[k].d + h * kernel_gamma + dx_max - rshift; + + max_di[k] = d; + + /* If the particle is out of range set the index to + * the last active particle within range. */ + if (d < dj_min) { + first_pi = active_id; + break; + } else { + if (part_is_active(p, e)) active_id = k; + } + } + + /* Find the maximum distance of pi particles into cj.*/ + for (int k = first_pi + 1; k < ci->count; k++) { + max_di[k] = fmaxf(max_di[k - 1], max_di[k]); + } + + /* Find the last particle in cj to interact with any particle in ci. */ + /* Populate max_dj with distances. */ + active_id = 0; + for (int k = 0; k < cj->count; k++) { + p = &parts_j[sort_j[k].i]; + h = p->h; + d = sort_j[k].d - h * kernel_gamma - dx_max - rshift; + + max_dj[k] = d; + + /* If the particle is out of range set the index to + * the last active particle within range. */ + if (d > di_max) { + last_pj = active_id; + break; + } else { + if (part_is_active(p, e)) active_id = k; + } + } + + /* Find the maximum distance of pj particles into ci.*/ + for (int k = 1; k <= last_pj; k++) { + max_dj[k] = fmaxf(max_dj[k - 1], max_dj[k]); + } + + *init_pi = first_pi; + *init_pj = last_pj; +} #endif /* WITH_VECTORIZATION */ /** @@ -287,11 +381,11 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( if (!cell_is_active(c, e)) return; - if (!cell_is_drifted(c, e)) cell_drift_particles(c, e); + if (!cell_is_drifted(c, e)) error("Interacting undrifted cell."); /* Get the particle cache from the runner and re-allocate * the cache if it is not big enough for the cell. */ - struct cache *restrict cell_cache = &r->par_cache; + struct cache *restrict cell_cache = &r->ci_cache; if (cell_cache->count < count) { cache_init(cell_cache, count); @@ -354,6 +448,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( int pad = (num_vec_proc * VEC_SIZE) - rem; count_align += pad; + /* Set positions to the same as particle pi so when the r2 > 0 mask is * applied these extra contributions are masked out.*/ for (int i = count; i < count_align; i++) { @@ -394,17 +489,17 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2; v_dx_tmp.v = vec_sub(pix.v, pjx.v); - v_dy_tmp.v = vec_sub(piy.v, pjy.v); - v_dz_tmp.v = vec_sub(piz.v, pjz.v); v_dx_tmp2.v = vec_sub(pix.v, pjx2.v); + v_dy_tmp.v = vec_sub(piy.v, pjy.v); v_dy_tmp2.v = vec_sub(piy.v, pjy2.v); + v_dz_tmp.v = vec_sub(piz.v, pjz.v); v_dz_tmp2.v = vec_sub(piz.v, pjz2.v); v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); - v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); - v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v); + v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v); + v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v); /* Form a mask from r2 < hig2 and r2 > 0.*/ @@ -545,14 +640,14 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec_2( /* Get the particle cache from the runner and re-allocate * the cache if it is not big enough for the cell. */ - struct cache *restrict cell_cache = &r->par_cache; + struct cache *restrict cell_cache = &r->ci_cache; if (cell_cache->count < count) { cache_init(cell_cache, count); } /* Read the particles from the cell and store them locally in the cache. */ - cache_read_particles(c, &r->par_cache); + cache_read_particles(c, &r->ci_cache); /* Create two secondary caches. */ int icount = 0, icount_align = 0; @@ -872,3 +967,438 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec_2( TIMER_TOC(timer_doself_density); #endif /* WITH_VECTORIZATION */ } + +/** + * @brief Compute the density interactions between a cell pair (non-symmetric) + * using vector intrinsics. + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + */ +void runner_dopair1_density_vec(struct runner *r, struct cell *ci, + struct cell *cj) { + +#ifdef WITH_VECTORIZATION + const struct engine *restrict e = r->e; + + vector v_hi, v_vix, v_viy, v_viz, v_hig2; + + TIMER_TIC; + + /* Anything to do here? */ + if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; + + if (!cell_is_drifted(ci, e) || !cell_is_drifted(cj, e)) + error("Interacting undrifted cells."); + + /* Get the sort ID. */ + double shift[3] = {0.0, 0.0, 0.0}; + const int sid = space_getsid(e->s, &ci, &cj, shift); + + /* Have the cells been sorted? */ + if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin) + runner_do_sort(r, ci, (1 << sid), 1); + if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin) + runner_do_sort(r, cj, (1 << sid), 1); + + /* Get the cutoff shift. */ + double rshift = 0.0; + for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; + + /* Pick-out the sorted lists. */ + const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; + const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; + +#ifdef SWIFT_DEBUG_CHECKS + /* Check that the dx_max_sort values in the cell are indeed an upper + bound on particle movement. */ + for (int pid = 0; pid < ci->count; pid++) { + const struct part *p = &ci->parts[sort_i[pid].i]; + const float d = p->x[0] * runner_shift[sid][0] + + p->x[1] * runner_shift[sid][1] + + p->x[2] * runner_shift[sid][2]; + if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > + 1.0e-6 * max(fabsf(d), ci->dx_max_sort)) + error("particle shift diff exceeds dx_max_sort."); + } + for (int pjd = 0; pjd < cj->count; pjd++) { + const struct part *p = &cj->parts[sort_j[pjd].i]; + const float d = p->x[0] * runner_shift[sid][0] + + p->x[1] * runner_shift[sid][1] + + p->x[2] * runner_shift[sid][2]; + if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > + 1.0e-6 * max(fabsf(d), cj->dx_max_sort)) + error("particle shift diff exceeds dx_max_sort."); + } +#endif /* SWIFT_DEBUG_CHECKS */ + + /* Get some other useful values. */ + const int count_i = ci->count; + const int count_j = cj->count; + const double hi_max = ci->h_max * kernel_gamma - rshift; + const double hj_max = cj->h_max * kernel_gamma; + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + const double di_max = sort_i[count_i - 1].d - rshift; + const double dj_min = sort_j[0].d; + const float dx_max = (ci->dx_max_sort + cj->dx_max_sort); + + /* Check if any particles are active and return if there are not. */ + int numActive = 0; + for (int pid = count_i - 1; + pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) { + struct part *restrict pi = &parts_i[sort_i[pid].i]; + if (part_is_active(pi, e)) { + numActive++; + break; + } + } + + if (!numActive) { + for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; + pjd++) { + struct part *restrict pj = &parts_j[sort_j[pjd].i]; + if (part_is_active(pj, e)) { + numActive++; + break; + } + } + } + + if (numActive == 0) return; + + /* Get both particle caches from the runner and re-allocate + * them if they are not big enough for the cells. */ + struct cache *restrict ci_cache = &r->ci_cache; + struct cache *restrict cj_cache = &r->cj_cache; + + if (ci_cache->count < count_i) { + cache_init(ci_cache, count_i); + } + if (cj_cache->count < count_j) { + cache_init(cj_cache, count_j); + } + + int first_pi, last_pj; + float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE))); + float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE))); + + max_di = r->ci_cache.max_d; + max_dj = r->cj_cache.max_d; + + /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ + /* Also find the first pi that interacts with any particle in cj and the last + * pj that interacts with any particle in ci. */ + populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di, + max_dj, &first_pi, &last_pj, e); + + /* Find the maximum index into cj that is required by a particle in ci. */ + /* Find the maximum index into ci that is required by a particle in cj. */ + float di, dj; + int max_ind_j = count_j - 1; + int max_ind_i = 0; + + dj = sort_j[max_ind_j].d; + while (max_ind_j > 0 && max_di[count_i - 1] < dj) { + max_ind_j--; + + dj = sort_j[max_ind_j].d; + } + + di = sort_i[max_ind_i].d; + while (max_ind_i < count_i - 1 && max_dj[0] > di) { + max_ind_i++; + + di = sort_i[max_ind_i].d; + } + + /* Limits of the outer loops. */ + int first_pi_loop = first_pi; + int last_pj_loop = last_pj; + + /* Take the max/min of both values calculated to work out how many particles + * to read into the cache. */ + last_pj = max(last_pj, max_ind_j); + first_pi = min(first_pi, max_ind_i); + + /* Read the needed particles into the two caches. */ + int first_pi_align = first_pi; + int last_pj_align = last_pj; + cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i, + sort_j, shift, &first_pi_align, + &last_pj_align, 1); + + /* Get the number of particles read into the ci cache. */ + int ci_cache_count = count_i - first_pi_align; + + if (cell_is_active(ci, e)) { + + /* Loop over the parts in ci. */ + for (int pid = count_i - 1; pid >= first_pi_loop && max_ind_j >= 0; pid--) { + + /* Get a hold of the ith part in ci. */ + struct part *restrict pi = &parts_i[sort_i[pid].i]; + if (!part_is_active(pi, e)) continue; + + /* Determine the exit iteration of the interaction loop. */ + dj = sort_j[max_ind_j].d; + while (max_ind_j > 0 && max_di[pid] < dj) { + max_ind_j--; + + dj = sort_j[max_ind_j].d; + } + int exit_iteration = max_ind_j + 1; + + /* Set the cache index. */ + int ci_cache_idx = pid - first_pi_align; + + const float hi = ci_cache->h[ci_cache_idx]; + const float hig2 = hi * hi * kernel_gamma2; + + vector pix, piy, piz; + + /* Fill particle pi vectors. */ + pix.v = vec_set1(ci_cache->x[ci_cache_idx]); + piy.v = vec_set1(ci_cache->y[ci_cache_idx]); + piz.v = vec_set1(ci_cache->z[ci_cache_idx]); + v_hi.v = vec_set1(hi); + v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]); + v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]); + v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]); + + v_hig2.v = vec_set1(hig2); + + /* Reset cumulative sums of update vectors. */ + vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, + curlvySum, curlvzSum; + + /* Get the inverse of hi. */ + vector v_hi_inv; + + v_hi_inv = vec_reciprocal(v_hi); + + rhoSum.v = vec_setzero(); + rho_dhSum.v = vec_setzero(); + wcountSum.v = vec_setzero(); + wcount_dhSum.v = vec_setzero(); + div_vSum.v = vec_setzero(); + curlvxSum.v = vec_setzero(); + curlvySum.v = vec_setzero(); + curlvzSum.v = vec_setzero(); + + /* Pad the exit iteration if there is a serial remainder. */ + int exit_iteration_align = exit_iteration; + int rem = exit_iteration % VEC_SIZE; + if (rem != 0) { + int pad = VEC_SIZE - rem; + + if (exit_iteration_align + pad <= last_pj_align + 1) + exit_iteration_align += pad; + } + + vector pjx, pjy, pjz; + + /* Loop over the parts in cj. */ + for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) { + + /* Get the cache index to the jth particle. */ + int cj_cache_idx = pjd; + + vector v_dx, v_dy, v_dz, v_r2; + +#ifdef SWIFT_DEBUG_CHECKS + if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0) { + error("Unaligned read!!! cj_cache_idx=%d", cj_cache_idx); + } +#endif + + /* Load 2 sets of vectors from the particle cache. */ + pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); + pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); + pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(pix.v, pjx.v); + v_dy.v = vec_sub(piy.v, pjy.v); + v_dz.v = vec_sub(piz.v, pjz.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + + vector v_doi_mask; + int doi_mask; + + /* Form r2 < hig2 mask. */ + v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); + + /* Form integer mask. */ + doi_mask = vec_cmp_result(v_doi_mask.v); + + /* If there are any interactions perform them. */ + if (doi_mask) + runner_iact_nonsym_1_vec_density( + &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz, + &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], + &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum, + &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, + &curlvySum, &curlvzSum, v_doi_mask, +#ifdef HAVE_AVX512_F + knl_mask); +#else + 0); +#endif + + } /* loop over the parts in cj. */ + + /* Perform horizontal adds on vector sums and store result in particle pi. + */ + VEC_HADD(rhoSum, pi->rho); + VEC_HADD(rho_dhSum, pi->density.rho_dh); + VEC_HADD(wcountSum, pi->density.wcount); + VEC_HADD(wcount_dhSum, pi->density.wcount_dh); + VEC_HADD(div_vSum, pi->density.div_v); + VEC_HADD(curlvxSum, pi->density.rot_v[0]); + VEC_HADD(curlvySum, pi->density.rot_v[1]); + VEC_HADD(curlvzSum, pi->density.rot_v[2]); + + } /* loop over the parts in ci. */ + } + + if (cell_is_active(cj, e)) { + /* Loop over the parts in cj. */ + for (int pjd = 0; pjd <= last_pj_loop && max_ind_i < count_i; pjd++) { + + /* Get a hold of the jth part in cj. */ + struct part *restrict pj = &parts_j[sort_j[pjd].i]; + if (!part_is_active(pj, e)) continue; + + /* Determine the exit iteration of the interaction loop. */ + di = sort_i[max_ind_i].d; + while (max_ind_i < count_i - 1 && max_dj[pjd] > di) { + max_ind_i++; + + di = sort_i[max_ind_i].d; + } + int exit_iteration = max_ind_i; + + /* Set the cache index. */ + int cj_cache_idx = pjd; + + const float hj = cj_cache->h[cj_cache_idx]; + const float hjg2 = hj * hj * kernel_gamma2; + + vector pjx, pjy, pjz; + vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2; + + /* Fill particle pi vectors. */ + pjx.v = vec_set1(cj_cache->x[cj_cache_idx]); + pjy.v = vec_set1(cj_cache->y[cj_cache_idx]); + pjz.v = vec_set1(cj_cache->z[cj_cache_idx]); + v_hj.v = vec_set1(hj); + v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]); + v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]); + v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]); + + v_hjg2.v = vec_set1(hjg2); + + /* Reset cumulative sums of update vectors. */ + vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, + curlvySum, curlvzSum; + + /* Get the inverse of hj. */ + vector v_hj_inv; + + v_hj_inv = vec_reciprocal(v_hj); + + rhoSum.v = vec_setzero(); + rho_dhSum.v = vec_setzero(); + wcountSum.v = vec_setzero(); + wcount_dhSum.v = vec_setzero(); + div_vSum.v = vec_setzero(); + curlvxSum.v = vec_setzero(); + curlvySum.v = vec_setzero(); + curlvzSum.v = vec_setzero(); + + vector pix, piy, piz; + + /* Convert exit iteration to cache indices. */ + int exit_iteration_align = exit_iteration - first_pi_align; + + /* Pad the exit iteration align so cache reads are aligned. */ + int rem = exit_iteration_align % VEC_SIZE; + if (exit_iteration_align < VEC_SIZE) { + exit_iteration_align = 0; + } else + exit_iteration_align -= rem; + + /* Loop over the parts in ci. */ + for (int ci_cache_idx = exit_iteration_align; + ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) { + +#ifdef SWIFT_DEBUG_CHECKS + if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0) { + error("Unaligned read!!! ci_cache_idx=%d", ci_cache_idx); + } +#endif + + vector v_dx, v_dy, v_dz, v_r2; + + /* Load 2 sets of vectors from the particle cache. */ + pix.v = vec_load(&ci_cache->x[ci_cache_idx]); + piy.v = vec_load(&ci_cache->y[ci_cache_idx]); + piz.v = vec_load(&ci_cache->z[ci_cache_idx]); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(pjx.v, pix.v); + v_dy.v = vec_sub(pjy.v, piy.v); + v_dz.v = vec_sub(pjz.v, piz.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + + vector v_doj_mask; + int doj_mask; + + /* Form r2 < hig2 mask. */ + v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v); + + /* Form integer mask. */ + doj_mask = vec_cmp_result(v_doj_mask.v); + + /* If there are any interactions perform them. */ + if (doj_mask) + runner_iact_nonsym_1_vec_density( + &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz, + &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], + &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum, + &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, + &curlvySum, &curlvzSum, v_doj_mask, +#ifdef HAVE_AVX512_F + knl_mask); +#else + 0); +#endif + + } /* loop over the parts in ci. */ + + /* Perform horizontal adds on vector sums and store result in particle pj. + */ + VEC_HADD(rhoSum, pj->rho); + VEC_HADD(rho_dhSum, pj->density.rho_dh); + VEC_HADD(wcountSum, pj->density.wcount); + VEC_HADD(wcount_dhSum, pj->density.wcount_dh); + VEC_HADD(div_vSum, pj->density.div_v); + VEC_HADD(curlvxSum, pj->density.rot_v[0]); + VEC_HADD(curlvySum, pj->density.rot_v[1]); + VEC_HADD(curlvzSum, pj->density.rot_v[2]); + + } /* loop over the parts in cj. */ + + TIMER_TOC(timer_dopair_density); + } + +#endif /* WITH_VECTORIZATION */ +} diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h index 9bb24f12cedf03ec49a5a03f92d308f92d49aa54..e252083ae743248a5f23d1772c2e770c5e1c6c14 100644 --- a/src/runner_doiact_vec.h +++ b/src/runner_doiact_vec.h @@ -24,6 +24,7 @@ #include "../config.h" /* Local headers */ +#include "active.h" #include "cell.h" #include "engine.h" #include "hydro.h" @@ -35,5 +36,7 @@ /* Function prototypes. */ void runner_doself1_density_vec(struct runner *r, struct cell *restrict c); void runner_doself1_density_vec_2(struct runner *r, struct cell *restrict c); +void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci, + struct cell *restrict cj); #endif /* SWIFT_RUNNER_VEC_H */ diff --git a/src/vector.h b/src/vector.h index 5582eecf93b6e7a9f82b885cda1f6c70d6bf059a..7d82b9f5c9e2bc6e1fea9426ab3870eeec180408 100644 --- a/src/vector.h +++ b/src/vector.h @@ -74,6 +74,7 @@ #define vec_cmp_gt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ) #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ) #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ) +#define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ) #define vec_and(a, b) _mm512_and_ps(a, b) #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1)) @@ -106,8 +107,15 @@ } /* Performs a horizontal add on the vector and adds the result to a float. */ +#ifdef __ICC #define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v) - +#else /* _mm512_reduce_add_ps not present in GCC compiler. \ + TODO: Implement intrinsic version.*/ +#define VEC_HADD(a, b) \ + { \ + for (int i = 0; i < VEC_SIZE; i++) b += a.f[i]; \ + } +#endif /* Calculates the number of set bits in the mask and adds the result to an int. */ #define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \ @@ -147,6 +155,7 @@ #define vec_cmp_lt(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ) #define vec_cmp_gt(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ) #define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ) +#define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) #define vec_cmp_result(a) _mm256_movemask_ps(a) #define vec_and(a, b) _mm256_and_ps(a, b) #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0)) diff --git a/tests/test27cells.c b/tests/test27cells.c index 2e7e9b63c9e8f09239fc1ba145fabcf30d09e977..bd827b68e90ea5f4e9d5577612e6cecda2edf83a 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -34,7 +34,9 @@ #if defined(WITH_VECTORIZATION) #define DOSELF1 runner_doself1_density_vec +#define DOPAIR1 runner_dopair1_density_vec #define DOSELF1_NAME "runner_doself1_density_vec" +#define DOPAIR1_NAME "runner_dopair1_density_vec" #endif #ifndef DOSELF1 @@ -42,6 +44,11 @@ #define DOSELF1_NAME "runner_doself1_density" #endif +#ifndef DOPAIR1 +#define DOPAIR1 runner_dopair1_density +#define DOPAIR1_NAME "runner_dopair1_density" +#endif + enum velocity_types { velocity_zero, velocity_random, @@ -303,9 +310,11 @@ int check_results(struct part *serial_parts, struct part *vec_parts, int count, } /* Just a forward declaration... */ -void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); void runner_doself1_density(struct runner *r, struct cell *ci); void runner_doself1_density_vec(struct runner *r, struct cell *ci); +void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); +void runner_dopair1_density_vec(struct runner *r, struct cell *ci, + struct cell *cj); /* And go... */ int main(int argc, char *argv[]) { @@ -384,7 +393,8 @@ int main(int argc, char *argv[]) { } /* Help users... */ - message("Function called: %s", DOSELF1_NAME); + message("DOSELF1 function called: %s", DOSELF1_NAME); + message("DOPAIR1 function called: %s", DOPAIR1_NAME); message("Vector size: %d", VEC_SIZE); message("Adiabatic index: ga = %f", hydro_gamma); message("Hydro implementation: %s", SPH_IMPLEMENTATION); @@ -450,24 +460,26 @@ int main(int argc, char *argv[]) { #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION)) +#ifdef WITH_VECTORIZATION + runner.ci_cache.count = 0; + cache_init(&runner.ci_cache, 512); + runner.cj_cache.count = 0; + cache_init(&runner.cj_cache, 512); +#endif + /* Run all the pairs */ for (int j = 0; j < 27; ++j) { if (cells[j] != main_cell) { const ticks sub_tic = getticks(); - runner_dopair1_density(&runner, main_cell, cells[j]); + DOPAIR1(&runner, main_cell, cells[j]); const ticks sub_toc = getticks(); timings[j] += sub_toc - sub_tic; } } -/* And now the self-interaction */ -#ifdef WITH_VECTORIZATION - runner.par_cache.count = 0; - cache_init(&runner.par_cache, 512); -#endif - + /* And now the self-interaction */ const ticks self_tic = getticks(); DOSELF1(&runner, main_cell); diff --git a/tests/testKernel.c b/tests/testKernel.c index 344c038be0020019cc989cca6969b4e65bfd973f..13f4e36534eb11a4c8f7ba9c19a48de6599e31f5 100644 --- a/tests/testKernel.c +++ b/tests/testKernel.c @@ -53,7 +53,7 @@ int main() { printf("\nVector Output for VEC_SIZE=%d\n", VEC_SIZE); printf("-------------\n"); -#ifdef WITH_VECORIZATION +#ifdef WITH_VECTORIZATION for (int i = 0; i < numPoints; i += VEC_SIZE) {