Commit f194e08b authored by James Willis's avatar James Willis
Browse files

Use mask_t in runner_doself2_force_vec.

parent caea34d3
......@@ -1142,7 +1142,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
#ifdef WITH_VECTORIZATION
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force(
float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector *hi_inv, float *Hj_inv,
vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, vector mask) {
vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, mask_t mask) {
#ifdef WITH_VECTORIZATION
......@@ -1372,7 +1372,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force(
float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector *hi_inv, float *Hj_inv,
vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, vector mask, vector mask_2) {
vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, mask_t mask, mask_t mask_2) {
#ifdef WITH_VECTORIZATION
......@@ -1527,18 +1527,18 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
entropy_dt_2.v = mj_2.v * visc_term_2.v * dvdr_2.v;
/* Store the forces back on the particles. */
a_hydro_xSum->v -= vec_and(piax.v, mask.v);
a_hydro_xSum->v -= vec_and(piax_2.v, mask_2.v);
a_hydro_ySum->v -= vec_and(piay.v, mask.v);
a_hydro_ySum->v -= vec_and(piay_2.v, mask_2.v);
a_hydro_zSum->v -= vec_and(piaz.v, mask.v);
a_hydro_zSum->v -= vec_and(piaz_2.v, mask_2.v);
h_dtSum->v -= vec_and(pih_dt.v, mask.v);
h_dtSum->v -= vec_and(pih_dt_2.v, mask_2.v);
v_sigSum->v = vec_fmax(v_sigSum->v, vec_and(v_sig.v, mask.v));
v_sigSum->v = vec_fmax(v_sigSum->v, vec_and(v_sig_2.v, mask_2.v));
entropy_dtSum->v += vec_and(entropy_dt.v,mask.v);
entropy_dtSum->v += vec_and(entropy_dt_2.v,mask_2.v);
a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask);
a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax_2.v, mask_2);
a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask);
a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay_2.v, mask_2);
a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz.v, mask);
a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz_2.v, mask_2);
h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt.v, mask);
h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt_2.v, mask_2);
v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig, mask));
v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig_2, mask_2));
entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask);
entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt_2.v, mask_2);
#else
......
......@@ -276,10 +276,7 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions(
vector *v_rhoi, vector *v_grad_hi, vector *v_pOrhoi2, vector *v_balsara_i, vector *v_ci,
int *icount_align, int num_vec_proc) {
#ifdef HAVE_AVX512_F
KNL_MASK_16 knl_mask, knl_mask2;
#endif
vector int_mask, int_mask2;
mask_t int_mask, int_mask2;
/* Work out the number of remainder interactions and pad secondary cache. */
*icount_align = icount;
......@@ -288,16 +285,10 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions(
int pad = (num_vec_proc * VEC_SIZE) - rem;
*icount_align += pad;
/* Initialise masks to true. */
#ifdef HAVE_AVX512_F
knl_mask = 0xFFFF;
knl_mask2 = 0xFFFF;
int_mask.m = vec_setint1(0xFFFFFFFF);
int_mask2.m = vec_setint1(0xFFFFFFFF);
#else
int_mask.m = vec_setint1(0xFFFFFFFF);
int_mask2.m = vec_setint1(0xFFFFFFFF);
#endif
/* Initialise masks to true. */
vec_init_mask(int_mask);
vec_init_mask(int_mask2);
/* Pad secondary cache so that there are no contributions in the interaction
* function. */
for (int i = icount; i < *icount_align; i++) {
......@@ -319,19 +310,10 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions(
/* Zero parts of mask that represent the padded values.*/
if (pad < VEC_SIZE) {
#ifdef HAVE_AVX512_F
knl_mask2 = knl_mask2 >> pad;
#else
for (int i = VEC_SIZE - pad; i < VEC_SIZE; i++) int_mask2.i[i] = 0;
#endif
vec_pad_mask(int_mask2,pad);
} else {
#ifdef HAVE_AVX512_F
knl_mask = knl_mask >> (VEC_SIZE - rem);
knl_mask2 = 0;
#else
for (int i = rem; i < VEC_SIZE; i++) int_mask.i[i] = 0;
int_mask2.v = vec_setzero();
#endif
vec_pad_mask(int_mask,VEC_SIZE - rem);
vec_zero_mask(int_mask2);
}
/* Perform remainder interaction and remove remainder from aligned
......@@ -341,12 +323,7 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions(
runner_iact_nonsym_2_vec_force(
&int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align], &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci,
&int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align], &int_cache->rhoq[*icount_align], &int_cache->grad_hq[*icount_align], &int_cache->pOrho2q[*icount_align], &int_cache->balsaraq[*icount_align], &int_cache->soundspeedq[*icount_align], &int_cache->mq[*icount_align], v_hi_inv, &int_cache->h_invq[*icount_align],
a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask, int_mask2
#ifdef HAVE_AVX512_F
,knl_mask, knl_mask2);
#else
);
#endif
a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask, int_mask2);
}
}
......@@ -992,8 +969,7 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
VEC_HADD(a_hydro_ySum, pi->a_hydro[1]);
VEC_HADD(a_hydro_zSum, pi->a_hydro[2]);
VEC_HADD(h_dtSum, pi->force.h_dt);
for(int k=0; k<VEC_SIZE; k++)
pi->force.v_sig = max(pi->force.v_sig, v_sigSum.f[k]);
VEC_HMAX(v_sigSum, pi->force.v_sig);
VEC_HADD(entropy_dtSum, pi->entropy_dt);
/* Reset interaction count. */
......
......@@ -81,6 +81,7 @@
#define vec_form_int_mask(a) a
#define vec_and(a, b) _mm512_and_ps(a, b)
#define vec_mask_and(a, b) a & b
#define vec_and_mask(a, mask) _mm512_maskz_expand_ps(mask, a)
#define vec_init_mask(mask) mask = 0xFFFF
#define vec_zero_mask(mask) mask = 0
#define vec_create_mask(mask, cond) mask = cond
......@@ -130,6 +131,9 @@
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
pack += __builtin_popcount(mask);
/* Finds the horizontal maximum of vector b and returns a float. */
#define VEC_HMAX(a, b) a = _mm512_reduce_max_ps(b.v)
/* Performs a left-pack on a vector based upon a mask and returns the result. */
#define VEC_LEFT_PACK(a, mask, result) \
_mm512_mask_compressstoreu_ps(result, mask, a)
......@@ -171,6 +175,7 @@
#define vec_form_int_mask(a) _mm256_movemask_ps(a.v)
#define vec_and(a, b) _mm256_and_ps(a, b)
#define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v)
#define vec_and_mask(a, mask) vec_mask_and(a, mask)
#define vec_init_mask(mask) mask.m = vec_setint1(0xFFFFFFFF)
#define vec_create_mask(mask, cond) mask.v = cond
#define vec_zero_mask(mask) mask.v = vec_setzero()
......@@ -201,13 +206,10 @@
b += a.f[0] + a.f[4];
/* Performs a horizontal maximum on the vector and takes the maximum of the result with a float, b. */
#define VEC_HMAX(a, b) \
{ \
__m256 y = _mm256_permute2f128_ps(a.v, a.v, 1); /* Permute 128-bit values, y = [a.high, a.low] */ \
__m256 m1 = _mm256_max_ps(a.v, y); /* m1[0] = max(x[0], x[3]), m1[1] = max(x[1], x[4]), etc. */ \
__m256 m2 = _mm256_permute_ps(m1, 177); /* Set m2[0] = m1[1], m2[1] = m1[0], m2[2] = m1[3] etc. */\
__m256 m = _mm256_max_ps(m1, m2); /* m[0] and m[7] contain maximums of each part of vector. */ \
b = fmaxf(fmaxf(b,m[0]),m[7]); \
#define VEC_HMAX(a, b) \
{ \
for(int k=0; k<VEC_SIZE; k++) \
b = max(b, a.f[k]); \
}
/* Returns the lower 128-bits of the 256-bit vector. */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment