diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index b57994d24059596426be8f13db5318e619297d59..4603d90b93c8af5dcbb6ef7360c1726e089a7f02 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -1142,7 +1142,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( #ifdef WITH_VECTORIZATION __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force( float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector *hi_inv, float *Hj_inv, - vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, vector mask) { + vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, mask_t mask) { #ifdef WITH_VECTORIZATION @@ -1372,7 +1372,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force( float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector *hi_inv, float *Hj_inv, - vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, vector mask, vector mask_2) { + vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, mask_t mask, mask_t mask_2) { #ifdef WITH_VECTORIZATION @@ -1527,18 +1527,18 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force entropy_dt_2.v = mj_2.v * visc_term_2.v * dvdr_2.v; /* Store the forces back on the particles. */ - a_hydro_xSum->v -= vec_and(piax.v, mask.v); - a_hydro_xSum->v -= vec_and(piax_2.v, mask_2.v); - a_hydro_ySum->v -= vec_and(piay.v, mask.v); - a_hydro_ySum->v -= vec_and(piay_2.v, mask_2.v); - a_hydro_zSum->v -= vec_and(piaz.v, mask.v); - a_hydro_zSum->v -= vec_and(piaz_2.v, mask_2.v); - h_dtSum->v -= vec_and(pih_dt.v, mask.v); - h_dtSum->v -= vec_and(pih_dt_2.v, mask_2.v); - v_sigSum->v = vec_fmax(v_sigSum->v, vec_and(v_sig.v, mask.v)); - v_sigSum->v = vec_fmax(v_sigSum->v, vec_and(v_sig_2.v, mask_2.v)); - entropy_dtSum->v += vec_and(entropy_dt.v,mask.v); - entropy_dtSum->v += vec_and(entropy_dt_2.v,mask_2.v); + a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask); + a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax_2.v, mask_2); + a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask); + a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay_2.v, mask_2); + a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz.v, mask); + a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz_2.v, mask_2); + h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt.v, mask); + h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt_2.v, mask_2); + v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig, mask)); + v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig_2, mask_2)); + entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask); + entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt_2.v, mask_2); #else diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index f2ecc98a119079dfafba0ebd52b47c8533110cf8..3f44509f482b6493111de3823cdd93940d32cfc8 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -276,10 +276,7 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions( vector *v_rhoi, vector *v_grad_hi, vector *v_pOrhoi2, vector *v_balsara_i, vector *v_ci, int *icount_align, int num_vec_proc) { -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask, knl_mask2; -#endif - vector int_mask, int_mask2; + mask_t int_mask, int_mask2; /* Work out the number of remainder interactions and pad secondary cache. */ *icount_align = icount; @@ -288,16 +285,10 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions( int pad = (num_vec_proc * VEC_SIZE) - rem; *icount_align += pad; -/* Initialise masks to true. */ -#ifdef HAVE_AVX512_F - knl_mask = 0xFFFF; - knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#endif + /* Initialise masks to true. */ + vec_init_mask(int_mask); + vec_init_mask(int_mask2); + /* Pad secondary cache so that there are no contributions in the interaction * function. */ for (int i = icount; i < *icount_align; i++) { @@ -319,19 +310,10 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions( /* Zero parts of mask that represent the padded values.*/ if (pad < VEC_SIZE) { -#ifdef HAVE_AVX512_F - knl_mask2 = knl_mask2 >> pad; -#else - for (int i = VEC_SIZE - pad; i < VEC_SIZE; i++) int_mask2.i[i] = 0; -#endif + vec_pad_mask(int_mask2,pad); } else { -#ifdef HAVE_AVX512_F - knl_mask = knl_mask >> (VEC_SIZE - rem); - knl_mask2 = 0; -#else - for (int i = rem; i < VEC_SIZE; i++) int_mask.i[i] = 0; - int_mask2.v = vec_setzero(); -#endif + vec_pad_mask(int_mask,VEC_SIZE - rem); + vec_zero_mask(int_mask2); } /* Perform remainder interaction and remove remainder from aligned @@ -341,12 +323,7 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions( runner_iact_nonsym_2_vec_force( &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align], &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, &int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align], &int_cache->rhoq[*icount_align], &int_cache->grad_hq[*icount_align], &int_cache->pOrho2q[*icount_align], &int_cache->balsaraq[*icount_align], &int_cache->soundspeedq[*icount_align], &int_cache->mq[*icount_align], v_hi_inv, &int_cache->h_invq[*icount_align], - a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask, int_mask2 -#ifdef HAVE_AVX512_F - ,knl_mask, knl_mask2); -#else - ); -#endif + a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask, int_mask2); } } @@ -992,8 +969,7 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( VEC_HADD(a_hydro_ySum, pi->a_hydro[1]); VEC_HADD(a_hydro_zSum, pi->a_hydro[2]); VEC_HADD(h_dtSum, pi->force.h_dt); - for(int k=0; k<VEC_SIZE; k++) - pi->force.v_sig = max(pi->force.v_sig, v_sigSum.f[k]); + VEC_HMAX(v_sigSum, pi->force.v_sig); VEC_HADD(entropy_dtSum, pi->entropy_dt); /* Reset interaction count. */ diff --git a/src/vector.h b/src/vector.h index 0c8f1788316f657f46da43a588d4ab7380a73c62..15dd0d1d83ba24345ab92b2c6c6522327d18731e 100644 --- a/src/vector.h +++ b/src/vector.h @@ -81,6 +81,7 @@ #define vec_form_int_mask(a) a #define vec_and(a, b) _mm512_and_ps(a, b) #define vec_mask_and(a, b) a & b +#define vec_and_mask(a, mask) _mm512_maskz_expand_ps(mask, a) #define vec_init_mask(mask) mask = 0xFFFF #define vec_zero_mask(mask) mask = 0 #define vec_create_mask(mask, cond) mask = cond @@ -130,6 +131,9 @@ #define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \ pack += __builtin_popcount(mask); +/* Finds the horizontal maximum of vector b and returns a float. */ +#define VEC_HMAX(a, b) a = _mm512_reduce_max_ps(b.v) + /* Performs a left-pack on a vector based upon a mask and returns the result. */ #define VEC_LEFT_PACK(a, mask, result) \ _mm512_mask_compressstoreu_ps(result, mask, a) @@ -171,6 +175,7 @@ #define vec_form_int_mask(a) _mm256_movemask_ps(a.v) #define vec_and(a, b) _mm256_and_ps(a, b) #define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v) +#define vec_and_mask(a, mask) vec_mask_and(a, mask) #define vec_init_mask(mask) mask.m = vec_setint1(0xFFFFFFFF) #define vec_create_mask(mask, cond) mask.v = cond #define vec_zero_mask(mask) mask.v = vec_setzero() @@ -201,13 +206,10 @@ b += a.f[0] + a.f[4]; /* Performs a horizontal maximum on the vector and takes the maximum of the result with a float, b. */ -#define VEC_HMAX(a, b) \ -{ \ -__m256 y = _mm256_permute2f128_ps(a.v, a.v, 1); /* Permute 128-bit values, y = [a.high, a.low] */ \ -__m256 m1 = _mm256_max_ps(a.v, y); /* m1[0] = max(x[0], x[3]), m1[1] = max(x[1], x[4]), etc. */ \ -__m256 m2 = _mm256_permute_ps(m1, 177); /* Set m2[0] = m1[1], m2[1] = m1[0], m2[2] = m1[3] etc. */\ -__m256 m = _mm256_max_ps(m1, m2); /* m[0] and m[7] contain maximums of each part of vector. */ \ - b = fmaxf(fmaxf(b,m[0]),m[7]); \ +#define VEC_HMAX(a, b) \ +{ \ + for(int k=0; k<VEC_SIZE; k++) \ + b = max(b, a.f[k]); \ } /* Returns the lower 128-bits of the 256-bit vector. */