diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index 671502b6d275890b1faf08fd9c4a04042b4c3a4f..07c8fa4dc31c57c8e3ce0d2023f51f46c70f3431 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -176,7 +176,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
  * @param v_viz #vector of z velocity of pi.
  */
 __attribute__((always_inline)) INLINE static void storeInteractions(
-    const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
+    const short mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
     vector *v_dz, const struct cache *const cell_cache,
     struct c2_cache *const int_cache, int *icount, vector *rhoSum, vector *rho_dhSum,
     vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,
@@ -624,7 +624,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
 
 #ifdef WITH_VECTORIZATION
   const struct engine *e = r->e;
-  int doi_mask;
   struct part *restrict pi;
   int count_align;
   int num_vec_proc = NUM_VEC_PROC;
@@ -749,36 +748,26 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
       v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
 
-/* Form a mask from r2 < hig2 and r2 > 0.*/
-#ifdef HAVE_AVX512_F
-      // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check;
-      KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check;
-
-      doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
-      doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
-
-      doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
-      doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
-
-      doi_mask = doi_mask & doi_mask_check;
-      doi_mask2 = doi_mask2 & doi_mask2_check;
-
-#else
-      vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check;
-      int doi_mask2;
+      /* Form a mask from r2 < hig2 and r2 > 0.*/
+      mask_t v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check;
+      short doi_mask, doi_mask2;
 
       /* Form r2 > 0 mask and r2 < hig2 mask. */
-      v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero());
-      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+      v_doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
+      v_doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
 
       /* Form r2 > 0 mask and r2 < hig2 mask. */
-      v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero());
-      v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+      v_doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      v_doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
 
-      /* Combine two masks and form integer mask. */
-      doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v));
-      doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v));
-#endif /* HAVE_AVX512_F */
+      /* Combine the two masks. */
+      mask_t doi_mask_combi, doi_mask2_combi;
+      doi_mask_combi = vec_mask_and(v_doi_mask, v_doi_mask_check);
+      doi_mask2_combi = vec_mask_and(v_doi_mask2, v_doi_mask2_check);
+
+      /* Form integer mask. */
+      doi_mask = vec_cmp_result(doi_mask_combi);
+      doi_mask2 = vec_cmp_result(doi_mask2_combi);
 
       /* If there are any interactions left pack interaction values into c2
        * cache. */
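Note on the change above: the runner now expresses both distance checks through the
instruction-set-agnostic mask_t type and collapses them to an integer bit-mask with
vec_cmp_result(), so the AVX-512 and AVX code paths share a single implementation. A
minimal sketch of the pattern, assuming vector.h (see the next file) is included; the
helper name any_interactions_sketch is illustrative and not part of this patch:

  #include "vector.h"

  /* Returns non-zero if any vector lane satisfies 0 < r2 < hig2. */
  static int any_interactions_sketch(vector v_r2, vector v_hig2) {

    /* Lane-wise r2 > 0 and r2 < hig2 tests, expressed as mask_t. */
    mask_t v_doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
    mask_t v_doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);

    /* Combine the two masks with the new ISA-agnostic helper. */
    mask_t doi_mask_combi = vec_mask_and(v_doi_mask, v_doi_mask_check);

    /* Collapse to an integer bit-mask, one bit per lane: a movemask on AVX,
     * the __mmask16 value itself on AVX-512. */
    const short doi_mask = vec_cmp_result(doi_mask_combi);

    return doi_mask != 0;
  }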
diff --git a/src/vector.h b/src/vector.h
index ba803a666954fe8c22e7ba5a01147c22cd7f028a..6f235c0549a2557e72f8b186af5beff80bd7200f 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -60,7 +60,9 @@
 #define vec_dbl_set(a, b, c, d, e, f, g, h) \
   _mm512_set_pd(h, g, f, e, d, c, b, a)
 #define vec_add(a, b) _mm512_add_ps(a, b)
+#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a)
 #define vec_sub(a, b) _mm512_sub_ps(a, b)
+#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b)
 #define vec_mul(a, b) _mm512_mul_ps(a, b)
 #define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
 #define vec_sqrt(a) _mm512_sqrt_ps(a)
@@ -75,7 +77,9 @@
 #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
 #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
 #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
+#define vec_cmp_result(a) a
 #define vec_and(a, b) _mm512_and_ps(a, b)
+#define vec_mask_and(a, b) a & b
 #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
@@ -142,7 +146,9 @@
 #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
 #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
 #define vec_add(a, b) _mm256_add_ps(a, b)
+#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b, mask))
 #define vec_sub(a, b) _mm256_sub_ps(a, b)
+#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask))
 #define vec_mul(a, b) _mm256_mul_ps(a, b)
 #define vec_sqrt(a) _mm256_sqrt_ps(a)
 #define vec_rcp(a) _mm256_rcp_ps(a)
@@ -158,6 +164,7 @@
 #define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
 #define vec_cmp_result(a) _mm256_movemask_ps(a)
 #define vec_and(a, b) _mm256_and_ps(a, b)
+#define vec_mask_and(a, b) _mm256_and_ps(a, b)
 #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
@@ -346,6 +353,13 @@ typedef union {
   int i[VEC_SIZE];
 } vector;
 
+/* Define the mask type depending on the instruction set used. */
+#ifdef HAVE_AVX512_F
+typedef __mmask16 mask_t;
+#else
+typedef VEC_FLOAT mask_t;
+#endif
+
 /**
  * @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a
  * Newton iteration to obtain the correct level of accuracy.
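Note on the new vector.h helpers: vec_mask_add()/vec_mask_sub() accumulate an operand
only on lanes selected by a mask_t, via _mm512_mask_add_ps/_mm512_mask_sub_ps on
AVX-512 and a bitwise AND of the operand with the mask on AVX. A minimal usage sketch
under those assumptions; masked_accumulate_sketch is an illustrative name, not a SWIFT
function:

  #include "vector.h"

  /* Adds contrib into sum only on lanes where 0 < r2 < hig2. */
  static void masked_accumulate_sketch(vector *sum, vector contrib, vector v_r2,
                                       vector v_hig2) {

    /* Build the lane masks exactly as in runner_doself1_density_vec(). */
    mask_t v_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
    mask_t v_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
    mask_t mask = vec_mask_and(v_mask, v_mask_check);

    /* Masked accumulation: lanes outside the mask keep their current value. */
    sum->v = vec_mask_add(sum->v, contrib.v, mask);
  }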