Commit b2a5ba65 authored by James Willis's avatar James Willis
Browse files

Tidy masking up so that AVX-512 logic is hidden from user and occurs in vector.h.

parent aa9064bb
...@@ -176,7 +176,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( ...@@ -176,7 +176,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
* @param v_viz #vector of z velocity of pi. * @param v_viz #vector of z velocity of pi.
*/ */
__attribute__((always_inline)) INLINE static void storeInteractions( __attribute__((always_inline)) INLINE static void storeInteractions(
const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, const short mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
vector *v_dz, const struct cache *const cell_cache, struct c2_cache *const int_cache, vector *v_dz, const struct cache *const cell_cache, struct c2_cache *const int_cache,
int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum,
vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,
...@@ -624,7 +624,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( ...@@ -624,7 +624,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
#ifdef WITH_VECTORIZATION #ifdef WITH_VECTORIZATION
const struct engine *e = r->e; const struct engine *e = r->e;
int doi_mask;
struct part *restrict pi; struct part *restrict pi;
int count_align; int count_align;
int num_vec_proc = NUM_VEC_PROC; int num_vec_proc = NUM_VEC_PROC;
...@@ -749,36 +748,26 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( ...@@ -749,36 +748,26 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v); v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);
/* Form a mask from r2 < hig2 and r2 > 0.*/ /* Form a mask from r2 < hig2 and r2 > 0.*/
#ifdef HAVE_AVX512_F mask_t v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check;
// KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check; short doi_mask, doi_mask2;
KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check;
doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
doi_mask = doi_mask & doi_mask_check;
doi_mask2 = doi_mask2 & doi_mask2_check;
#else
vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check;
int doi_mask2;
/* Form r2 > 0 mask and r2 < hig2 mask. */ /* Form r2 > 0 mask and r2 < hig2 mask. */
v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero()); v_doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); v_doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
/* Form r2 > 0 mask and r2 < hig2 mask. */ /* Form r2 > 0 mask and r2 < hig2 mask. */
v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero()); v_doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); v_doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
/* Combine two masks and form integer mask. */ /* Combine the two masks. */
doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v)); mask_t doi_mask_combi, doi_mask2_combi;
doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v)); doi_mask_combi = vec_mask_and(v_doi_mask, v_doi_mask_check);
#endif /* HAVE_AVX512_F */ doi_mask2_combi = vec_mask_and(v_doi_mask2, v_doi_mask2_check);
/* Form integer mask. */
doi_mask = vec_cmp_result(doi_mask_combi);
doi_mask2 = vec_cmp_result(doi_mask2_combi);
/* If there are any interactions left pack interaction values into c2 /* If there are any interactions left pack interaction values into c2
* cache. */ * cache. */
......
...@@ -60,7 +60,9 @@ ...@@ -60,7 +60,9 @@
#define vec_dbl_set(a, b, c, d, e, f, g, h) \ #define vec_dbl_set(a, b, c, d, e, f, g, h) \
_mm512_set_pd(h, g, f, e, d, c, b, a) _mm512_set_pd(h, g, f, e, d, c, b, a)
#define vec_add(a, b) _mm512_add_ps(a, b) #define vec_add(a, b) _mm512_add_ps(a, b)
/* Masked add: lanes where `mask` is set become a + b, all other lanes keep a.
 * AVX-512 intrinsic signature is _mm512_mask_add_ps(src, k, x, y), hence the
 * argument reordering below. */
#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a)
#define vec_sub(a, b) _mm512_sub_ps(a, b) #define vec_sub(a, b) _mm512_sub_ps(a, b)
/* Masked subtract: lanes where `mask` is set become a - b, all other lanes
 * keep a (src operand of the masked intrinsic is a). */
#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b)
#define vec_mul(a, b) _mm512_mul_ps(a, b) #define vec_mul(a, b) _mm512_mul_ps(a, b)
#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c) #define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
#define vec_sqrt(a) _mm512_sqrt_ps(a) #define vec_sqrt(a) _mm512_sqrt_ps(a)
...@@ -75,7 +77,9 @@ ...@@ -75,7 +77,9 @@
#define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ) #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
#define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ) #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
#define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ) #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
/* On AVX-512 the compare intrinsics already return an integer opmask
 * (__mmask16), so forming the integer result is the identity. The expansion
 * is parenthesised so it stays a single expression when used inside a larger
 * one (e.g. `vec_cmp_result(m) & x`). */
#define vec_cmp_result(a) (a)
#define vec_and(a, b) _mm512_and_ps(a, b) #define vec_and(a, b) _mm512_and_ps(a, b)
/* Combine two AVX-512 opmasks with a bitwise AND. Arguments and the whole
 * expansion are parenthesised: the unparenthesised form `a & b` mis-binds
 * when callers pass compound expressions (e.g. `vec_mask_and(x | y, z)`
 * would expand to `x | y & z` because `&` binds tighter than `|`). */
#define vec_mask_and(a, b) ((a) & (b))
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1)) #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1) #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
...@@ -142,7 +146,9 @@ ...@@ -142,7 +146,9 @@
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a) #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a) #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
#define vec_add(a, b) _mm256_add_ps(a, b) #define vec_add(a, b) _mm256_add_ps(a, b)
/* Masked add, AVX emulation: assumes `mask` lanes are all-ones (true) or
 * all-zeros (false), as produced by vec_cmp_*; `b & mask` zeroes the
 * inactive lanes so only masked lanes contribute to the sum. */
#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b,mask))
#define vec_sub(a, b) _mm256_sub_ps(a, b) #define vec_sub(a, b) _mm256_sub_ps(a, b)
/* Masked subtract, AVX emulation: zeroes the inactive lanes of `b` via the
 * all-ones/all-zeros `mask` so only masked lanes are subtracted from a. */
#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b,mask))
#define vec_mul(a, b) _mm256_mul_ps(a, b) #define vec_mul(a, b) _mm256_mul_ps(a, b)
#define vec_sqrt(a) _mm256_sqrt_ps(a) #define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a) #define vec_rcp(a) _mm256_rcp_ps(a)
...@@ -158,6 +164,7 @@ ...@@ -158,6 +164,7 @@
#define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) #define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#define vec_cmp_result(a) _mm256_movemask_ps(a) #define vec_cmp_result(a) _mm256_movemask_ps(a)
#define vec_and(a, b) _mm256_and_ps(a, b) #define vec_and(a, b) _mm256_and_ps(a, b)
/* On AVX the mask type is a full-width float vector, so combining two masks
 * is a plain bitwise vector AND. */
#define vec_mask_and(a, b) _mm256_and_ps(a, b)
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0)) #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1)) #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1) #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
...@@ -346,6 +353,13 @@ typedef union { ...@@ -346,6 +353,13 @@ typedef union {
int i[VEC_SIZE]; int i[VEC_SIZE];
} vector; } vector;
/* Define the mask type depending on the instruction set used. */
#ifdef HAVE_AVX512_F
/* AVX-512 compares write to a dedicated 16-bit opmask register type. */
typedef __mmask16 mask_t;
#else
/* Older ISAs represent a mask as a full-width float vector whose lanes are
 * all-ones (true) or all-zeros (false), as returned by the vec_cmp_* ops. */
typedef VEC_FLOAT mask_t;
#endif
/** /**
* @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a * @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a
* Newton iteration to obtain the correct level of accuracy. * Newton iteration to obtain the correct level of accuracy.
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment