Tidy masking up so that AVX-512 logic is hidden from user and occurs in vector.h.

b2a5ba65 · James Willis · aa9064bb · b2a5ba65 · b2a5ba65
Commit b2a5ba65 authored 8 years ago by James Willis
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -176,7 +176,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
 * @param v_viz #vector of z velocity of pi.
 */
 __attribute__((always_inline)) INLINE static void storeInteractions(
-    const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
+    const short mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
    vector *v_dz, const struct cache *const cell_cache, struct c2_cache *const int_cache,
    int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum,
    vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,
@@ -624,7 +624,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(

 #ifdef WITH_VECTORIZATION
  const struct engine *e = r->e;
-  int doi_mask;
  struct part *restrict pi;
  int count_align;
  int num_vec_proc = NUM_VEC_PROC;
@@ -749,36 +748,26 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
      v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);

-/* Form a mask from r2 < hig2 and r2 > 0.*/
-#ifdef HAVE_AVX512_F
-      // KNL_MASK_16 doi_mask, doi_mask_check, doi_mask2, doi_mask2_check;
-      KNL_MASK_16 doi_mask_check, doi_mask2, doi_mask2_check;
-
-      doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
-      doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
-
-      doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
-      doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
-
-      doi_mask = doi_mask & doi_mask_check;
-      doi_mask2 = doi_mask2 & doi_mask2_check;
-
-#else
-      vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check;
-      int doi_mask2;
+      /* Form a mask from r2 < hig2 and r2 > 0.*/
+      mask_t v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check;
+      short doi_mask, doi_mask2;

      /* Form r2 > 0 mask and r2 < hig2 mask. */
-      v_doi_mask_check.v = vec_cmp_gt(v_r2.v, vec_setzero());
-      v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v);
+      v_doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
+      v_doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);

      /* Form r2 > 0 mask and r2 < hig2 mask. */
-      v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v, vec_setzero());
-      v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v);
+      v_doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
+      v_doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);

-      /* Combine two masks and form integer mask. */
-      doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v));
-      doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v));
-#endif /* HAVE_AVX512_F */
+      /* Combine the two masks. */
+      mask_t doi_mask_combi, doi_mask2_combi;
+      doi_mask_combi = vec_mask_and(v_doi_mask, v_doi_mask_check);
+      doi_mask2_combi = vec_mask_and(v_doi_mask2, v_doi_mask2_check);
+
+      /* Form integer mask. */
+      doi_mask = vec_cmp_result(doi_mask_combi);
+      doi_mask2 = vec_cmp_result(doi_mask2_combi);

      /* If there are any interactions left pack interaction values into c2
       * cache. */

--- a/src/vector.h
+++ b/src/vector.h
@@ -60,7 +60,9 @@
 #define vec_dbl_set(a, b, c, d, e, f, g, h) \
  _mm512_set_pd(h, g, f, e, d, c, b, a)
 #define vec_add(a, b) _mm512_add_ps(a, b)
+#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a) 
 #define vec_sub(a, b) _mm512_sub_ps(a, b)
+#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b) 
 #define vec_mul(a, b) _mm512_mul_ps(a, b)
 #define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
 #define vec_sqrt(a) _mm512_sqrt_ps(a)
@@ -75,7 +77,9 @@
 #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
 #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
 #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
+#define vec_cmp_result(a) a
 #define vec_and(a, b) _mm512_and_ps(a, b)
+#define vec_mask_and(a, b) a & b
 #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
@@ -142,7 +146,9 @@
 #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
 #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
 #define vec_add(a, b) _mm256_add_ps(a, b)
+#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b,mask)) 
 #define vec_sub(a, b) _mm256_sub_ps(a, b)
+#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b,mask)) 
 #define vec_mul(a, b) _mm256_mul_ps(a, b)
 #define vec_sqrt(a) _mm256_sqrt_ps(a)
 #define vec_rcp(a) _mm256_rcp_ps(a)
@@ -158,6 +164,7 @@
 #define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
 #define vec_cmp_result(a) _mm256_movemask_ps(a)
 #define vec_and(a, b) _mm256_and_ps(a, b)
+#define vec_mask_and(a, b) _mm256_and_ps(a, b)
 #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
 #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
 #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
@@ -346,6 +353,13 @@ typedef union {
  int i[VEC_SIZE];
 } vector;

+/* Define the mask type depending on the instruction set used. */
+#ifdef HAVE_AVX512_F
+typedef __mmask16 mask_t;
+#else
+typedef VEC_FLOAT mask_t;
+#endif
+
 /**
 * @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a
 * Newton iteration to obtain the correct level of accuracy.