diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index c713c5911f65d552c7d32468d0773db244bf9404..80334683931bed8b0bf005fa4d52b2c72638840f 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -388,7 +388,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector mask, int knlMask) { + mask_t mask) { vector r, ri, xi, wi, wi_dx; vector mj; @@ -462,16 +462,15 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), curlvzSum->v); #else - rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v); - rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v))), - mask.v); - wcountSum->v += vec_and(wi.v, mask.v); - wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v); - div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v); - curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v); - curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v); - curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v); + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); + rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, + vec_mul(xi.v, wi_dx.v))), mask); + wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask); + div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); + curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); + curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); + curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); #endif } @@ -485,7 +484,7 @@ runner_iact_nonsym_2_vec_density( vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector mask, vector mask2, int knlMask, int knlMask2) { + mask_t mask, mask_t mask2) { vector r, ri, r2, xi, wi, wi_dx; vector mj; @@ -616,30 +615,24 @@ runner_iact_nonsym_2_vec_density( curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), curlvzSum->v); #else - rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v); - rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v); - rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v))), - mask.v); - rho_dhSum->v -= - vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, - vec_mul(xi2.v, wi_dx2.v))), - mask2.v); - wcountSum->v += vec_and(wi.v, mask.v); - wcountSum->v += vec_and(wi2.v, mask2.v); - wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v); - wcount_dhSum->v -= vec_and(vec_mul(xi2.v, wi_dx2.v), mask2.v); - div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v); - div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v); - curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v); - curlvxSum->v += - vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v); - curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v); - curlvySum->v += - vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v); - curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v); - curlvzSum->v += - vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v); + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2); + rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, + vec_mul(xi.v, wi_dx.v))), mask); + rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, + vec_mul(xi2.v, wi_dx2.v))), mask2); + wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); + wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2); + div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); + div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2); + curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); + curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2); + curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); + curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2); + curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); + curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2); #endif } #endif diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 07c8fa4dc31c57c8e3ce0d2023f51f46c70f3431..d8fdbc8f628ec9c437d887320109b327f954683a 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -66,10 +66,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz, int *icount_align) { -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask, knl_mask2; -#endif - vector int_mask, int_mask2; + mask_t int_mask, int_mask2; /* Work out the number of remainder interactions and pad secondary cache. */ *icount_align = icount; @@ -78,16 +75,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( int pad = (NUM_VEC_PROC * VEC_SIZE) - rem; *icount_align += pad; -/* Initialise masks to true. */ -#ifdef HAVE_AVX512_F - knl_mask = 0xFFFF; - knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#endif + /* Initialise masks to true. */ + vec_init_mask(int_mask); + vec_init_mask(int_mask2); + /* Pad secondary cache so that there are no contributions in the interaction * function. */ for (int i = icount; i < *icount_align; i++) { @@ -103,19 +94,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( /* Zero parts of mask that represent the padded values.*/ if (pad < VEC_SIZE) { -#ifdef HAVE_AVX512_F - knl_mask2 = knl_mask2 >> pad; -#else - for (int i = VEC_SIZE - pad; i < VEC_SIZE; i++) int_mask2.i[i] = 0; -#endif + vec_pad_mask(int_mask2,pad); } else { -#ifdef HAVE_AVX512_F - knl_mask = knl_mask >> (VEC_SIZE - rem); - knl_mask2 = 0; -#else - for (int i = rem; i < VEC_SIZE; i++) int_mask.i[i] = 0; - int_mask2.v = vec_setzero(); -#endif + vec_pad_mask(int_mask,VEC_SIZE - rem); + vec_zero_mask(int_mask2); } /* Perform remainder interaction and remove remainder from aligned @@ -127,13 +109,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align], &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum, - wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, - int_mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); -#endif + wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2); } } @@ -244,9 +220,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions( wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, &icount_align); - vector int_mask, int_mask2; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); + mask_t int_mask, int_mask2; + vec_init_mask(int_mask); + vec_init_mask(int_mask2); /* Perform interactions. */ for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { @@ -255,7 +231,7 @@ __attribute__((always_inline)) INLINE static void storeInteractions( &int_cache->dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[pjd], &int_cache->vyq[pjd], &int_cache->vzq[pjd], &int_cache->mq[pjd], rhoSum, rho_dhSum, wcountSum, wcount_dhSum, - div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0xFFFF, 0xFFFF); + div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2); } /* Reset interaction count. */ @@ -750,24 +726,19 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( /* Form a mask from r2 < hig2 and r2 > 0.*/ mask_t v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check; - short doi_mask, doi_mask2; + int doi_mask, doi_mask2; /* Form r2 > 0 mask and r2 < hig2 mask. */ - v_doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero()); - v_doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v); + vec_create_mask(v_doi_mask_check, vec_cmp_gt(v_r2.v, vec_setzero())); + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v)); /* Form r2 > 0 mask and r2 < hig2 mask. */ - v_doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero()); - v_doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v); - - /* Combine the two masks. */ - mask_t doi_mask_combi, doi_mask2_combi; - doi_mask_combi = vec_mask_and(v_doi_mask, v_doi_mask_check); - doi_mask2_combi = vec_mask_and(v_doi_mask2, v_doi_mask2_check); + vec_create_mask(v_doi_mask2_check, vec_cmp_gt(v_r2_2.v, vec_setzero())); + vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v)); - /* Form integer mask. */ - doi_mask = vec_cmp_result(doi_mask_combi); - doi_mask2 = vec_cmp_result(doi_mask2_combi); + /* Combine the two masks and form an integer mask. */ + doi_mask = vec_cmp_result(vec_mask_and(v_doi_mask, v_doi_mask_check)); + doi_mask2 = vec_cmp_result(vec_mask_and(v_doi_mask2, v_doi_mask2_check)); /* If there are any interactions left pack interaction values into c2 * cache. */ @@ -795,16 +766,9 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( /* Initialise masks to true in case remainder interactions have been * performed. */ - vector int_mask, int_mask2; -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask = 0xFFFF; - KNL_MASK_16 knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#endif + mask_t int_mask, int_mask2; + vec_init_mask(int_mask); + vec_init_mask(int_mask2); /* Perform interaction with 2 vectors. */ for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) { @@ -813,12 +777,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); -#endif + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2); } /* Perform horizontal adds on vector sums and store result in particle pi. @@ -1320,7 +1279,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - vector v_doi_mask; + mask_t v_doi_mask; int doi_mask; /* Form r2 < hig2 mask. */ @@ -1336,12 +1295,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doi_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif + &curlvySum, &curlvzSum, v_doi_mask); } /* loop over the parts in cj. */ @@ -1452,7 +1406,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - vector v_doj_mask; + mask_t v_doj_mask; int doj_mask; /* Form r2 < hig2 mask. */ @@ -1468,12 +1422,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doj_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif + &curlvySum, &curlvzSum, v_doj_mask); } /* loop over the parts in ci. */ diff --git a/src/vector.h b/src/vector.h index 6f235c0549a2557e72f8b186af5beff80bd7200f..8a48f31a2ddc870f0aaa7dd6bcceba6663ecd686 100644 --- a/src/vector.h +++ b/src/vector.h @@ -80,6 +80,10 @@ #define vec_cmp_result(a) a #define vec_and(a, b) _mm512_and_ps(a, b) #define vec_mask_and(a, b) a & b +#define vec_init_mask(mask) mask = 0xFFFF +#define vec_zero_mask(mask) mask = 0 +#define vec_create_mask(mask, cond) mask = cond +#define vec_pad_mask(mask,pad) mask = mask >> (pad) #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1) @@ -146,9 +150,9 @@ #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a) #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a) #define vec_add(a, b) _mm256_add_ps(a, b) -#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b,mask)) +#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b,mask.v)) #define vec_sub(a, b) _mm256_sub_ps(a, b) -#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b,mask)) +#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b,mask.v)) #define vec_mul(a, b) _mm256_mul_ps(a, b) #define vec_sqrt(a) _mm256_sqrt_ps(a) #define vec_rcp(a) _mm256_rcp_ps(a) @@ -164,7 +168,11 @@ #define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) #define vec_cmp_result(a) _mm256_movemask_ps(a) #define vec_and(a, b) _mm256_and_ps(a, b) -#define vec_mask_and(a, b) _mm256_and_ps(a, b) +#define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v) +#define vec_init_mask(mask) mask.m = vec_setint1(0xFFFFFFFF) +#define vec_create_mask(mask, cond) mask.v = cond +#define vec_zero_mask(mask) mask.v = vec_setzero() +#define vec_pad_mask(mask,pad) for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0 #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1) @@ -357,7 +365,7 @@ typedef union { #ifdef HAVE_AVX512_F typedef __mmask16 mask_t; #else -typedef VEC_FLOAT mask_t; +typedef vector mask_t; #endif /** diff --git a/tests/benchmarkInteractions.c b/tests/benchmarkInteractions.c index 0f5b3d2eb294c13e3035885b511c702e6f0cd540..3ea5bfaaa5c6a0f7bdf8cfe5c685981f1da8cb34 100644 --- a/tests/benchmarkInteractions.c +++ b/tests/benchmarkInteractions.c @@ -368,7 +368,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, /* Perform vector interaction. */ #ifdef WITH_VECTORIZATION - vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec, mask, mask2; + vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec; vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum; @@ -387,14 +387,10 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, viz_vec.v = vec_load(&vizq[0]); hi_inv_vec = vec_reciprocal(hi_vec); - mask.m = vec_setint1(0xFFFFFFFF); - mask2.m = vec_setint1(0xFFFFFFFF); -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask, knl_mask2; - knl_mask = 0xFFFF; - knl_mask2 = 0xFFFF; -#endif + mask_vec_t mask, mask2; + vec_init_mask(mask); + vec_init_mask(mask2); const ticks vec_tic = getticks(); @@ -404,12 +400,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, (vix_vec), (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, - mask, mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); -#endif + mask, mask2); } VEC_HADD(rhoSum, piq[0]->rho);