Commit 3b6b4f56 authored by James Willis's avatar James Willis
Browse files

Created a typedef for a mask object to abstract the mask creation away from...

Created a typedef for a mask object to abstract the mask creation away from the user and placed it in vector.h.
parent b2a5ba65
......@@ -388,7 +388,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum,
vector *curlvySum, vector *curlvzSum,
vector mask, int knlMask) {
mask_t mask) {
vector r, ri, xi, wi, wi_dx;
vector mj;
......@@ -462,16 +462,15 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
curlvzSum->v);
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))),
mask.v);
wcountSum->v += vec_and(wi.v, mask.v);
wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))), mask);
wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
#endif
}
......@@ -485,7 +484,7 @@ runner_iact_nonsym_2_vec_density(
vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
vector mask, vector mask2, int knlMask, int knlMask2) {
mask_t mask, mask_t mask2) {
vector r, ri, r2, xi, wi, wi_dx;
vector mj;
......@@ -616,30 +615,24 @@ runner_iact_nonsym_2_vec_density(
curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)),
curlvzSum->v);
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))),
mask.v);
rho_dhSum->v -=
vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(xi2.v, wi_dx2.v))),
mask2.v);
wcountSum->v += vec_and(wi.v, mask.v);
wcountSum->v += vec_and(wi2.v, mask2.v);
wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
wcount_dhSum->v -= vec_and(vec_mul(xi2.v, wi_dx2.v), mask2.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v);
curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
curlvxSum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v);
curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
curlvySum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v);
curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
curlvzSum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v);
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))), mask);
rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(xi2.v, wi_dx2.v))), mask2);
wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2);
div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
#endif
}
#endif
......
......@@ -66,10 +66,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz,
int *icount_align) {
#ifdef HAVE_AVX512_F
KNL_MASK_16 knl_mask, knl_mask2;
#endif
vector int_mask, int_mask2;
mask_t int_mask, int_mask2;
/* Work out the number of remainder interactions and pad secondary cache. */
*icount_align = icount;
......@@ -78,16 +75,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
int pad = (NUM_VEC_PROC * VEC_SIZE) - rem;
*icount_align += pad;
/* Initialise masks to true. */
#ifdef HAVE_AVX512_F
knl_mask = 0xFFFF;
knl_mask2 = 0xFFFF;
int_mask.m = vec_setint1(0xFFFFFFFF);
int_mask2.m = vec_setint1(0xFFFFFFFF);
#else
int_mask.m = vec_setint1(0xFFFFFFFF);
int_mask2.m = vec_setint1(0xFFFFFFFF);
#endif
/* Initialise masks to true. */
vec_init_mask(int_mask);
vec_init_mask(int_mask2);
/* Pad secondary cache so that there are no contributions in the interaction
* function. */
for (int i = icount; i < *icount_align; i++) {
......@@ -103,19 +94,10 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
/* Zero parts of mask that represent the padded values.*/
if (pad < VEC_SIZE) {
#ifdef HAVE_AVX512_F
knl_mask2 = knl_mask2 >> pad;
#else
for (int i = VEC_SIZE - pad; i < VEC_SIZE; i++) int_mask2.i[i] = 0;
#endif
vec_pad_mask(int_mask2,pad);
} else {
#ifdef HAVE_AVX512_F
knl_mask = knl_mask >> (VEC_SIZE - rem);
knl_mask2 = 0;
#else
for (int i = rem; i < VEC_SIZE; i++) int_mask.i[i] = 0;
int_mask2.v = vec_setzero();
#endif
vec_pad_mask(int_mask,VEC_SIZE - rem);
vec_zero_mask(int_mask2);
}
/* Perform remainder interaction and remove remainder from aligned
......@@ -127,13 +109,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align],
&int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align],
&int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum,
wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask,
int_mask2,
#ifdef HAVE_AVX512_F
knl_mask, knl_mask2);
#else
0, 0);
#endif
wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2);
}
}
......@@ -244,9 +220,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum,
v_hi_inv, v_vix, v_viy, v_viz, &icount_align);
vector int_mask, int_mask2;
int_mask.m = vec_setint1(0xFFFFFFFF);
int_mask2.m = vec_setint1(0xFFFFFFFF);
mask_t int_mask, int_mask2;
vec_init_mask(int_mask);
vec_init_mask(int_mask2);
/* Perform interactions. */
for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
......@@ -255,7 +231,7 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
&int_cache->dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
&int_cache->vxq[pjd], &int_cache->vyq[pjd], &int_cache->vzq[pjd],
&int_cache->mq[pjd], rhoSum, rho_dhSum, wcountSum, wcount_dhSum,
div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0xFFFF, 0xFFFF);
div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2);
}
/* Reset interaction count. */
......@@ -750,24 +726,19 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
/* Form a mask from r2 < hig2 and r2 > 0.*/
mask_t v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check;
short doi_mask, doi_mask2;
int doi_mask, doi_mask2;
/* Form r2 > 0 mask and r2 < hig2 mask. */
v_doi_mask_check = vec_cmp_gt(v_r2.v, vec_setzero());
v_doi_mask = vec_cmp_lt(v_r2.v, v_hig2.v);
vec_create_mask(v_doi_mask_check, vec_cmp_gt(v_r2.v, vec_setzero()));
vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
/* Form r2 > 0 mask and r2 < hig2 mask. */
v_doi_mask2_check = vec_cmp_gt(v_r2_2.v, vec_setzero());
v_doi_mask2 = vec_cmp_lt(v_r2_2.v, v_hig2.v);
/* Combine the two masks. */
mask_t doi_mask_combi, doi_mask2_combi;
doi_mask_combi = vec_mask_and(v_doi_mask, v_doi_mask_check);
doi_mask2_combi = vec_mask_and(v_doi_mask2, v_doi_mask2_check);
vec_create_mask(v_doi_mask2_check, vec_cmp_gt(v_r2_2.v, vec_setzero()));
vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v));
/* Form integer mask. */
doi_mask = vec_cmp_result(doi_mask_combi);
doi_mask2 = vec_cmp_result(doi_mask2_combi);
/* Combine the two masks and form an integer mask. */
doi_mask = vec_cmp_result(vec_mask_and(v_doi_mask, v_doi_mask_check));
doi_mask2 = vec_cmp_result(vec_mask_and(v_doi_mask2, v_doi_mask2_check));
/* If there are any interactions left pack interaction values into c2
* cache. */
......@@ -795,16 +766,9 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
/* Initialise masks to true in case remainder interactions have been
* performed. */
vector int_mask, int_mask2;
#ifdef HAVE_AVX512_F
KNL_MASK_16 knl_mask = 0xFFFF;
KNL_MASK_16 knl_mask2 = 0xFFFF;
int_mask.m = vec_setint1(0xFFFFFFFF);
int_mask2.m = vec_setint1(0xFFFFFFFF);
#else
int_mask.m = vec_setint1(0xFFFFFFFF);
int_mask2.m = vec_setint1(0xFFFFFFFF);
#endif
mask_t int_mask, int_mask2;
vec_init_mask(int_mask);
vec_init_mask(int_mask2);
/* Perform interaction with 2 vectors. */
for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) {
......@@ -813,12 +777,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
&int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
&int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
&int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
&div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
#ifdef HAVE_AVX512_F
knl_mask, knl_mask2);
#else
0, 0);
#endif
&div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2);
}
/* Perform horizontal adds on vector sums and store result in particle pi.
......@@ -1320,7 +1279,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
vector v_doi_mask;
mask_t v_doi_mask;
int doi_mask;
/* Form r2 < hig2 mask. */
......@@ -1336,12 +1295,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
&cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
&cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum,
&rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
&curlvySum, &curlvzSum, v_doi_mask,
#ifdef HAVE_AVX512_F
knl_mask);
#else
0);
#endif
&curlvySum, &curlvzSum, v_doi_mask);
} /* loop over the parts in cj. */
......@@ -1452,7 +1406,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
vector v_doj_mask;
mask_t v_doj_mask;
int doj_mask;
/* Form r2 < hig2 mask. */
......@@ -1468,12 +1422,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
&ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx],
&ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum,
&rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
&curlvySum, &curlvzSum, v_doj_mask,
#ifdef HAVE_AVX512_F
knl_mask);
#else
0);
#endif
&curlvySum, &curlvzSum, v_doj_mask);
} /* loop over the parts in ci. */
......
......@@ -80,6 +80,10 @@
#define vec_cmp_result(a) a
#define vec_and(a, b) _mm512_and_ps(a, b)
#define vec_mask_and(a, b) a & b
#define vec_init_mask(mask) mask = 0xFFFF
#define vec_zero_mask(mask) mask = 0
#define vec_create_mask(mask, cond) mask = cond
#define vec_pad_mask(mask,pad) mask = mask >> (pad)
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
......@@ -146,9 +150,9 @@
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
#define vec_add(a, b) _mm256_add_ps(a, b)
#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b,mask))
#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b,mask.v))
#define vec_sub(a, b) _mm256_sub_ps(a, b)
#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b,mask))
#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b,mask.v))
#define vec_mul(a, b) _mm256_mul_ps(a, b)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
......@@ -164,7 +168,11 @@
#define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#define vec_cmp_result(a) _mm256_movemask_ps(a)
#define vec_and(a, b) _mm256_and_ps(a, b)
#define vec_mask_and(a, b) _mm256_and_ps(a, b)
#define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v)
#define vec_init_mask(mask) mask.m = vec_setint1(0xFFFFFFFF)
#define vec_create_mask(mask, cond) mask.v = cond
#define vec_zero_mask(mask) mask.v = vec_setzero()
#define vec_pad_mask(mask,pad) for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
......@@ -357,7 +365,7 @@ typedef union {
#ifdef HAVE_AVX512_F
typedef __mmask16 mask_t;
#else
typedef VEC_FLOAT mask_t;
typedef vector mask_t;
#endif
/**
......
......@@ -368,7 +368,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
/* Perform vector interaction. */
#ifdef WITH_VECTORIZATION
vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec, mask, mask2;
vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec;
vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
curlvySum, curlvzSum;
......@@ -387,14 +387,10 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
viz_vec.v = vec_load(&vizq[0]);
hi_inv_vec = vec_reciprocal(hi_vec);
mask.m = vec_setint1(0xFFFFFFFF);
mask2.m = vec_setint1(0xFFFFFFFF);
#ifdef HAVE_AVX512_F
KNL_MASK_16 knl_mask, knl_mask2;
knl_mask = 0xFFFF;
knl_mask2 = 0xFFFF;
#endif
mask_vec_t mask, mask2;
vec_init_mask(mask);
vec_init_mask(mask2);
const ticks vec_tic = getticks();
......@@ -404,12 +400,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
(vix_vec), (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]),
&(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, &wcountSum,
&wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum,
mask, mask2,
#ifdef HAVE_AVX512_F
knl_mask, knl_mask2);
#else
0, 0);
#endif
mask, mask2);
}
VEC_HADD(rhoSum, piq[0]->rho);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment