Commit 3602dceb authored by James Willis
Browse files

Use generic masks and remove old vectorisation from AVX-512 compilation.

parent 9b154ac9
......@@ -388,7 +388,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum,
vector *curlvySum, vector *curlvzSum,
vector mask, int knlMask) {
mask_t mask) {
vector r, ri, ui, wi, wi_dx;
vector mj;
......@@ -433,48 +433,21 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
curlvrz.v = vec_mul(curlvrz.v, ri.v);
/* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F
rhoSum->v =
_mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
rho_dhSum->v =
_mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(ui.v, wi_dx.v))));
wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
wcount_dhSum->v = _mm512_mask_sub_ps(
rho_dhSum->v, knlMask, rho_dhSum->v,
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)));
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
curlvxSum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
curlvySum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
curlvzSum->v);
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(ui.v, wi_dx.v))),
mask.v);
wcountSum->v += vec_and(wi.v, mask.v);
wcount_dhSum->v -= vec_and(
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
#endif
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
rho_dhSum->v = vec_mask_sub(
rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(ui.v, wi_dx.v))),
mask);
wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(ui.v, wi_dx.v), mask);
div_vSum->v =
vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
curlvxSum->v = vec_mask_add(curlvxSum->v,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvySum->v = vec_mask_add(curlvySum->v,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(curlvzSum->v,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
}
/**
......@@ -487,7 +460,7 @@ runner_iact_nonsym_2_vec_density(
vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
vector mask, vector mask2, int knlMask, int knlMask2) {
mask_t mask, mask_t mask2, short mask_cond) {
vector r, ri, r2, ui, wi, wi_dx;
vector mj;
......@@ -567,38 +540,61 @@ runner_iact_nonsym_2_vec_density(
curlvrz.v = vec_mul(curlvrz.v, ri.v);
curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
rho_dhSum->v = vec_mask_sub(
rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))),
mask);
rho_dhSum->v = vec_mask_sub(
rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(xi2.v, wi_dx2.v))),
mask2);
wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
wcount_dhSum->v =
vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
wcount_dhSum->v =
vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2);
div_vSum->v = vec_mask_sub(div_vSum->v,
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
div_vSum->v = vec_mask_sub(
div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
curlvxSum->v = vec_mask_add(
curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvxSum->v = vec_mask_add(
curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
curlvySum->v = vec_mask_add(
curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvySum->v = vec_mask_add(
curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
curlvzSum->v = vec_mask_add(
curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(
curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
/* Mask updates to intermediate vector sums for particle pi. */
/* Mask only when needed. */
if (mask_cond) {
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
rho_dhSum->v = vec_mask_sub(
rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(ui.v, wi_dx.v))),
mask);
rho_dhSum->v = vec_mask_sub(
rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(ui2.v, wi_dx2.v))),
mask2);
wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
wcount_dhSum->v =
vec_mask_sub(wcount_dhSum->v, vec_mul(ui.v, wi_dx.v), mask);
wcount_dhSum->v =
vec_mask_sub(wcount_dhSum->v, vec_mul(ui2.v, wi_dx2.v), mask2);
div_vSum->v = vec_mask_sub(div_vSum->v,
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
div_vSum->v = vec_mask_sub(
div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
curlvxSum->v = vec_mask_add(
curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvxSum->v = vec_mask_add(
curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
curlvySum->v = vec_mask_add(
curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvySum->v = vec_mask_add(
curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
curlvzSum->v = vec_mask_add(
curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(
curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
} else {
rhoSum->v = vec_add(rhoSum->v, vec_mul(mj.v, wi.v));
rhoSum->v = vec_add(rhoSum->v, vec_mul(mj2.v, wi2.v));
rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(
mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v))));
rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(ui2.v, wi_dx2.v))));
wcountSum->v = vec_add(wcountSum->v, wi.v);
wcountSum->v = vec_add(wcountSum->v, wi2.v);
wcount_dhSum->v = vec_sub(wcount_dhSum->v, vec_mul(ui.v, wi_dx.v));
wcount_dhSum->v = vec_sub(wcount_dhSum->v, vec_mul(ui2.v, wi_dx2.v));
div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)));
curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)));
curlvySum->v = vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)));
curlvySum->v = vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)));
curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)));
curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)));
}
}
#endif
......@@ -705,7 +701,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
struct part **pj) {
#ifdef WITH_VECTORIZATION
#ifdef WITH_OLD_VECTORIZATION
vector r, r2, ri;
vector xi, xj;
......@@ -987,7 +983,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
struct part **pj) {
#ifdef WITH_VECTORIZATION
#ifdef WITH_OLD_VECTORIZATION
vector r, r2, ri;
vector xi, xj;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment