Commit 3602dceb authored by James Willis
Browse files

Use generic masks and remove old vectorisation from AVX-512 compilation.

parent 9b154ac9
...@@ -388,7 +388,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, ...@@ -388,7 +388,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
vector *wcountSum, vector *wcount_dhSum, vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum, vector *div_vSum, vector *curlvxSum,
vector *curlvySum, vector *curlvzSum, vector *curlvySum, vector *curlvzSum,
vector mask, int knlMask) { mask_t mask) {
vector r, ri, ui, wi, wi_dx; vector r, ri, ui, wi, wi_dx;
vector mj; vector mj;
...@@ -433,48 +433,21 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, ...@@ -433,48 +433,21 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
curlvrz.v = vec_mul(curlvrz.v, ri.v); curlvrz.v = vec_mul(curlvrz.v, ri.v);
/* Mask updates to intermediate vector sums for particle pi. */ /* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
rhoSum->v = rho_dhSum->v = vec_mask_sub(
_mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v); rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(ui.v, wi_dx.v))),
rho_dhSum->v = mask);
_mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(ui.v, wi_dx.v), mask);
vec_mul(ui.v, wi_dx.v)))); div_vSum->v =
vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v); curlvxSum->v = vec_mask_add(curlvxSum->v,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
wcount_dhSum->v = _mm512_mask_sub_ps( curlvySum->v = vec_mask_add(curlvySum->v,
rho_dhSum->v, knlMask, rho_dhSum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v))); curlvzSum->v = vec_mask_add(curlvzSum->v,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
curlvxSum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
curlvySum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
curlvzSum->v);
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(ui.v, wi_dx.v))),
mask.v);
wcountSum->v += vec_and(wi.v, mask.v);
wcount_dhSum->v -= vec_and(
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
#endif
} }
/** /**
...@@ -487,7 +460,7 @@ runner_iact_nonsym_2_vec_density( ...@@ -487,7 +460,7 @@ runner_iact_nonsym_2_vec_density(
vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
vector mask, vector mask2, int knlMask, int knlMask2) { mask_t mask, mask_t mask2, short mask_cond) {
vector r, ri, r2, ui, wi, wi_dx; vector r, ri, r2, ui, wi, wi_dx;
vector mj; vector mj;
...@@ -567,38 +540,61 @@ runner_iact_nonsym_2_vec_density( ...@@ -567,38 +540,61 @@ runner_iact_nonsym_2_vec_density(
curlvrz.v = vec_mul(curlvrz.v, ri.v); curlvrz.v = vec_mul(curlvrz.v, ri.v);
curlvrz2.v = vec_mul(curlvrz2.v, ri2.v); curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); /* Mask updates to intermediate vector sums for particle pi. */
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2); /* Mask only when needed. */
rho_dhSum->v = vec_mask_sub( if (mask_cond) {
rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
vec_mul(xi.v, wi_dx.v))), rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
mask); rho_dhSum->v = vec_mask_sub(
rho_dhSum->v = vec_mask_sub( rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui.v, wi_dx.v))),
vec_mul(xi2.v, wi_dx2.v))), mask);
mask2); rho_dhSum->v = vec_mask_sub(
wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2); vec_mul(ui2.v, wi_dx2.v))),
wcount_dhSum->v = mask2);
vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask); wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
wcount_dhSum->v = wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2); wcount_dhSum->v =
div_vSum->v = vec_mask_sub(div_vSum->v, vec_mask_sub(wcount_dhSum->v, vec_mul(ui.v, wi_dx.v), mask);
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); wcount_dhSum->v =
div_vSum->v = vec_mask_sub( vec_mask_sub(wcount_dhSum->v, vec_mul(ui2.v, wi_dx2.v), mask2);
div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2); div_vSum->v = vec_mask_sub(div_vSum->v,
curlvxSum->v = vec_mask_add( vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); div_vSum->v = vec_mask_sub(
curlvxSum->v = vec_mask_add( div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2); curlvxSum->v = vec_mask_add(
curlvySum->v = vec_mask_add( curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); curlvxSum->v = vec_mask_add(
curlvySum->v = vec_mask_add( curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2); curlvySum->v = vec_mask_add(
curlvzSum->v = vec_mask_add( curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); curlvySum->v = vec_mask_add(
curlvzSum->v = vec_mask_add( curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2); curlvzSum->v = vec_mask_add(
curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(
curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
} else {
rhoSum->v = vec_add(rhoSum->v, vec_mul(mj.v, wi.v));
rhoSum->v = vec_add(rhoSum->v, vec_mul(mj2.v, wi2.v));
rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(
mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v))));
rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(ui2.v, wi_dx2.v))));
wcountSum->v = vec_add(wcountSum->v, wi.v);
wcountSum->v = vec_add(wcountSum->v, wi2.v);
wcount_dhSum->v = vec_sub(wcount_dhSum->v, vec_mul(ui.v, wi_dx.v));
wcount_dhSum->v = vec_sub(wcount_dhSum->v, vec_mul(ui2.v, wi_dx2.v));
div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)));
curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)));
curlvySum->v = vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)));
curlvySum->v = vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)));
curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)));
curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)));
}
} }
#endif #endif
...@@ -705,7 +701,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( ...@@ -705,7 +701,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
struct part **pj) { struct part **pj) {
#ifdef WITH_VECTORIZATION #ifdef WITH_OLD_VECTORIZATION
vector r, r2, ri; vector r, r2, ri;
vector xi, xj; vector xi, xj;
...@@ -987,7 +983,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( ...@@ -987,7 +983,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
struct part **pj) { struct part **pj) {
#ifdef WITH_VECTORIZATION #ifdef WITH_OLD_VECTORIZATION
vector r, r2, ri; vector r, r2, ri;
vector xi, xj; vector xi, xj;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment