diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index ddebc92383ecc480caaf00600b4fcda32c027b14..43af05ac97756f4dcf26625a5ec2528e7c1e26c8 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -388,7 +388,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector mask, int knlMask) { + mask_t mask) { vector r, ri, ui, wi, wi_dx; vector mj; @@ -433,48 +433,21 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, curlvrz.v = vec_mul(curlvrz.v, ri.v); /* Mask updates to intermediate vector sums for particle pi. */ -#ifdef HAVE_AVX512_F - rhoSum->v = - _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v); - - rho_dhSum->v = - _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, - vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v)))); - - wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v); - - wcount_dhSum->v = _mm512_mask_sub_ps( - rho_dhSum->v, knlMask, rho_dhSum->v, - vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v))); - - div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, - vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); - - curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), - curlvxSum->v); - - curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), - curlvySum->v); - - curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), - curlvzSum->v); -#else - rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v); - rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v))), - mask.v); - wcountSum->v += vec_and(wi.v, mask.v); - wcount_dhSum->v -= vec_and( - vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v); - div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v); - curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v); - curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v); - curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v); -#endif + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); + rho_dhSum->v = vec_mask_sub( + rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, + vec_mul(ui.v, wi_dx.v))), + mask); + wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(ui.v, wi_dx.v), mask); + div_vSum->v = + vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); + curlvxSum->v = vec_mask_add(curlvxSum->v, + vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); + curlvySum->v = vec_mask_add(curlvySum->v, + vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); + curlvzSum->v = vec_mask_add(curlvzSum->v, + vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); } /** @@ -487,7 +460,7 @@ runner_iact_nonsym_2_vec_density( vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector mask, vector mask2, int knlMask, int knlMask2) { + mask_t mask, mask_t mask2, short mask_cond) { vector r, ri, r2, ui, wi, wi_dx; vector mj; @@ -567,38 +540,61 @@ runner_iact_nonsym_2_vec_density( curlvrz.v = vec_mul(curlvrz.v, ri.v); curlvrz2.v = vec_mul(curlvrz2.v, ri2.v); - rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); - rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2); - rho_dhSum->v = vec_mask_sub( - rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v))), - mask); - rho_dhSum->v = vec_mask_sub( - rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, - vec_mul(xi2.v, wi_dx2.v))), - mask2); - wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); - wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2); - wcount_dhSum->v = - vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask); - wcount_dhSum->v = - vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2); - div_vSum->v = vec_mask_sub(div_vSum->v, - vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); - div_vSum->v = vec_mask_sub( - div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2); - curlvxSum->v = vec_mask_add( - curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); - curlvxSum->v = vec_mask_add( - curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2); - curlvySum->v = vec_mask_add( - curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); - curlvySum->v = vec_mask_add( - curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2); - curlvzSum->v = vec_mask_add( - curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); - curlvzSum->v = vec_mask_add( - curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2); + /* Mask updates to intermediate vector sums for particle pi. */ + /* Mask only when needed. */ + if (mask_cond) { + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); + rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2); + rho_dhSum->v = vec_mask_sub( + rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, + vec_mul(ui.v, wi_dx.v))), + mask); + rho_dhSum->v = vec_mask_sub( + rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, + vec_mul(ui2.v, wi_dx2.v))), + mask2); + wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); + wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2); + wcount_dhSum->v = + vec_mask_sub(wcount_dhSum->v, vec_mul(ui.v, wi_dx.v), mask); + wcount_dhSum->v = + vec_mask_sub(wcount_dhSum->v, vec_mul(ui2.v, wi_dx2.v), mask2); + div_vSum->v = vec_mask_sub(div_vSum->v, + vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); + div_vSum->v = vec_mask_sub( + div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2); + curlvxSum->v = vec_mask_add( + curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); + curlvxSum->v = vec_mask_add( + curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2); + curlvySum->v = vec_mask_add( + curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); + curlvySum->v = vec_mask_add( + curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2); + curlvzSum->v = vec_mask_add( + curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); + curlvzSum->v = vec_mask_add( + curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2); + } else { + rhoSum->v = vec_add(rhoSum->v, vec_mul(mj.v, wi.v)); + rhoSum->v = vec_add(rhoSum->v, vec_mul(mj2.v, wi2.v)); + rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul( + mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)))); + rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, + vec_mul(ui2.v, wi_dx2.v)))); + wcountSum->v = vec_add(wcountSum->v, wi.v); + wcountSum->v = vec_add(wcountSum->v, wi2.v); + wcount_dhSum->v = vec_sub(wcount_dhSum->v, vec_mul(ui.v, wi_dx.v)); + wcount_dhSum->v = vec_sub(wcount_dhSum->v, vec_mul(ui2.v, wi_dx2.v)); + div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); + div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); + curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v))); + curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v))); + curlvySum->v = vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v))); + curlvySum->v = vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v))); + curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v))); + curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v))); + } } #endif @@ -705,7 +701,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef WITH_VECTORIZATION +#ifdef WITH_OLD_VECTORIZATION vector r, r2, ri; vector xi, xj; @@ -987,7 +983,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef WITH_VECTORIZATION +#ifdef WITH_OLD_VECTORIZATION vector r, r2, ri; vector xi, xj;