Commit 5f3bfc81 authored by James Willis
Browse files

Use intrinsics for vector operations as arithmetic overloading is not...

Use intrinsics for vector operations, as arithmetic operator overloading is not yet supported by the Intel compiler for AVX-512 vectors.

Conflicts:
	src/hydro/Gadget2/hydro_iact.h
parent d187ae15
...@@ -567,88 +567,38 @@ runner_iact_nonsym_2_vec_density( ...@@ -567,88 +567,38 @@ runner_iact_nonsym_2_vec_density(
curlvrz.v = vec_mul(curlvrz.v, ri.v); curlvrz.v = vec_mul(curlvrz.v, ri.v);
curlvrz2.v = vec_mul(curlvrz2.v, ri2.v); curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);
/* Mask updates to intermediate vector sums for particle pi. */ rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
#ifdef HAVE_AVX512_F rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
rhoSum->v = rho_dhSum->v = vec_mask_sub(
_mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v); rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
rhoSum->v = vec_mul(xi.v, wi_dx.v))),
_mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v); mask);
rho_dhSum->v = vec_mask_sub(
rho_dhSum->v = rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
_mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, vec_mul(xi2.v, wi_dx2.v))),
vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, mask2);
vec_mul(ui.v, wi_dx.v)))); wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
rho_dhSum->v = wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
_mm512_mask_sub_ps(rho_dhSum->v, knlMask2, rho_dhSum->v, wcount_dhSum->v =
vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
vec_mul(ui2.v, wi_dx2.v)))); wcount_dhSum->v =
vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2);
wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v); div_vSum->v = vec_mask_sub(div_vSum->v,
wcountSum->v = vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
_mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v); div_vSum->v = vec_mask_sub(
div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
wcount_dhSum->v = _mm512_mask_sub_ps( curlvxSum->v = vec_mask_add(
rho_dhSum->v, knlMask, rho_dhSum->v, curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v))); curlvxSum->v = vec_mask_add(
curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
wcount_dhSum->v = _mm512_mask_sub_ps( curlvySum->v = vec_mask_add(
rho_dhSum->v, knlMask2, rho_dhSum->v, curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v))); curlvySum->v = vec_mask_add(
curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, curlvzSum->v = vec_mask_add(
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v, curlvzSum->v = vec_mask_add(
vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
curlvxSum->v);
curlvxSum->v = _mm512_mask_add_ps(
curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)),
curlvxSum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
curlvySum->v);
curlvySum->v = _mm512_mask_add_ps(
curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)),
curlvySum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
curlvzSum->v);
curlvzSum->v = _mm512_mask_add_ps(
curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)),
curlvzSum->v);
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(ui.v, wi_dx.v))),
mask.v);
rho_dhSum->v -=
vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(ui2.v, wi_dx2.v))),
mask2.v);
wcountSum->v += vec_and(wi.v, mask.v);
wcountSum->v += vec_and(wi2.v, mask2.v);
wcount_dhSum->v -= vec_and(
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v);
wcount_dhSum->v -= vec_and(
vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)),
mask2.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v);
curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
curlvxSum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v);
curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
curlvySum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v);
curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
curlvzSum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v);
#endif
} }
#endif #endif
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment