diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index a8fc8889f985edf14c29fa340f768880e3efa764..96025cf60eef26d0bc970041756c11a45bcdfa64 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -433,13 +433,12 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, curlvrz.v = vec_mul(curlvrz.v, ri.v); vector scaleFactor; - scaleFactor.v = vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v)); + scaleFactor.v = + vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)); -/* Mask updates to intermediate vector sums for particle pi. */ + /* Mask updates to intermediate vector sums for particle pi. */ rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); - rho_dhSum->v = vec_mask_sub( - rho_dhSum->v, vec_mul(mj.v, scaleFactor.v), mask); + rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, scaleFactor.v), mask); wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, scaleFactor.v, mask); div_vSum->v = @@ -457,11 +456,13 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, * (non-symmetric vectorized version). */ __attribute__((always_inline)) INLINE static void -runner_iact_nonsym_2_vec_density( - float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix, - vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, - vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, - vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, +runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, + vector hi_inv, vector vix, vector viy, + vector viz, float *Vjx, float *Vjy, float *Vjz, + float *Mj, vector *rhoSum, vector *rho_dhSum, + vector *wcountSum, vector *wcount_dhSum, + vector *div_vSum, vector *curlvxSum, + vector *curlvySum, vector *curlvzSum, mask_t mask, mask_t mask2, short mask_cond) { vector r, ri, r2, ui, wi, wi_dx; @@ -543,20 +544,20 @@ runner_iact_nonsym_2_vec_density( curlvrz2.v = vec_mul(curlvrz2.v, ri2.v); vector scaleFactor, scaleFactor2; - scaleFactor.v = vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(ui.v, wi_dx.v)); - scaleFactor2.v = vec_fma(vec_set1(hydro_dimension), wi2.v, - vec_mul(ui2.v, wi_dx2.v)); + scaleFactor.v = + vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)); + scaleFactor2.v = + vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)); /* Mask updates to intermediate vector sums for particle pi. */ /* Mask only when needed. */ if (mask_cond) { rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2); - rho_dhSum->v = vec_mask_sub( - rho_dhSum->v, vec_mul(mj.v, scaleFactor.v), mask); - rho_dhSum->v = vec_mask_sub( - rho_dhSum->v, vec_mul(mj2.v, scaleFactor2.v), mask2); + rho_dhSum->v = + vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, scaleFactor.v), mask); + rho_dhSum->v = + vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, scaleFactor2.v), mask2); wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2); wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, scaleFactor.v, mask); @@ -587,13 +588,20 @@ runner_iact_nonsym_2_vec_density( wcount_dhSum->v = vec_sub(wcount_dhSum->v, scaleFactor.v); wcount_dhSum->v = vec_sub(wcount_dhSum->v, scaleFactor2.v); div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); - div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); - curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v))); - curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v))); - curlvySum->v = vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v))); - curlvySum->v = vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v))); - curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v))); - curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v))); + div_vSum->v = + vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); + curlvxSum->v = + vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v))); + curlvxSum->v = + vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v))); + curlvySum->v = + vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v))); + curlvySum->v = + vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v))); + curlvzSum->v = + vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v))); + curlvzSum->v = + vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v))); } } #endif diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index e97f11eca5da1c481ad2a19044806a642ecddef4..8a7db6e12adb8fd1ccc15e0aee2c253b11dd7fec 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -373,7 +373,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( /* Load x and get the interval id. */ vector ind; - ind.m = vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v)); + ind.m = + vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v)); /* load the coefficients. */ vector c[kernel_degree + 1]; @@ -392,8 +393,10 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( } /* Return everything */ - w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); - dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); + w->v = + vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); } /* Define constant vectors for the Wendland C2 and Cubic Spline kernel @@ -615,14 +618,16 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( w2->v = vec_blend(mask_reg2_v2, w2->v, w2_2.v); dw_dx->v = vec_blend(mask_reg2, dw_dx->v, dw_dx_2.v); dw_dx2->v = vec_blend(mask_reg2_v2, dw_dx2->v, dw_dx2_2.v); - + /* Return everything */ - w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); - w2->v = vec_mul(w2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); - dw_dx->v = - vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); - dw_dx2->v = - vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); + w->v = + vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + w2->v = vec_mul(w2->v, + vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); + dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); #endif } @@ -658,7 +663,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, /* Form a mask for each part of the kernel. */ vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ - + /* Work out w for both regions of the kernel and combine the results together * using masks. */ @@ -739,7 +744,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec( /* Mask out result for particles that lie outside of the kernel function. */ mask_t mask; - vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ + vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ dw_dx->v = vec_and_mask(dw_dx->v, mask); @@ -787,9 +792,9 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( mask_t mask_reg2_v2; /* Form a mask for each part of the kernel. */ - vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ + vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ vec_create_mask(mask_reg2_v2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */ - + /* Work out w for both regions of the kernel and combine the results together * using masks. */ @@ -815,8 +820,8 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( /* Mask out result for particles that lie outside of the kernel function. */ mask_t mask, mask_2; - vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ - vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f))); /* x < 1 */ + vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ + vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f))); /* x < 1 */ dw_dx->v = vec_and_mask(dw_dx->v, mask); dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_2); diff --git a/src/vector.h b/src/vector.h index 50142aa9063f2ad45b22b11be4ca9849940c209f..bddad1df4835017d1366149dec08914e2f88c247 100644 --- a/src/vector.h +++ b/src/vector.h @@ -83,7 +83,8 @@ #define vec_form_int_mask(a) a #define vec_and(a, b) _mm512_and_ps(a, b) #define vec_mask_and(a, b) _mm512_kand(a, b) -#define vec_and_mask(a, mask) _mm512_maskz_expand_ps(mask, a) /* TODO: Alternative needs to be found. */ +#define vec_and_mask(a, mask) \ + _mm512_maskz_expand_ps(mask, a) /* TODO: Alternative needs to be found. */ #define vec_init_mask(mask) mask = 0xFFFF #define vec_zero_mask(mask) mask = 0 #define vec_create_mask(mask, cond) mask = cond diff --git a/tests/testKernel.c b/tests/testKernel.c index a3731188e51b1235fe84f36eab7c270c788f7dea..0658639070526f28ce1bceefc54d3f2d7a3ae765 100644 --- a/tests/testKernel.c +++ b/tests/testKernel.c @@ -68,7 +68,7 @@ int main() { vx.f[j] = (i + j) * 2.25f / numPoints; } - vx_h.v = vec_mul(vx.v, vec_set1(1.f/h)); + vx_h.v = vec_mul(vx.v, vec_set1(1.f / h)); kernel_deval_1_vec(&vx_h, &W_vec, &dW_vec); @@ -106,8 +106,8 @@ int main() { vx_2.f[j] = (i + j) * 2.25f / numPoints; } - vx_h.v = vec_mul(vx.v, vec_set1(1.f/h)); - vx_h_2.v = vec_mul(vx_2.v, vec_set1(1.f/h)); + vx_h.v = vec_mul(vx.v, vec_set1(1.f / h)); + vx_h_2.v = vec_mul(vx_2.v, vec_set1(1.f / h)); kernel_deval_2_vec(&vx_h, &W_vec, &dW_vec, &vx_h_2, &W_vec_2, &dW_vec_2);