diff --git a/src/cache.h b/src/cache.h index 14cd91835fceae06925c78fadae725db7eae30ad..6739c2020e897d54e6586c9d121490aaab5661bc 100644 --- a/src/cache.h +++ b/src/cache.h @@ -62,19 +62,19 @@ struct cache { /* Particle z velocity. */ float *restrict vz __attribute__((aligned(CACHE_ALIGN))); - + /* Particle density. */ float *restrict rho __attribute__((aligned(CACHE_ALIGN))); - + /* Particle smoothing length gradient. */ float *restrict grad_h __attribute__((aligned(CACHE_ALIGN))); - + /* Pressure over density squared. */ float *restrict pOrho2 __attribute__((aligned(CACHE_ALIGN))); - + /* Balsara switch. */ float *restrict balsara __attribute__((aligned(CACHE_ALIGN))); - + /* Particle sound speed. */ float *restrict soundspeed __attribute__((aligned(CACHE_ALIGN))); @@ -112,19 +112,19 @@ struct c2_cache { /* z velocity of particle pj. */ float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); - + /* Density of particle pj. */ float rhoq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); - + /* Smoothing length gradient of particle pj. */ float grad_hq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); - + /* Pressure over density squared of particle pj. */ float pOrho2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); - + /* Balsara switch of particle pj. */ float balsaraq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); - + /* Sound speed of particle pj. */ float soundspeedq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); @@ -213,7 +213,7 @@ __attribute__((always_inline)) INLINE void cache_read_particles( ci_cache->vx[i] = ci->parts[i].v[0]; ci_cache->vy[i] = ci->parts[i].v[1]; ci_cache->vz[i] = ci->parts[i].v[2]; - + ci_cache->rho[i] = ci->parts[i].rho; ci_cache->grad_h[i] = ci->parts[i].force.f; ci_cache->pOrho2[i] = ci->parts[i].force.P_over_rho2; diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 2c40ab7b0c6da807739e771935f0b3b0e285e940..fc62845ce1788a85d90e6e748e648b7887be2065 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -434,14 +434,20 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, /* Mask updates to intermediate vector sums for particle pi. */ rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); - rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v))), mask); + rho_dhSum->v = vec_mask_sub( + rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, + vec_mul(xi.v, wi_dx.v))), + mask); wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask); - div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); - curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); - curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); - curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); + div_vSum->v = + vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); + curlvxSum->v = vec_mask_add(curlvxSum->v, + vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); + curlvySum->v = vec_mask_add(curlvySum->v, + vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); + curlvzSum->v = vec_mask_add(curlvzSum->v, + vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); } /** @@ -449,12 +455,14 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, * (non-symmetric vectorized version). */ __attribute__((always_inline)) INLINE static void -runner_iact_nonsym_2_vec_density( - float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix, - vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, - vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, - vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - mask_t mask, mask_t mask2, short mask_cond) { +runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, + vector hi_inv, vector vix, vector viy, + vector viz, float *Vjx, float *Vjy, float *Vjz, + float *Mj, vector *rhoSum, vector *rho_dhSum, + vector *wcountSum, vector *wcount_dhSum, + vector *div_vSum, vector *curlvxSum, + vector *curlvySum, vector *curlvzSum, + mask_t mask, mask_t mask2, short mask_cond) { vector r, ri, r2, xi, wi, wi_dx; vector mj; @@ -534,35 +542,48 @@ runner_iact_nonsym_2_vec_density( curlvrz.v = vec_mul(curlvrz.v, ri.v); curlvrz2.v = vec_mul(curlvrz2.v, ri2.v); -/* Mask updates to intermediate vector sums for particle pi. */ + /* Mask updates to intermediate vector sums for particle pi. */ /* Mask only when needed. */ - if(mask_cond) { + if (mask_cond) { rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2); - rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v))), mask); - rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, - vec_mul(xi2.v, wi_dx2.v))), mask2); + rho_dhSum->v = vec_mask_sub( + rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, + vec_mul(xi.v, wi_dx.v))), + mask); + rho_dhSum->v = vec_mask_sub( + rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, + vec_mul(xi2.v, wi_dx2.v))), + mask2); wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2); - wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask); - wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2); - div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); - div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2); - curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); - curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2); - curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); - curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2); - curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); - curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2); - } - else { + wcount_dhSum->v = + vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask); + wcount_dhSum->v = + vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2); + div_vSum->v = vec_mask_sub(div_vSum->v, + vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); + div_vSum->v = vec_mask_sub( + div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2); + curlvxSum->v = vec_mask_add( + curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); + curlvxSum->v = vec_mask_add( + curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2); + curlvySum->v = vec_mask_add( + curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); + curlvySum->v = vec_mask_add( + curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2); + curlvzSum->v = vec_mask_add( + curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); + curlvzSum->v = vec_mask_add( + curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2); + } else { rhoSum->v += vec_mul(mj.v, wi.v); rhoSum->v += vec_mul(mj2.v, wi2.v); - rho_dhSum->v -= vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v))); + rho_dhSum->v -= vec_mul( + mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(xi.v, wi_dx.v))); rho_dhSum->v -= vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, - vec_mul(xi2.v, wi_dx2.v))); + vec_mul(xi2.v, wi_dx2.v))); wcountSum->v += wi.v; wcountSum->v += wi2.v; wcount_dhSum->v -= vec_mul(xi.v, wi_dx.v); @@ -1140,9 +1161,15 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( } #ifdef WITH_VECTORIZATION -__attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force( - float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector *hi_inv, float *Hj_inv, - vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, mask_t mask) { +__attribute__((always_inline)) INLINE static void +runner_iact_nonsym_1_vec_force( + float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, + vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, + vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, + float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, + float *Mj, vector *hi_inv, float *Hj_inv, vector *a_hydro_xSum, + vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, + vector *v_sigSum, vector *entropy_dtSum, mask_t mask) { #ifdef WITH_VECTORIZATION @@ -1164,7 +1191,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force dx.v = vec_load(Dx); dy.v = vec_load(Dy); dz.v = vec_load(Dz); - + vjx.v = vec_load(Vjx); vjy.v = vec_load(Vjy); vjz.v = vec_load(Vjz); @@ -1179,7 +1206,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ -/* Load stuff. */ + /* Load stuff. */ balsara.v = balsara_i->v + balsara_j.v; /* Get the radius and inverse radius. */ @@ -1195,10 +1222,10 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force /* Get the kernel for hj. */ hjd_inv = pow_dimension_plus_one_vec(hj_inv); xj.v = r.v * hj_inv.v; - + /* Calculate the kernel for two particles. */ kernel_eval_dWdx_force_vec(&xj, &wj_dx); - + wj_dr.v = hjd_inv.v * wj_dx.v; /* Compute dv dot r. */ @@ -1223,7 +1250,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force sph_term.v = (grad_hi->v * piPOrho2->v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * ri.v; - + /* Eventually get the acceleration */ acc.v = visc_term.v + sph_term.v; @@ -1255,9 +1282,16 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force #endif } -__attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force( - float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector *hi_inv, float *Hj_inv, - vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, mask_t mask, mask_t mask_2, short mask_cond) { +__attribute__((always_inline)) INLINE static void +runner_iact_nonsym_2_vec_force( + float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, + vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, + vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, + float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, + float *Mj, vector *hi_inv, float *Hj_inv, vector *a_hydro_xSum, + vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, + vector *v_sigSum, vector *entropy_dtSum, mask_t mask, mask_t mask_2, + short mask_cond) { #ifdef WITH_VECTORIZATION @@ -1292,7 +1326,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force dx.v = vec_load(Dx); dy.v = vec_load(Dy); dz.v = vec_load(Dz); - + vjx.v = vec_load(Vjx); vjy.v = vec_load(Vjy); vjz.v = vec_load(Vjz); @@ -1311,7 +1345,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force dx_2.v = vec_load(&Dx[VEC_SIZE]); dy_2.v = vec_load(&Dy[VEC_SIZE]); dz_2.v = vec_load(&Dz[VEC_SIZE]); - + vjx_2.v = vec_load(&Vjx[VEC_SIZE]); vjy_2.v = vec_load(&Vjy[VEC_SIZE]); vjz_2.v = vec_load(&Vjz[VEC_SIZE]); @@ -1324,7 +1358,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force cj_2.v = vec_load(&Cj[VEC_SIZE]); hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]); -/* Load stuff. */ + /* Load stuff. */ balsara.v = balsara_i->v + balsara_j.v; balsara_2.v = balsara_i->v + balsara_j_2.v; @@ -1348,11 +1382,11 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force hjd_inv_2 = pow_dimension_plus_one_vec(hj_inv_2); xj.v = r.v * hj_inv.v; xj_2.v = r_2.v * hj_inv_2.v; - + /* Calculate the kernel for two particles. */ kernel_eval_dWdx_force_vec(&xj, &wj_dx); kernel_eval_dWdx_force_vec(&xj_2, &wj_dx_2); - + wj_dr.v = hjd_inv.v * wj_dx.v; wj_dr_2.v = hjd_inv_2.v * wj_dx_2.v; @@ -1360,13 +1394,13 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force dvdr.v = ((vix->v - vjx.v) * dx.v) + ((viy->v - vjy.v) * dy.v) + ((viz->v - vjz.v) * dz.v); dvdr_2.v = ((vix->v - vjx_2.v) * dx_2.v) + ((viy->v - vjy_2.v) * dy_2.v) + - ((viz->v - vjz_2.v) * dz_2.v); + ((viz->v - vjz_2.v) * dz_2.v); /* Compute the relative velocity. (This is 0 if the particles move away from * each other and negative otherwise) */ omega_ij.v = vec_fmin(dvdr.v, vec_setzero()); omega_ij_2.v = vec_fmin(dvdr_2.v, vec_setzero()); - mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */ + mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */ mu_ij_2.v = fac_mu.v * ri_2.v * omega_ij_2.v; /* This is 0 or negative */ /* Compute signal velocity */ @@ -1379,7 +1413,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v * mu_ij.v * balsara.v / rho_ij.v; visc_2.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig_2.v * - mu_ij_2.v * balsara_2.v / rho_ij_2.v; + mu_ij_2.v * balsara_2.v / rho_ij_2.v; /* Now, convolve with the kernel */ visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v; @@ -1387,9 +1421,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force sph_term.v = (grad_hi->v * piPOrho2->v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * ri.v; - sph_term_2.v = - (grad_hi->v * piPOrho2->v * wi_dr_2.v + grad_hj_2.v * pjPOrho2_2.v * wj_dr_2.v) * - ri_2.v; + sph_term_2.v = (grad_hi->v * piPOrho2->v * wi_dr_2.v + + grad_hj_2.v * pjPOrho2_2.v * wj_dr_2.v) * + ri_2.v; /* Eventually get the acceleration */ acc.v = visc_term.v + sph_term.v; @@ -1412,7 +1446,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force entropy_dt_2.v = mj_2.v * visc_term_2.v * dvdr_2.v; /* Store the forces back on the particles. */ - if(mask_cond) { + if (mask_cond) { a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask); a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax_2.v, mask_2); a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask); @@ -1425,8 +1459,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig_2, mask_2)); entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask); entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt_2.v, mask_2); - } - else { + } else { a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax.v); a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax_2.v); a_hydro_ySum->v = vec_sub(a_hydro_ySum->v, piay.v); diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index 7e1ad0dd8606f0992c9a65aa03645c95294e4235..fe40c00a1cbff3392353a3fbba67e13fd878418e 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -436,7 +436,7 @@ static const vector cond = FILL_VEC(0.5f); /** * @brief Computes the kernel function and its derivative for two particles - * using vectors. Does not return zero if $u > \\gamma = H/h$, should only + * using vectors. Does not return zero if $u > \\gamma = H/h$, should only * be called if particles are known to interact. * * @param u The ratio of the distance to the smoothing length $u = x/h$. @@ -517,7 +517,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( /** * @brief Computes the kernel function and its derivative for two particles - * using interleaved vectors. Does not return zero if $u > \\gamma = H/h$, should only + * using interleaved vectors. Does not return zero if $u > \\gamma = H/h$, + * should only * be called if particles are known to interact. * * @param u The ratio of the distance to the smoothing length $u = x/h$. @@ -582,9 +583,9 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2; /* Form a mask for each part of the kernel. */ - mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ - mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */ - mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ + mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ + mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */ + mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together @@ -648,7 +649,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( /** * @brief Computes the kernel function for two particles - * using vectors. Does not return zero if $u > \\gamma = H/h$, should only + * using vectors. Does not return zero if $u > \\gamma = H/h$, should only * be called if particles are known to interact. * * @param u The ratio of the distance to the smoothing length $u = x/h$. @@ -709,7 +710,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, /** * @brief Computes the kernel function derivative for two particles - * using vectors. Does not return zero if $u > \\gamma = H/h$, should only + * using vectors. Does not return zero if $u > \\gamma = H/h$, should only * be called if particles are known to interact. * * @param u The ratio of the distance to the smoothing length $u = x/h$. @@ -825,9 +826,9 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec( /* Mask out result for particles that lie outside of the kernel function. */ vector mask; - mask.v = vec_cmp_lt(x.v,vec_set1(1.f)); + mask.v = vec_cmp_lt(x.v, vec_set1(1.f)); - dw_dx->v = vec_and(dw_dx->v,mask.v); + dw_dx->v = vec_and(dw_dx->v, mask.v); /* Return everything */ dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, @@ -854,7 +855,8 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( #ifdef WENDLAND_C2_KERNEL /* Init the iteration for Horner's scheme. */ dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v); - dw_dx_2->v = vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v); + dw_dx_2->v = + vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v); /* Calculate the polynomial interleaving vector operations */ dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v); @@ -872,9 +874,9 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( vector mask_reg1_2, mask_reg2_2; /* Form a mask for each part of the kernel. */ - mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ + mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ mask_reg1_2.v = vec_cmp_lt(x_2.v, cond.v); /* 0 < x < 0.5 */ - mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ + mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ mask_reg2_2.v = vec_cmp_gte(x_2.v, cond.v); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together @@ -887,7 +889,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( dw_dx2_2.v = vec_fma(cubic_2_dwdx_const_c0.v, x_2.v, cubic_2_dwdx_const_c1.v); /* Calculate the polynomial interleaving vector operations. */ - dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */ + dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */ dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); /* cubic_1_dwdx_const_c2 is zero. */ dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v); dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v); @@ -907,17 +909,18 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( /* Mask out result for particles that lie outside of the kernel function. */ vector mask, mask_2; - mask.v = vec_cmp_lt(x.v,vec_set1(1.f)); - mask_2.v = vec_cmp_lt(x_2.v,vec_set1(1.f)); + mask.v = vec_cmp_lt(x.v, vec_set1(1.f)); + mask_2.v = vec_cmp_lt(x_2.v, vec_set1(1.f)); - dw_dx->v = vec_and(dw_dx->v,mask.v); - dw_dx_2->v = vec_and(dw_dx_2->v,mask_2.v); + dw_dx->v = vec_and(dw_dx->v, mask.v); + dw_dx_2->v = vec_and(dw_dx_2->v, mask_2.v); /* Return everything */ dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); - dw_dx_2->v = vec_mul(dw_dx_2->v, vec_mul(kernel_constant_vec.v, - kernel_gamma_inv_dim_plus_one_vec.v)); + dw_dx_2->v = vec_mul( + dw_dx_2->v, + vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); } #endif /* WITH_VECTORIZATION */ diff --git a/src/runner.c b/src/runner.c index 29313dd5e15428931775f7ccbc1efc0e4f1b79a9..1faaec9c8f907695917ea8facdf30852c8b30e4b 100644 --- a/src/runner.c +++ b/src/runner.c @@ -1793,8 +1793,7 @@ void *runner_main(void *data) { #else runner_doself2_force(r, ci); #endif - } - else if (t->subtype == task_subtype_grav) + } else if (t->subtype == task_subtype_grav) runner_doself_grav(r, ci, 1); else if (t->subtype == task_subtype_external_grav) runner_do_grav_external(r, ci, 1); diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index c429df0948f347bd312d4608f72b8e417da38e3c..481cfa79444a7c4277b70058e62372c6e6b21984 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -94,9 +94,9 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( /* Zero parts of mask that represent the padded values.*/ if (pad < VEC_SIZE) { - vec_pad_mask(int_mask2,pad); + vec_pad_mask(int_mask2, pad); } else { - vec_pad_mask(int_mask,VEC_SIZE - rem); + vec_pad_mask(int_mask, VEC_SIZE - rem); vec_zero_mask(int_mask2); } @@ -109,7 +109,8 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align], &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum, - wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 1); + wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, + int_mask2, 1); } } @@ -153,11 +154,11 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( */ __attribute__((always_inline)) INLINE static void storeInteractions( const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, - vector *v_dz, const struct cache *const cell_cache, struct c2_cache *const int_cache, - int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, - vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, - vector *curlvySum, vector *curlvzSum, vector v_hi_inv, vector v_vix, - vector v_viy, vector v_viz) { + vector *v_dz, const struct cache *const cell_cache, + struct c2_cache *const int_cache, int *icount, vector *rhoSum, + vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, + vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, + vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) { /* Left-pack values needed into the secondary cache using the interaction mask. */ @@ -169,10 +170,14 @@ __attribute__((always_inline)) INLINE static void storeInteractions( VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]); VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]); VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask, &int_cache->mq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask, &int_cache->vxq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask, &int_cache->vyq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask, &int_cache->vzq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask, + &int_cache->mq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask, + &int_cache->vxq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask, + &int_cache->vyq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask, + &int_cache->vzq[*icount]); /* Increment interaction count by number of bits set in mask. */ (*icount) += __builtin_popcount(mask); @@ -257,10 +262,10 @@ __attribute__((always_inline)) INLINE static void storeInteractions( __attribute__((always_inline)) INLINE static void calcRemForceInteractions( struct c2_cache *const int_cache, const int icount, vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, - vector *v_sigSum, vector *entropy_dtSum, - vector *v_hi_inv, vector *v_vix, vector *v_viy, vector *v_viz, - vector *v_rhoi, vector *v_grad_hi, vector *v_pOrhoi2, vector *v_balsara_i, vector *v_ci, - int *icount_align, int num_vec_proc) { + vector *v_sigSum, vector *entropy_dtSum, vector *v_hi_inv, vector *v_vix, + vector *v_viy, vector *v_viz, vector *v_rhoi, vector *v_grad_hi, + vector *v_pOrhoi2, vector *v_balsara_i, vector *v_ci, int *icount_align, + int num_vec_proc) { mask_t int_mask, int_mask2; @@ -296,9 +301,9 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions( /* Zero parts of mask that represent the padded values.*/ if (pad < VEC_SIZE) { - vec_pad_mask(int_mask2,pad); + vec_pad_mask(int_mask2, pad); } else { - vec_pad_mask(int_mask,VEC_SIZE - rem); + vec_pad_mask(int_mask, VEC_SIZE - rem); vec_zero_mask(int_mask2); } @@ -307,9 +312,16 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions( *icount_align = icount - rem; runner_iact_nonsym_2_vec_force( - &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align], &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, - &int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align], &int_cache->rhoq[*icount_align], &int_cache->grad_hq[*icount_align], &int_cache->pOrho2q[*icount_align], &int_cache->balsaraq[*icount_align], &int_cache->soundspeedq[*icount_align], &int_cache->mq[*icount_align], v_hi_inv, &int_cache->h_invq[*icount_align], - a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask, int_mask2, 1); + &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align], + &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], v_vix, + v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, + &int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align], + &int_cache->vzq[*icount_align], &int_cache->rhoq[*icount_align], + &int_cache->grad_hq[*icount_align], &int_cache->pOrho2q[*icount_align], + &int_cache->balsaraq[*icount_align], + &int_cache->soundspeedq[*icount_align], &int_cache->mq[*icount_align], + v_hi_inv, &int_cache->h_invq[*icount_align], a_hydro_xSum, a_hydro_ySum, + a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask, int_mask2, 1); } } @@ -353,10 +365,12 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions( */ __attribute__((always_inline)) INLINE static void storeForceInteractions( const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, - vector *v_dz, const struct cache *const cell_cache, struct c2_cache *const int_cache, - int *icount, vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, - vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, - vector *v_hi_inv, vector *v_vix, vector *v_viy, vector *v_viz, vector *v_rhoi, vector *v_grad_hi, vector *v_pOrhoi2, vector *v_balsara_i, vector *v_ci) { + vector *v_dz, const struct cache *const cell_cache, + struct c2_cache *const int_cache, int *icount, vector *a_hydro_xSum, + vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, + vector *v_sigSum, vector *entropy_dtSum, vector *v_hi_inv, vector *v_vix, + vector *v_viy, vector *v_viz, vector *v_rhoi, vector *v_grad_hi, + vector *v_pOrhoi2, vector *v_balsara_i, vector *v_ci) { /* Left-pack values needed into the secondary cache using the interaction mask. */ @@ -368,22 +382,31 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions( mask_t packed_mask; VEC_FORM_PACKED_MASK(mask, packed_mask); - + VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]); VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]); VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]); VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask, &int_cache->mq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask, &int_cache->vxq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask, &int_cache->vyq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask, &int_cache->vzq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->rho[pjd]), packed_mask, &int_cache->rhoq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->grad_h[pjd]), packed_mask, &int_cache->grad_hq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->pOrho2[pjd]), packed_mask, &int_cache->pOrho2q[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->balsara[pjd]), packed_mask, &int_cache->balsaraq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->soundspeed[pjd]), packed_mask, &int_cache->soundspeedq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask, + &int_cache->mq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask, + &int_cache->vxq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask, + &int_cache->vyq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask, + &int_cache->vzq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->rho[pjd]), packed_mask, + &int_cache->rhoq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->grad_h[pjd]), packed_mask, + &int_cache->grad_hq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->pOrho2[pjd]), packed_mask, + &int_cache->pOrho2q[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->balsara[pjd]), packed_mask, + &int_cache->balsaraq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->soundspeed[pjd]), packed_mask, + &int_cache->soundspeedq[*icount]); VEC_LEFT_PACK(v_hj_inv.v, packed_mask, &int_cache->h_invq[*icount]); - + /* Increment interaction count by number of bits set in mask. */ (*icount) += __builtin_popcount(mask); #else @@ -399,7 +422,7 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions( int_cache->vxq[*icount] = cell_cache->vx[pjd + bit_index]; int_cache->vyq[*icount] = cell_cache->vy[pjd + bit_index]; int_cache->vzq[*icount] = cell_cache->vz[pjd + bit_index]; - + int_cache->rhoq[*icount] = cell_cache->rho[pjd + bit_index]; int_cache->grad_hq[*icount] = cell_cache->grad_h[pjd + bit_index]; int_cache->pOrho2q[*icount] = cell_cache->pOrho2[pjd + bit_index]; @@ -419,10 +442,10 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions( int icount_align = *icount; /* Peform remainder interactions. */ - calcRemForceInteractions(int_cache, *icount, a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, - h_dtSum, v_sigSum, entropy_dtSum, v_hi_inv, - v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, - &icount_align, 2); + calcRemForceInteractions(int_cache, *icount, a_hydro_xSum, a_hydro_ySum, + a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, + v_hi_inv, v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, + v_pOrhoi2, v_balsara_i, v_ci, &icount_align, 2); /* Initialise masks to true in case remainder interactions have been * performed. */ @@ -434,9 +457,15 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions( for (int pjd = 0; pjd < icount_align; pjd += (2 * VEC_SIZE)) { runner_iact_nonsym_2_vec_force( - &int_cache->r2q[pjd], &int_cache->dxq[pjd], &int_cache->dyq[pjd], &int_cache->dzq[pjd], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, - &int_cache->vxq[pjd], &int_cache->vyq[pjd], &int_cache->vzq[pjd], &int_cache->rhoq[pjd], &int_cache->grad_hq[pjd], &int_cache->pOrho2q[pjd], &int_cache->balsaraq[pjd], &int_cache->soundspeedq[pjd], &int_cache->mq[pjd], v_hi_inv, &int_cache->h_invq[pjd], - a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask, int_mask2, 0); + &int_cache->r2q[pjd], &int_cache->dxq[pjd], &int_cache->dyq[pjd], + &int_cache->dzq[pjd], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, + v_pOrhoi2, v_balsara_i, v_ci, &int_cache->vxq[pjd], + &int_cache->vyq[pjd], &int_cache->vzq[pjd], &int_cache->rhoq[pjd], + &int_cache->grad_hq[pjd], &int_cache->pOrho2q[pjd], + &int_cache->balsaraq[pjd], &int_cache->soundspeedq[pjd], + &int_cache->mq[pjd], v_hi_inv, &int_cache->h_invq[pjd], a_hydro_xSum, + a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, + int_mask, int_mask2, 0); } /* Reset interaction count. */ @@ -654,7 +683,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); - + /* Compute the pairwise distance. */ vector v_dx_tmp, v_dy_tmp, v_dz_tmp; vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2; @@ -694,18 +723,18 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( * cache. */ if (doi_mask) { storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, - cell_cache, &int_cache, + cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum, + &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, + &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, + v_viz); + } + if (doi_mask2) { + storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, + &v_dy_tmp2, &v_dz_tmp2, cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); } - if (doi_mask2) { - storeInteractions( - doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, &v_dy_tmp2, - &v_dz_tmp2, cell_cache, &int_cache, - &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, - &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); - } } /* Perform padded vector remainder interactions if any are present. */ @@ -727,7 +756,8 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, 0); + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, + 0); } /* Perform horizontal adds on vector sums and store result in particle pi. @@ -763,7 +793,7 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( const struct engine *e = r->e; struct part *restrict pi; int count_align; - const int num_vec_proc = 1;//NUM_VEC_PROC; + const int num_vec_proc = 1; // NUM_VEC_PROC; struct part *restrict parts = c->parts; const int count = c->count; @@ -771,7 +801,7 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2; vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci; - //TIMER_TIC + // TIMER_TIC if (!cell_is_active(c, e)) return; @@ -789,172 +819,178 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( cache_read_particles(c, cell_cache); #ifdef SWIFT_DEBUG_CHECKS - for(int i=0; i<count; i++) { + for (int i = 0; i < count; i++) { pi = &c->parts[i]; /* Check that particles have been drifted to the current time */ if (pi->ti_drift != e->ti_current) error("Particle pi not drifted to current time"); - } } +} #endif - /* Create secondary cache to store particle interactions. */ - struct c2_cache int_cache; - int icount = 0, icount_align = 0; +/* Create secondary cache to store particle interactions. */ +struct c2_cache int_cache; +int icount = 0, icount_align = 0; - /* Loop over the particles in the cell. */ - for (int pid = 0; pid < count; pid++) { +/* Loop over the particles in the cell. */ +for (int pid = 0; pid < count; pid++) { - /* Get a pointer to the ith particle. */ - pi = &parts[pid]; + /* Get a pointer to the ith particle. */ + pi = &parts[pid]; - /* Is the ith particle active? */ - if (!part_is_active(pi, e)) continue; + /* Is the ith particle active? */ + if (!part_is_active(pi, e)) continue; - vector pix, piy, piz; + vector pix, piy, piz; - const float hi = cell_cache->h[pid]; + const float hi = cell_cache->h[pid]; - /* Fill particle pi vectors. */ - pix.v = vec_set1(cell_cache->x[pid]); - piy.v = vec_set1(cell_cache->y[pid]); - piz.v = vec_set1(cell_cache->z[pid]); - v_hi.v = vec_set1(hi); - v_vix.v = vec_set1(cell_cache->vx[pid]); - v_viy.v = vec_set1(cell_cache->vy[pid]); - v_viz.v = vec_set1(cell_cache->vz[pid]); - - v_rhoi.v = vec_set1(cell_cache->rho[pid]); - v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]); - v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]); - v_balsara_i.v = vec_set1(cell_cache->balsara[pid]); - v_ci.v = vec_set1(cell_cache->soundspeed[pid]); + /* Fill particle pi vectors. */ + pix.v = vec_set1(cell_cache->x[pid]); + piy.v = vec_set1(cell_cache->y[pid]); + piz.v = vec_set1(cell_cache->z[pid]); + v_hi.v = vec_set1(hi); + v_vix.v = vec_set1(cell_cache->vx[pid]); + v_viy.v = vec_set1(cell_cache->vy[pid]); + v_viz.v = vec_set1(cell_cache->vz[pid]); - const float hig2 = hi * hi * kernel_gamma2; - v_hig2.v = vec_set1(hig2); + v_rhoi.v = vec_set1(cell_cache->rho[pid]); + v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]); + v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]); + v_balsara_i.v = vec_set1(cell_cache->balsara[pid]); + v_ci.v = vec_set1(cell_cache->soundspeed[pid]); - /* Reset cumulative sums of update vectors. */ - vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum; + const float hig2 = hi * hi * kernel_gamma2; + v_hig2.v = vec_set1(hig2); - /* Get the inverse of hi. */ - vector v_hi_inv; + /* Reset cumulative sums of update vectors. */ + vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, + entropy_dtSum; - v_hi_inv = vec_reciprocal(v_hi); + /* Get the inverse of hi. */ + vector v_hi_inv; - a_hydro_xSum.v = vec_setzero(); - a_hydro_ySum.v = vec_setzero(); - a_hydro_zSum.v = vec_setzero(); - h_dtSum.v = vec_setzero(); - v_sigSum.v = vec_set1(pi->force.v_sig); - entropy_dtSum.v = vec_setzero(); + v_hi_inv = vec_reciprocal(v_hi); - /* Pad cache if there is a serial remainder. */ - count_align = count; - int rem = count % (num_vec_proc * VEC_SIZE); - if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; + a_hydro_xSum.v = vec_setzero(); + a_hydro_ySum.v = vec_setzero(); + a_hydro_zSum.v = vec_setzero(); + h_dtSum.v = vec_setzero(); + v_sigSum.v = vec_set1(pi->force.v_sig); + entropy_dtSum.v = vec_setzero(); - count_align += pad; + /* Pad cache if there is a serial remainder. */ + count_align = count; + int rem = count % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; - /* Set positions to the same as particle pi so when the r2 > 0 mask is - * applied these extra contributions are masked out.*/ - for (int i = count; i < count_align; i++) { - cell_cache->x[i] = pix.f[0]; - cell_cache->y[i] = piy.f[0]; - cell_cache->z[i] = piz.f[0]; - cell_cache->h[i] = 1.f; - } + count_align += pad; + + /* Set positions to the same as particle pi so when the r2 > 0 mask is + * applied these extra contributions are masked out.*/ + for (int i = count; i < count_align; i++) { + cell_cache->x[i] = pix.f[0]; + cell_cache->y[i] = piy.f[0]; + cell_cache->z[i] = piz.f[0]; + cell_cache->h[i] = 1.f; } + } - vector pjx, pjy, pjz, hj, hjg2; + vector pjx, pjy, pjz, hj, hjg2; - /* Find all of particle pi's interacions and store needed values in the - * secondary cache.*/ - for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { + /* Find all of particle pi's interacions and store needed values in the + * secondary cache.*/ + for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { - /* Load 2 sets of vectors from the particle cache. */ - pjx.v = vec_load(&cell_cache->x[pjd]); - pjy.v = vec_load(&cell_cache->y[pjd]); - pjz.v = vec_load(&cell_cache->z[pjd]); - hj.v = vec_load(&cell_cache->h[pjd]); - hjg2.v = vec_mul(vec_mul(hj.v,hj.v), kernel_gamma2_vec.v); + /* Load 2 sets of vectors from the particle cache. */ + pjx.v = vec_load(&cell_cache->x[pjd]); + pjy.v = vec_load(&cell_cache->y[pjd]); + pjz.v = vec_load(&cell_cache->z[pjd]); + hj.v = vec_load(&cell_cache->h[pjd]); + hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); - /* Compute the pairwise distance. */ - vector v_dx_tmp, v_dy_tmp, v_dz_tmp; + /* Compute the pairwise distance. */ + vector v_dx_tmp, v_dy_tmp, v_dz_tmp; - v_dx_tmp.v = vec_sub(pix.v, pjx.v); - v_dy_tmp.v = vec_sub(piy.v, pjy.v); - v_dz_tmp.v = vec_sub(piz.v, pjz.v); + v_dx_tmp.v = vec_sub(pix.v, pjx.v); + v_dy_tmp.v = vec_sub(piy.v, pjy.v); + v_dz_tmp.v = vec_sub(piz.v, pjz.v); - v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); - v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); - v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); + v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); + v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); + v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); - /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */ - mask_t v_doi_mask, v_doi_mask_self_check; - int doi_mask, doi_mask_self_check; + /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */ + mask_t v_doi_mask, v_doi_mask_self_check; + int doi_mask, doi_mask_self_check; - /* Form r2 > 0 mask.*/ - vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero())); + /* Form r2 > 0 mask.*/ + vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero())); - /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */ - vector v_h2; - v_h2.v = vec_fmax(v_hig2.v, hjg2.v); - vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v)); + /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */ + vector v_h2; + v_h2.v = vec_fmax(v_hig2.v, hjg2.v); + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v)); - /* Form integer masks. */ - doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check); - doi_mask = vec_form_int_mask(v_doi_mask); - - /* Combine all 3 masks. */ - doi_mask = doi_mask & doi_mask_self_check; + /* Form integer masks. */ + doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check); + doi_mask = vec_form_int_mask(v_doi_mask); - /* If there are any interactions left pack interaction values into c2 - * cache. */ - if (doi_mask) { - - storeForceInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, - cell_cache, &int_cache, - &icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, - &h_dtSum, &v_sigSum, &entropy_dtSum, - &v_hi_inv, &v_vix, &v_viy, &v_viz, &v_rhoi, &v_grad_hi, &v_pOrhoi2, &v_balsara_i, &v_ci); - } + /* Combine all 3 masks. */ + doi_mask = doi_mask & doi_mask_self_check; - } /* Loop over all other particles. */ + /* If there are any interactions left pack interaction values into c2 + * cache. */ + if (doi_mask) { - /* Perform padded vector remainder interactions if any are present. */ - calcRemForceInteractions(&int_cache, icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, - &h_dtSum, &v_sigSum, &entropy_dtSum, &v_hi_inv, - &v_vix, &v_viy, &v_viz, &v_rhoi, &v_grad_hi, &v_pOrhoi2, &v_balsara_i, &v_ci, - &icount_align, 2); + storeForceInteractions( + doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, cell_cache, + &int_cache, &icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, + &h_dtSum, &v_sigSum, &entropy_dtSum, &v_hi_inv, &v_vix, &v_viy, + &v_viz, &v_rhoi, &v_grad_hi, &v_pOrhoi2, &v_balsara_i, &v_ci); + } - /* Initialise masks to true in case remainder interactions have been - * performed. */ - mask_t int_mask, int_mask2; - vec_init_mask(int_mask); - vec_init_mask(int_mask2); + } /* Loop over all other particles. */ - /* Perform interaction with 2 vectors. */ - for (int pjd = 0; pjd < icount_align; pjd += (2 * VEC_SIZE)) { - runner_iact_nonsym_2_vec_force( - &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], &int_cache.dzq[pjd], &v_vix, &v_viy, &v_viz, &v_rhoi, &v_grad_hi, &v_pOrhoi2, &v_balsara_i, &v_ci, - &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], &int_cache.rhoq[pjd], &int_cache.grad_hq[pjd], &int_cache.pOrho2q[pjd], &int_cache.balsaraq[pjd], &int_cache.soundspeedq[pjd], &int_cache.mq[pjd], &v_hi_inv, &int_cache.h_invq[pjd], - &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum,int_mask, int_mask2, 0); + /* Perform padded vector remainder interactions if any are present. */ + calcRemForceInteractions( + &int_cache, icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, + &v_sigSum, &entropy_dtSum, &v_hi_inv, &v_vix, &v_viy, &v_viz, &v_rhoi, + &v_grad_hi, &v_pOrhoi2, &v_balsara_i, &v_ci, &icount_align, 2); - } - - VEC_HADD(a_hydro_xSum, pi->a_hydro[0]); - VEC_HADD(a_hydro_ySum, pi->a_hydro[1]); - VEC_HADD(a_hydro_zSum, pi->a_hydro[2]); - VEC_HADD(h_dtSum, pi->force.h_dt); - VEC_HMAX(v_sigSum, pi->force.v_sig); - VEC_HADD(entropy_dtSum, pi->entropy_dt); + /* Initialise masks to true in case remainder interactions have been + * performed. */ + mask_t int_mask, int_mask2; + vec_init_mask(int_mask); + vec_init_mask(int_mask2); - /* Reset interaction count. */ - icount = 0; - } /* loop over all particles. */ + /* Perform interaction with 2 vectors. */ + for (int pjd = 0; pjd < icount_align; pjd += (2 * VEC_SIZE)) { + runner_iact_nonsym_2_vec_force( + &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], + &int_cache.dzq[pjd], &v_vix, &v_viy, &v_viz, &v_rhoi, &v_grad_hi, + &v_pOrhoi2, &v_balsara_i, &v_ci, &int_cache.vxq[pjd], + &int_cache.vyq[pjd], &int_cache.vzq[pjd], &int_cache.rhoq[pjd], + &int_cache.grad_hq[pjd], &int_cache.pOrho2q[pjd], + &int_cache.balsaraq[pjd], &int_cache.soundspeedq[pjd], + &int_cache.mq[pjd], &v_hi_inv, &int_cache.h_invq[pjd], &a_hydro_xSum, + &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, + int_mask, int_mask2, 0); + } + + VEC_HADD(a_hydro_xSum, pi->a_hydro[0]); + VEC_HADD(a_hydro_ySum, pi->a_hydro[1]); + VEC_HADD(a_hydro_zSum, pi->a_hydro[2]); + VEC_HADD(h_dtSum, pi->force.h_dt); + VEC_HMAX(v_sigSum, pi->force.v_sig); + VEC_HADD(entropy_dtSum, pi->entropy_dt); + + /* Reset interaction count. */ + icount = 0; +} /* loop over all particles. */ - //TIMER_TOC(timer_doself_force); +// TIMER_TOC(timer_doself_force); #endif /* WITH_VECTORIZATION */ } diff --git a/src/vector.h b/src/vector.h index a7fd01ac59519ec98a2f37a0a9bde2eac2d58516..872010d77bb5f6f117f75f191b3cb3e399bed685 100644 --- a/src/vector.h +++ b/src/vector.h @@ -60,9 +60,9 @@ #define vec_dbl_set(a, b, c, d, e, f, g, h) \ _mm512_set_pd(h, g, f, e, d, c, b, a) #define vec_add(a, b) _mm512_add_ps(a, b) -#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a) +#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a) #define vec_sub(a, b) _mm512_sub_ps(a, b) -#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b) +#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b) #define vec_mul(a, b) _mm512_mul_ps(a, b) #define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c) #define vec_sqrt(a) _mm512_sqrt_ps(a) @@ -80,12 +80,12 @@ #define vec_cmp_result(a) a #define vec_form_int_mask(a) a #define vec_and(a, b) _mm512_and_ps(a, b) -#define vec_mask_and(a, b) a & b +#define vec_mask_and(a, b) a &b #define vec_and_mask(a, mask) _mm512_maskz_expand_ps(mask, a) #define vec_init_mask(mask) mask = 0xFFFF #define vec_zero_mask(mask) mask = 0 #define vec_create_mask(mask, cond) mask = cond -#define vec_pad_mask(mask,pad) mask = mask >> (pad) +#define vec_pad_mask(mask, pad) mask = mask >> (pad) #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1) @@ -127,7 +127,7 @@ } #endif -/* Do nothing in the case of AVX-512 as there are already +/* Do nothing in the case of AVX-512 as there are already * instructions for left-packing.*/ #define VEC_FORM_PACKED_MASK(mask, packed_mask) @@ -155,9 +155,9 @@ #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a) #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a) #define vec_add(a, b) _mm256_add_ps(a, b) -#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b,mask.v)) +#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b, mask.v)) #define vec_sub(a, b) _mm256_sub_ps(a, b) -#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b,mask.v)) +#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask.v)) #define vec_mul(a, b) _mm256_mul_ps(a, b) #define vec_sqrt(a) _mm256_sqrt_ps(a) #define vec_rcp(a) _mm256_rcp_ps(a) @@ -172,14 +172,15 @@ #define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ) #define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) #define vec_cmp_result(a) _mm256_movemask_ps(a) -#define vec_form_int_mask(a) _mm256_movemask_ps(a.v) +#define vec_form_int_mask(a) _mm256_movemask_ps(a.v) #define vec_and(a, b) _mm256_and_ps(a, b) #define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v) #define vec_and_mask(a, mask) vec_mask_and(a, mask) #define vec_init_mask(mask) mask.m = vec_setint1(0xFFFFFFFF) #define vec_create_mask(mask, cond) mask.v = cond #define vec_zero_mask(mask) mask.v = vec_setzero() -#define vec_pad_mask(mask,pad) for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0 +#define vec_pad_mask(mask, pad) \ + for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0 #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1) @@ -205,12 +206,12 @@ a.v = _mm256_hadd_ps(a.v, a.v); \ b += a.f[0] + a.f[4]; -/* Performs a horizontal maximum on the vector and takes the maximum of the result with a float, b. */ -#define VEC_HMAX(a, b) \ -{ \ - for(int k=0; k<VEC_SIZE; k++) \ - b = max(b, a.f[k]); \ -} +/* Performs a horizontal maximum on the vector and takes the maximum of the + * result with a float, b. */ +#define VEC_HMAX(a, b) \ + { \ + for (int k = 0; k < VEC_SIZE; k++) b = max(b, a.f[k]); \ + } /* Returns the lower 128-bits of the 256-bit vector. */ #define VEC_GET_LOW(a) _mm256_castps256_ps128(a) diff --git a/tests/test125cells.c b/tests/test125cells.c index 9ca8ef377aa4528e036fe657be9f5f3f127c1f6b..f6bd966d56802bf5472d695673357f4e7cd178dc 100644 --- a/tests/test125cells.c +++ b/tests/test125cells.c @@ -690,7 +690,7 @@ int main(int argc, char *argv[]) { struct cell *cj = cells[i * 25 + j * 5 + k]; if (main_cell != cj) { - + const ticks sub_tic = getticks(); runner_dopair2_force(&runner, main_cell, cj); @@ -703,7 +703,7 @@ int main(int argc, char *argv[]) { } #ifdef WITH_VECTORIZATION -/* Initialise the cache. */ + /* Initialise the cache. */ runner.ci_cache.count = 0; cache_init(&runner.ci_cache, 512); #endif @@ -712,7 +712,7 @@ int main(int argc, char *argv[]) { /* And now the self-interaction for the main cell */ DOSELF2(&runner, main_cell); - + timings[26] += getticks() - self_tic; #endif @@ -729,13 +729,12 @@ int main(int argc, char *argv[]) { } } - for (size_t n = 0; n < 100*runs; ++n) { + for (size_t n = 0; n < 100 * runs; ++n) { ticks self_tic = getticks(); DOSELF2(&runner, main_cell); self_force_time += getticks() - self_tic; - } /* Output timing */ @@ -752,7 +751,8 @@ int main(int argc, char *argv[]) { message("Corner calculations took : %15lli ticks.", corner_time / runs); message("Edge calculations took : %15lli ticks.", edge_time / runs); message("Face calculations took : %15lli ticks.", face_time / runs); - message("Self calculations took : %15lli ticks.", self_force_time / 100*runs); + message("Self calculations took : %15lli ticks.", + self_force_time / 100 * runs); message("SWIFT calculation took : %15lli ticks.", time / runs); for (int j = 0; j < 125; ++j) diff --git a/tests/testKernel.c b/tests/testKernel.c index 0f627675cbfbcbc6fe926ee6617f83d3aeacb5de..a2744119a527cc842cdd4711056eee7a7d7b4270 100644 --- a/tests/testKernel.c +++ b/tests/testKernel.c @@ -91,7 +91,7 @@ int main() { printf("\nVector Output for kernel_deval_2_vec\n"); printf("-------------\n"); - + /* Test vectorised kernel that uses two vectors. */ for (int i = 0; i < numPoints; i += VEC_SIZE) { @@ -100,7 +100,7 @@ int main() { vector vx_2, vx_h_2; vector W_vec_2, dW_vec_2; - + for (int j = 0; j < VEC_SIZE; j++) { vx.f[j] = (i + j) * 2.25f / numPoints; vx_2.f[j] = (i + j) * 2.25f / numPoints; @@ -127,7 +127,7 @@ int main() { return 1; } } - + /* Check second vector results. */ for (int j = 0; j < VEC_SIZE; j++) { printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i + j, h,