Commit 028fb65b authored by James Willis's avatar James Willis
Browse files

Formatting.

parent ae2120aa
......@@ -62,19 +62,19 @@ struct cache {
/* Particle z velocity. */
float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
/* Particle density. */
float *restrict rho __attribute__((aligned(CACHE_ALIGN)));
/* Particle smoothing length gradient. */
float *restrict grad_h __attribute__((aligned(CACHE_ALIGN)));
/* Pressure over density squared. */
float *restrict pOrho2 __attribute__((aligned(CACHE_ALIGN)));
/* Balsara switch. */
float *restrict balsara __attribute__((aligned(CACHE_ALIGN)));
/* Particle sound speed. */
float *restrict soundspeed __attribute__((aligned(CACHE_ALIGN)));
......@@ -112,19 +112,19 @@ struct c2_cache {
/* z velocity of particle pj. */
float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Density of particle pj. */
float rhoq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Smoothing length gradient of particle pj. */
float grad_hq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Pressure over density squared of particle pj. */
float pOrho2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Balsara switch of particle pj. */
float balsaraq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Sound speed of particle pj. */
float soundspeedq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
......@@ -213,7 +213,7 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
ci_cache->vx[i] = ci->parts[i].v[0];
ci_cache->vy[i] = ci->parts[i].v[1];
ci_cache->vz[i] = ci->parts[i].v[2];
ci_cache->rho[i] = ci->parts[i].rho;
ci_cache->grad_h[i] = ci->parts[i].force.f;
ci_cache->pOrho2[i] = ci->parts[i].force.P_over_rho2;
......
......@@ -434,14 +434,20 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
/* Mask updates to intermediate vector sums for particle pi. */
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))), mask);
rho_dhSum->v = vec_mask_sub(
rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))),
mask);
wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
div_vSum->v =
vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
curlvxSum->v = vec_mask_add(curlvxSum->v,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvySum->v = vec_mask_add(curlvySum->v,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(curlvzSum->v,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
}
/**
......@@ -449,12 +455,14 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
* (non-symmetric vectorized version).
*/
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_2_vec_density(
float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
mask_t mask, mask_t mask2, short mask_cond) {
runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
vector hi_inv, vector vix, vector viy,
vector viz, float *Vjx, float *Vjy, float *Vjz,
float *Mj, vector *rhoSum, vector *rho_dhSum,
vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum,
vector *curlvySum, vector *curlvzSum,
mask_t mask, mask_t mask2, short mask_cond) {
vector r, ri, r2, xi, wi, wi_dx;
vector mj;
......@@ -534,35 +542,48 @@ runner_iact_nonsym_2_vec_density(
curlvrz.v = vec_mul(curlvrz.v, ri.v);
curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);
/* Mask updates to intermediate vector sums for particle pi. */
/* Mask updates to intermediate vector sums for particle pi. */
/* Mask only when needed. */
if(mask_cond) {
if (mask_cond) {
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))), mask);
rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(xi2.v, wi_dx2.v))), mask2);
rho_dhSum->v = vec_mask_sub(
rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))),
mask);
rho_dhSum->v = vec_mask_sub(
rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(xi2.v, wi_dx2.v))),
mask2);
wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2);
div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
}
else {
wcount_dhSum->v =
vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
wcount_dhSum->v =
vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2);
div_vSum->v = vec_mask_sub(div_vSum->v,
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
div_vSum->v = vec_mask_sub(
div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
curlvxSum->v = vec_mask_add(
curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvxSum->v = vec_mask_add(
curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
curlvySum->v = vec_mask_add(
curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvySum->v = vec_mask_add(
curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
curlvzSum->v = vec_mask_add(
curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(
curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
} else {
rhoSum->v += vec_mul(mj.v, wi.v);
rhoSum->v += vec_mul(mj2.v, wi2.v);
rho_dhSum->v -= vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v)));
rho_dhSum->v -= vec_mul(
mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(xi.v, wi_dx.v)));
rho_dhSum->v -= vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(xi2.v, wi_dx2.v)));
vec_mul(xi2.v, wi_dx2.v)));
wcountSum->v += wi.v;
wcountSum->v += wi2.v;
wcount_dhSum->v -= vec_mul(xi.v, wi_dx.v);
......@@ -1140,9 +1161,15 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
}
#ifdef WITH_VECTORIZATION
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force(
float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector *hi_inv, float *Hj_inv,
vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, mask_t mask) {
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_1_vec_force(
float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy,
vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2,
vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz,
float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj,
float *Mj, vector *hi_inv, float *Hj_inv, vector *a_hydro_xSum,
vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum,
vector *v_sigSum, vector *entropy_dtSum, mask_t mask) {
#ifdef WITH_VECTORIZATION
......@@ -1164,7 +1191,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force
dx.v = vec_load(Dx);
dy.v = vec_load(Dy);
dz.v = vec_load(Dz);
vjx.v = vec_load(Vjx);
vjy.v = vec_load(Vjy);
vjz.v = vec_load(Vjz);
......@@ -1179,7 +1206,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force
fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
/* Load stuff. */
/* Load stuff. */
balsara.v = balsara_i->v + balsara_j.v;
/* Get the radius and inverse radius. */
......@@ -1195,10 +1222,10 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force
/* Get the kernel for hj. */
hjd_inv = pow_dimension_plus_one_vec(hj_inv);
xj.v = r.v * hj_inv.v;
/* Calculate the kernel for two particles. */
kernel_eval_dWdx_force_vec(&xj, &wj_dx);
wj_dr.v = hjd_inv.v * wj_dx.v;
/* Compute dv dot r. */
......@@ -1223,7 +1250,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force
sph_term.v =
(grad_hi->v * piPOrho2->v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) *
ri.v;
/* Eventually get the acceleration */
acc.v = visc_term.v + sph_term.v;
......@@ -1255,9 +1282,16 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force
#endif
}
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force(
float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector *hi_inv, float *Hj_inv,
vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, mask_t mask, mask_t mask_2, short mask_cond) {
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_2_vec_force(
float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy,
vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2,
vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz,
float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj,
float *Mj, vector *hi_inv, float *Hj_inv, vector *a_hydro_xSum,
vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum,
vector *v_sigSum, vector *entropy_dtSum, mask_t mask, mask_t mask_2,
short mask_cond) {
#ifdef WITH_VECTORIZATION
......@@ -1292,7 +1326,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
dx.v = vec_load(Dx);
dy.v = vec_load(Dy);
dz.v = vec_load(Dz);
vjx.v = vec_load(Vjx);
vjy.v = vec_load(Vjy);
vjz.v = vec_load(Vjz);
......@@ -1311,7 +1345,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
dx_2.v = vec_load(&Dx[VEC_SIZE]);
dy_2.v = vec_load(&Dy[VEC_SIZE]);
dz_2.v = vec_load(&Dz[VEC_SIZE]);
vjx_2.v = vec_load(&Vjx[VEC_SIZE]);
vjy_2.v = vec_load(&Vjy[VEC_SIZE]);
vjz_2.v = vec_load(&Vjz[VEC_SIZE]);
......@@ -1324,7 +1358,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
cj_2.v = vec_load(&Cj[VEC_SIZE]);
hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]);
/* Load stuff. */
/* Load stuff. */
balsara.v = balsara_i->v + balsara_j.v;
balsara_2.v = balsara_i->v + balsara_j_2.v;
......@@ -1348,11 +1382,11 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
hjd_inv_2 = pow_dimension_plus_one_vec(hj_inv_2);
xj.v = r.v * hj_inv.v;
xj_2.v = r_2.v * hj_inv_2.v;
/* Calculate the kernel for two particles. */
kernel_eval_dWdx_force_vec(&xj, &wj_dx);
kernel_eval_dWdx_force_vec(&xj_2, &wj_dx_2);
wj_dr.v = hjd_inv.v * wj_dx.v;
wj_dr_2.v = hjd_inv_2.v * wj_dx_2.v;
......@@ -1360,13 +1394,13 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
dvdr.v = ((vix->v - vjx.v) * dx.v) + ((viy->v - vjy.v) * dy.v) +
((viz->v - vjz.v) * dz.v);
dvdr_2.v = ((vix->v - vjx_2.v) * dx_2.v) + ((viy->v - vjy_2.v) * dy_2.v) +
((viz->v - vjz_2.v) * dz_2.v);
((viz->v - vjz_2.v) * dz_2.v);
/* Compute the relative velocity. (This is 0 if the particles move away from
* each other and negative otherwise) */
omega_ij.v = vec_fmin(dvdr.v, vec_setzero());
omega_ij_2.v = vec_fmin(dvdr_2.v, vec_setzero());
mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */
mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */
mu_ij_2.v = fac_mu.v * ri_2.v * omega_ij_2.v; /* This is 0 or negative */
/* Compute signal velocity */
......@@ -1379,7 +1413,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v *
mu_ij.v * balsara.v / rho_ij.v;
visc_2.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig_2.v *
mu_ij_2.v * balsara_2.v / rho_ij_2.v;
mu_ij_2.v * balsara_2.v / rho_ij_2.v;
/* Now, convolve with the kernel */
visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
......@@ -1387,9 +1421,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
sph_term.v =
(grad_hi->v * piPOrho2->v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) *
ri.v;
sph_term_2.v =
(grad_hi->v * piPOrho2->v * wi_dr_2.v + grad_hj_2.v * pjPOrho2_2.v * wj_dr_2.v) *
ri_2.v;
sph_term_2.v = (grad_hi->v * piPOrho2->v * wi_dr_2.v +
grad_hj_2.v * pjPOrho2_2.v * wj_dr_2.v) *
ri_2.v;
/* Eventually get the acceleration */
acc.v = visc_term.v + sph_term.v;
......@@ -1412,7 +1446,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
entropy_dt_2.v = mj_2.v * visc_term_2.v * dvdr_2.v;
/* Store the forces back on the particles. */
if(mask_cond) {
if (mask_cond) {
a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask);
a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax_2.v, mask_2);
a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask);
......@@ -1425,8 +1459,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force
v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig_2, mask_2));
entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask);
entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt_2.v, mask_2);
}
else {
} else {
a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax.v);
a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax_2.v);
a_hydro_ySum->v = vec_sub(a_hydro_ySum->v, piay.v);
......
......@@ -436,7 +436,7 @@ static const vector cond = FILL_VEC(0.5f);
/**
* @brief Computes the kernel function and its derivative for two particles
* using vectors. Does not return zero if $u > \\gamma = H/h$, should only
* using vectors. Does not return zero if $u > \\gamma = H/h$, should only
* be called if particles are known to interact.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
......@@ -517,7 +517,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
/**
* @brief Computes the kernel function and its derivative for two particles
* using interleaved vectors. Does not return zero if $u > \\gamma = H/h$, should only
* using interleaved vectors. Does not return zero if $u > \\gamma = H/h$,
* should only
* be called if particles are known to interact.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
......@@ -582,9 +583,9 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
......@@ -648,7 +649,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
/**
* @brief Computes the kernel function for two particles
* using vectors. Does not return zero if $u > \\gamma = H/h$, should only
* using vectors. Does not return zero if $u > \\gamma = H/h$, should only
* be called if particles are known to interact.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
......@@ -709,7 +710,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
/**
* @brief Computes the kernel function derivative for two particles
* using vectors. Does not return zero if $u > \\gamma = H/h$, should only
* using vectors. Does not return zero if $u > \\gamma = H/h$, should only
* be called if particles are known to interact.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
......@@ -825,9 +826,9 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec(
/* Mask out result for particles that lie outside of the kernel function. */
vector mask;
mask.v = vec_cmp_lt(x.v,vec_set1(1.f));
mask.v = vec_cmp_lt(x.v, vec_set1(1.f));
dw_dx->v = vec_and(dw_dx->v,mask.v);
dw_dx->v = vec_and(dw_dx->v, mask.v);
/* Return everything */
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
......@@ -854,7 +855,8 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
#ifdef WENDLAND_C2_KERNEL
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
dw_dx_2->v = vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v);
dw_dx_2->v =
vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations */
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
......@@ -872,9 +874,9 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
vector mask_reg1_2, mask_reg2_2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg1_2.v = vec_cmp_lt(x_2.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
mask_reg2_2.v = vec_cmp_gte(x_2.v, cond.v); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
......@@ -887,7 +889,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
dw_dx2_2.v = vec_fma(cubic_2_dwdx_const_c0.v, x_2.v, cubic_2_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v);
......@@ -907,17 +909,18 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
/* Mask out result for particles that lie outside of the kernel function. */
vector mask, mask_2;
mask.v = vec_cmp_lt(x.v,vec_set1(1.f));
mask_2.v = vec_cmp_lt(x_2.v,vec_set1(1.f));
mask.v = vec_cmp_lt(x.v, vec_set1(1.f));
mask_2.v = vec_cmp_lt(x_2.v, vec_set1(1.f));
dw_dx->v = vec_and(dw_dx->v,mask.v);
dw_dx_2->v = vec_and(dw_dx_2->v,mask_2.v);
dw_dx->v = vec_and(dw_dx->v, mask.v);
dw_dx_2->v = vec_and(dw_dx_2->v, mask_2.v);
/* Return everything */
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx_2->v = vec_mul(dw_dx_2->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx_2->v = vec_mul(
dw_dx_2->v,
vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
}
#endif /* WITH_VECTORIZATION */
......
......@@ -1793,8 +1793,7 @@ void *runner_main(void *data) {
#else
runner_doself2_force(r, ci);
#endif
}
else if (t->subtype == task_subtype_grav)
} else if (t->subtype == task_subtype_grav)
runner_doself_grav(r, ci, 1);
else if (t->subtype == task_subtype_external_grav)
runner_do_grav_external(r, ci, 1);
......
This diff is collapsed.
......@@ -60,9 +60,9 @@
#define vec_dbl_set(a, b, c, d, e, f, g, h) \
_mm512_set_pd(h, g, f, e, d, c, b, a)
#define vec_add(a, b) _mm512_add_ps(a, b)
#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a)
#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a)
#define vec_sub(a, b) _mm512_sub_ps(a, b)
#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b)
#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b)
#define vec_mul(a, b) _mm512_mul_ps(a, b)
#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
#define vec_sqrt(a) _mm512_sqrt_ps(a)
......@@ -80,12 +80,12 @@
#define vec_cmp_result(a) a
#define vec_form_int_mask(a) a
#define vec_and(a, b) _mm512_and_ps(a, b)
#define vec_mask_and(a, b) a & b
#define vec_mask_and(a, b) a &b
#define vec_and_mask(a, mask) _mm512_maskz_expand_ps(mask, a)
#define vec_init_mask(mask) mask = 0xFFFF
#define vec_zero_mask(mask) mask = 0
#define vec_create_mask(mask, cond) mask = cond
#define vec_pad_mask(mask,pad) mask = mask >> (pad)
#define vec_pad_mask(mask, pad) mask = mask >> (pad)
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
......@@ -127,7 +127,7 @@
}
#endif
/* Do nothing in the case of AVX-512 as there are already
/* Do nothing in the case of AVX-512 as there are already
* instructions for left-packing.*/
#define VEC_FORM_PACKED_MASK(mask, packed_mask)
......@@ -155,9 +155,9 @@
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
#define vec_add(a, b) _mm256_add_ps(a, b)
#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b,mask.v))
#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b, mask.v))
#define vec_sub(a, b) _mm256_sub_ps(a, b)
#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b,mask.v))
#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask.v))
#define vec_mul(a, b) _mm256_mul_ps(a, b)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
......@@ -172,14 +172,15 @@
#define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#define vec_cmp_result(a) _mm256_movemask_ps(a)
#define vec_form_int_mask(a) _mm256_movemask_ps(a.v)
#define vec_form_int_mask(a) _mm256_movemask_ps(a.v)
#define vec_and(a, b) _mm256_and_ps(a, b)
#define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v)
#define vec_and_mask(a, mask) vec_mask_and(a, mask)
#define vec_init_mask(mask) mask.m = vec_setint1(0xFFFFFFFF)
#define vec_create_mask(mask, cond) mask.v = cond
#define vec_zero_mask(mask) mask.v = vec_setzero()
#define vec_pad_mask(mask,pad) for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0
#define vec_pad_mask(mask, pad) \
for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
......@@ -205,12 +206,12 @@
a.v = _mm256_hadd_ps(a.v, a.v); \
b += a.f[0] + a.f[4];
/* Performs a horizontal maximum on the vector and takes the maximum of the result with a float, b. */
#define VEC_HMAX(a, b) \
{ \
for(int k=0; k<VEC_SIZE; k++) \
b = max(b, a.f[k]); \
}
/* Performs a horizontal maximum on the vector and takes the maximum of the
* result with a float, b. */
#define VEC_HMAX(a, b) \
{ \
for (int k = 0; k < VEC_SIZE; k++) b = max(b, a.f[k]); \
}
/* Returns the lower 128-bits of the 256-bit vector. */
#define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
......
......@@ -690,7 +690,7 @@ int main(int argc, char *argv[]) {
struct cell *cj = cells[i * 25 + j * 5 + k];
if (main_cell != cj) {
const ticks sub_tic = getticks();
runner_dopair2_force(&runner, main_cell, cj);
......@@ -703,7 +703,7 @@ int main(int argc, char *argv[]) {
}
#ifdef WITH_VECTORIZATION
/* Initialise the cache. */
/* Initialise the cache. */
runner.ci_cache.count = 0;
cache_init(&runner.ci_cache, 512);
#endif
......@@ -712,7 +712,7 @@ int main(int argc, char *argv[]) {
/* And now the self-interaction for the main cell */
DOSELF2(&runner, main_cell);
timings[26] += getticks() - self_tic;
#endif
......@@ -729,13 +729,12 @@ int main(int argc, char *argv[]) {
}
}
for (size_t n = 0; n < 100*runs; ++n) {
for (size_t n = 0; n < 100 * runs; ++n) {
ticks self_tic = getticks();
DOSELF2(&runner, main_cell);
self_force_time += getticks() - self_tic;
}
/* Output timing */
......@@ -752,7 +751,8 @@ int main(int argc, char *argv[]) {
message("Corner calculations took : %15lli ticks.", corner_time / runs);