Commit 1da52a3a authored by Matthieu Schaller's avatar Matthieu Schaller
Browse files

Merge branch 'doself2-vectorisation' into 'master'

Doself2 vectorisation

Implements:
* `runner_doself2_force_vec` a vectorised version of the `DOSELF2` for force interactions.
* Updates particle cache with properties needed for force interactions.
* Vectorised interaction functions for the force using 1 and 2 vectors, which are tested in `testInteractions.c`

See merge request !406
parents be949fab 3d0eb1c0
......@@ -59,6 +59,11 @@ tests/brute_force_periodic_BC_perturbed.dat
tests/swift_dopair_active.dat
tests/test_nonsym_density_serial.dat
tests/test_nonsym_density_vec.dat
tests/test_nonsym_force_serial.dat
tests/test_nonsym_density_1_vec.dat
tests/test_nonsym_density_2_vec.dat
tests/test_nonsym_force_1_vec.dat
tests/test_nonsym_force_2_vec.dat
tests/testGreetings
tests/testReading
tests/input.hdf5
......
......@@ -144,6 +144,14 @@ __attribute__((always_inline)) INLINE static int part_is_active(
return (part_bin <= max_active_bin);
}
__attribute__((always_inline)) INLINE static int part_is_active_no_debug(
const struct part *p, const timebin_t max_active_bin) {
const timebin_t part_bin = p->time_bin;
return (part_bin <= max_active_bin);
}
/**
* @brief Is this g-particle finishing its time-step now ?
*
......
......@@ -65,6 +65,21 @@ struct cache {
/* Maximum index into neighbouring cell for particles that are in range. */
int *restrict max_index SWIFT_CACHE_ALIGN;
/* Particle density. */
float *restrict rho SWIFT_CACHE_ALIGN;
/* Particle smoothing length gradient. */
float *restrict grad_h SWIFT_CACHE_ALIGN;
/* Pressure over density squared. */
float *restrict pOrho2 SWIFT_CACHE_ALIGN;
/* Balsara switch. */
float *restrict balsara SWIFT_CACHE_ALIGN;
/* Particle sound speed. */
float *restrict soundspeed SWIFT_CACHE_ALIGN;
/* Cache size. */
int count;
};
......@@ -127,6 +142,11 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
free(c->vz);
free(c->h);
free(c->max_index);
free(c->rho);
free(c->grad_h);
free(c->pOrho2);
free(c->balsara);
free(c->soundspeed);
}
error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
......@@ -139,6 +159,15 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT,
sizeIntBytes);
error += posix_memalign((void **)&c->rho, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error +=
posix_memalign((void **)&c->grad_h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error +=
posix_memalign((void **)&c->pOrho2, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error +=
posix_memalign((void **)&c->balsara, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error +=
posix_memalign((void **)&c->soundspeed, SWIFT_CACHE_ALIGNMENT, sizeBytes);
if (error != 0)
error("Couldn't allocate cache, no. of particles: %d", (int)count);
......@@ -191,6 +220,68 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
#endif
}
/**
* @brief Populate cache for force interactions by reading in the particles in
* unsorted order.
*
* @param ci The #cell.
* @param ci_cache The cache.
*/
__attribute__((always_inline)) INLINE void cache_read_force_particles(
const struct cell *restrict const ci,
struct cache *restrict const ci_cache) {
#if defined(GADGET2_SPH)
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, balsara, ci_cache->balsara,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
SWIFT_CACHE_ALIGNMENT);
const struct part *restrict parts = ci->parts;
double loc[3];
loc[0] = ci->loc[0];
loc[1] = ci->loc[1];
loc[2] = ci->loc[2];
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
for (int i = 0; i < ci->count; i++) {
x[i] = (float)(parts[i].x[0] - loc[0]);
y[i] = (float)(parts[i].x[1] - loc[1]);
z[i] = (float)(parts[i].x[2] - loc[2]);
h[i] = parts[i].h;
m[i] = parts[i].mass;
vx[i] = parts[i].v[0];
vy[i] = parts[i].v[1];
vz[i] = parts[i].v[2];
rho[i] = parts[i].rho;
grad_h[i] = parts[i].force.f;
pOrho2[i] = parts[i].force.P_over_rho2;
balsara[i] = parts[i].force.balsara;
soundspeed[i] = parts[i].force.soundspeed;
}
#endif
}
/**
* @brief Populate caches by only reading particles that are within range of
* each other within the adjoining cell.Also read the particles into the cache
......
......@@ -1167,4 +1167,365 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
#endif
}
#ifdef WITH_VECTORIZATION
static const vector const_viscosity_alpha_fac =
FILL_VEC(-0.25f * const_viscosity_alpha);
/**
* @brief Force interaction computed using 1 vector
* (non-symmetric vectorized version).
*/
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_1_vec_force(
vector *r2, vector *dx, vector *dy, vector *dz, vector vix, vector viy,
vector viz, vector pirho, vector grad_hi, vector piPOrho2, vector balsara_i,
vector ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj,
float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector hi_inv,
vector hj_inv, vector *a_hydro_xSum, vector *a_hydro_ySum,
vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum,
vector *entropy_dtSum, mask_t mask) {
#ifdef WITH_VECTORIZATION
vector r, ri;
vector vjx, vjy, vjz, dvx, dvy, dvz;
vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj;
vector xi, xj;
vector hid_inv, hjd_inv;
vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
vector piax, piay, piaz;
vector pih_dt;
vector v_sig;
vector omega_ij, mu_ij, fac_mu, balsara;
vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
/* Fill vectors. */
vjx.v = vec_load(Vjx);
vjy.v = vec_load(Vjy);
vjz.v = vec_load(Vjz);
mj.v = vec_load(Mj);
pjrho.v = vec_load(Pjrho);
grad_hj.v = vec_load(Grad_hj);
pjPOrho2.v = vec_load(PjPOrho2);
balsara_j.v = vec_load(Balsara_j);
cj.v = vec_load(Cj);
fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
/* Load stuff. */
balsara.v = vec_add(balsara_i.v, balsara_j.v);
/* Get the radius and inverse radius. */
ri = vec_reciprocal_sqrt(*r2);
r.v = vec_mul(r2->v, ri.v);
/* Get the kernel for hi. */
hid_inv = pow_dimension_plus_one_vec(hi_inv);
xi.v = vec_mul(r.v, hi_inv.v);
kernel_eval_dWdx_force_vec(&xi, &wi_dx);
wi_dr.v = vec_mul(hid_inv.v, wi_dx.v);
/* Get the kernel for hj. */
hjd_inv = pow_dimension_plus_one_vec(hj_inv);
xj.v = vec_mul(r.v, hj_inv.v);
/* Calculate the kernel for two particles. */
kernel_eval_dWdx_force_vec(&xj, &wj_dx);
wj_dr.v = vec_mul(hjd_inv.v, wj_dx.v);
/* Compute dv. */
dvx.v = vec_sub(vix.v, vjx.v);
dvy.v = vec_sub(viy.v, vjy.v);
dvz.v = vec_sub(viz.v, vjz.v);
/* Compute dv dot r. */
dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v)));
/* Compute the relative velocity. (This is 0 if the particles move away from
* each other and negative otherwise) */
omega_ij.v = vec_fmin(dvdr.v, vec_setzero());
mu_ij.v =
vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v)); /* This is 0 or negative */
/* Compute signal velocity */
v_sig.v = vec_fnma(vec_set1(3.f), mu_ij.v, vec_add(ci.v, cj.v));
/* Now construct the full viscosity term */
rho_ij.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho.v));
visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v,
vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))),
rho_ij.v);
/* Now, convolve with the kernel */
visc_term.v =
vec_mul(vec_set1(0.5f),
vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v)));
sph_term.v =
vec_mul(vec_fma(vec_mul(grad_hi.v, piPOrho2.v), wi_dr.v,
vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))),
ri.v);
/* Eventually get the acceleration */
acc.v = vec_add(visc_term.v, sph_term.v);
/* Use the force, Luke! */
piax.v = vec_mul(mj.v, vec_mul(dx->v, acc.v));
piay.v = vec_mul(mj.v, vec_mul(dy->v, acc.v));
piaz.v = vec_mul(mj.v, vec_mul(dz->v, acc.v));
/* Get the time derivative for h. */
pih_dt.v =
vec_div(vec_mul(mj.v, vec_mul(dvdr.v, vec_mul(ri.v, wi_dr.v))), pjrho.v);
/* Change in entropy */
entropy_dt.v = vec_mul(mj.v, vec_mul(visc_term.v, dvdr.v));
/* Store the forces back on the particles. */
a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask);
a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask);
a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz.v, mask);
h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt.v, mask);
v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig.v, mask));
entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask);
#else
error(
"The Gadget2 serial version of runner_iact_nonsym_force was called when "
"the vectorised version should have been used.");
#endif
}
/**
* @brief Force interaction computed using 2 interleaved vectors
* (non-symmetric vectorized version).
*/
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_2_vec_force(
float *R2, float *Dx, float *Dy, float *Dz, vector vix, vector viy,
vector viz, vector pirho, vector grad_hi, vector piPOrho2, vector balsara_i,
vector ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj,
float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector hi_inv,
float *Hj_inv, vector *a_hydro_xSum, vector *a_hydro_ySum,
vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum,
vector *entropy_dtSum, mask_t mask, mask_t mask_2, short mask_cond) {
#ifdef WITH_VECTORIZATION
vector r, r2, ri;
vector dx, dy, dz, dvx, dvy, dvz;
vector vjx, vjy, vjz;
vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv;
vector ui, uj;
vector hid_inv, hjd_inv;
vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
vector piax, piay, piaz;
vector pih_dt;
vector v_sig;
vector omega_ij, mu_ij, fac_mu, balsara;
vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
vector r_2, r2_2, ri_2;
vector dx_2, dy_2, dz_2, dvx_2, dvy_2, dvz_2;
vector vjx_2, vjy_2, vjz_2;
vector pjrho_2, grad_hj_2, pjPOrho2_2, balsara_j_2, cj_2, mj_2, hj_inv_2;
vector ui_2, uj_2;
vector hjd_inv_2;
vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2;
vector piax_2, piay_2, piaz_2;
vector pih_dt_2;
vector v_sig_2;
vector omega_ij_2, mu_ij_2, balsara_2;
vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2;
/* Fill vectors. */
mj.v = vec_load(Mj);
mj_2.v = vec_load(&Mj[VEC_SIZE]);
vjx.v = vec_load(Vjx);
vjx_2.v = vec_load(&Vjx[VEC_SIZE]);
vjy.v = vec_load(Vjy);
vjy_2.v = vec_load(&Vjy[VEC_SIZE]);
vjz.v = vec_load(Vjz);
vjz_2.v = vec_load(&Vjz[VEC_SIZE]);
dx.v = vec_load(Dx);
dx_2.v = vec_load(&Dx[VEC_SIZE]);
dy.v = vec_load(Dy);
dy_2.v = vec_load(&Dy[VEC_SIZE]);
dz.v = vec_load(Dz);
dz_2.v = vec_load(&Dz[VEC_SIZE]);
/* Get the radius and inverse radius. */
r2.v = vec_load(R2);
r2_2.v = vec_load(&R2[VEC_SIZE]);
ri = vec_reciprocal_sqrt(r2);
ri_2 = vec_reciprocal_sqrt(r2_2);
r.v = vec_mul(r2.v, ri.v);
r_2.v = vec_mul(r2_2.v, ri_2.v);
/* Get remaining properties. */
pjrho.v = vec_load(Pjrho);
pjrho_2.v = vec_load(&Pjrho[VEC_SIZE]);
grad_hj.v = vec_load(Grad_hj);
grad_hj_2.v = vec_load(&Grad_hj[VEC_SIZE]);
pjPOrho2.v = vec_load(PjPOrho2);
pjPOrho2_2.v = vec_load(&PjPOrho2[VEC_SIZE]);
balsara_j.v = vec_load(Balsara_j);
balsara_j_2.v = vec_load(&Balsara_j[VEC_SIZE]);
cj.v = vec_load(Cj);
cj_2.v = vec_load(&Cj[VEC_SIZE]);
hj_inv.v = vec_load(Hj_inv);
hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]);
fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
/* Find the balsara switch. */
balsara.v = vec_add(balsara_i.v, balsara_j.v);
balsara_2.v = vec_add(balsara_i.v, balsara_j_2.v);
/* Get the kernel for hi. */
hid_inv = pow_dimension_plus_one_vec(hi_inv);
ui.v = vec_mul(r.v, hi_inv.v);
ui_2.v = vec_mul(r_2.v, hi_inv.v);
kernel_eval_dWdx_force_vec(&ui, &wi_dx);
kernel_eval_dWdx_force_vec(&ui_2, &wi_dx_2);
wi_dr.v = vec_mul(hid_inv.v, wi_dx.v);
wi_dr_2.v = vec_mul(hid_inv.v, wi_dx_2.v);
/* Get the kernel for hj. */
hjd_inv = pow_dimension_plus_one_vec(hj_inv);
hjd_inv_2 = pow_dimension_plus_one_vec(hj_inv_2);
uj.v = vec_mul(r.v, hj_inv.v);
uj_2.v = vec_mul(r_2.v, hj_inv_2.v);
/* Calculate the kernel for two particles. */
kernel_eval_dWdx_force_vec(&uj, &wj_dx);
kernel_eval_dWdx_force_vec(&uj_2, &wj_dx_2);
wj_dr.v = vec_mul(hjd_inv.v, wj_dx.v);
wj_dr_2.v = vec_mul(hjd_inv_2.v, wj_dx_2.v);
/* Compute dv. */
dvx.v = vec_sub(vix.v, vjx.v);
dvx_2.v = vec_sub(vix.v, vjx_2.v);
dvy.v = vec_sub(viy.v, vjy.v);
dvy_2.v = vec_sub(viy.v, vjy_2.v);
dvz.v = vec_sub(viz.v, vjz.v);
dvz_2.v = vec_sub(viz.v, vjz_2.v);
/* Compute dv dot r. */
dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v)));
dvdr_2.v = vec_fma(dvx_2.v, dx_2.v,
vec_fma(dvy_2.v, dy_2.v, vec_mul(dvz_2.v, dz_2.v)));
/* Compute the relative velocity. (This is 0 if the particles move away from
* each other and negative otherwise) */
omega_ij.v = vec_fmin(dvdr.v, vec_setzero());
omega_ij_2.v = vec_fmin(dvdr_2.v, vec_setzero());
mu_ij.v =
vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v)); /* This is 0 or negative */
mu_ij_2.v = vec_mul(
fac_mu.v, vec_mul(ri_2.v, omega_ij_2.v)); /* This is 0 or negative */
/* Compute signal velocity */
v_sig.v = vec_fnma(vec_set1(3.f), mu_ij.v, vec_add(ci.v, cj.v));
v_sig_2.v = vec_fnma(vec_set1(3.f), mu_ij_2.v, vec_add(ci.v, cj_2.v));
/* Now construct the full viscosity term */
rho_ij.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho.v));
rho_ij_2.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho_2.v));
visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v,
vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))),
rho_ij.v);
visc_2.v =
vec_div(vec_mul(const_viscosity_alpha_fac.v,
vec_mul(v_sig_2.v, vec_mul(mu_ij_2.v, balsara_2.v))),
rho_ij_2.v);
/* Now, convolve with the kernel */
visc_term.v =
vec_mul(vec_set1(0.5f),
vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v)));
visc_term_2.v = vec_mul(
vec_set1(0.5f),
vec_mul(visc_2.v, vec_mul(vec_add(wi_dr_2.v, wj_dr_2.v), ri_2.v)));
vector grad_hi_mul_piPOrho2;
grad_hi_mul_piPOrho2.v = vec_mul(grad_hi.v, piPOrho2.v);
sph_term.v =
vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr.v,
vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))),
ri.v);
sph_term_2.v =
vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr_2.v,
vec_mul(grad_hj_2.v, vec_mul(pjPOrho2_2.v, wj_dr_2.v))),
ri_2.v);
/* Eventually get the acceleration */
acc.v = vec_add(visc_term.v, sph_term.v);
acc_2.v = vec_add(visc_term_2.v, sph_term_2.v);
/* Use the force, Luke! */
piax.v = vec_mul(mj.v, vec_mul(dx.v, acc.v));
piax_2.v = vec_mul(mj_2.v, vec_mul(dx_2.v, acc_2.v));
piay.v = vec_mul(mj.v, vec_mul(dy.v, acc.v));
piay_2.v = vec_mul(mj_2.v, vec_mul(dy_2.v, acc_2.v));
piaz.v = vec_mul(mj.v, vec_mul(dz.v, acc.v));
piaz_2.v = vec_mul(mj_2.v, vec_mul(dz_2.v, acc_2.v));
/* Get the time derivative for h. */
pih_dt.v =
vec_div(vec_mul(mj.v, vec_mul(dvdr.v, vec_mul(ri.v, wi_dr.v))), pjrho.v);
pih_dt_2.v =
vec_div(vec_mul(mj_2.v, vec_mul(dvdr_2.v, vec_mul(ri_2.v, wi_dr_2.v))),
pjrho_2.v);
/* Change in entropy */
entropy_dt.v = vec_mul(mj.v, vec_mul(visc_term.v, dvdr.v));
entropy_dt_2.v = vec_mul(mj_2.v, vec_mul(visc_term_2.v, dvdr_2.v));
/* Store the forces back on the particles. */
if (mask_cond) {
a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask);
a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax_2.v, mask_2);
a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask);
a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay_2.v, mask_2);
a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz.v, mask);
a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz_2.v, mask_2);
h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt.v, mask);
h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt_2.v, mask_2);
v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig.v, mask));
v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig_2.v, mask_2));
entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask);
entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt_2.v, mask_2);
} else {
a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax.v);
a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax_2.v);
a_hydro_ySum->v = vec_sub(a_hydro_ySum->v, piay.v);
a_hydro_ySum->v = vec_sub(a_hydro_ySum->v, piay_2.v);
a_hydro_zSum->v = vec_sub(a_hydro_zSum->v, piaz.v);
a_hydro_zSum->v = vec_sub(a_hydro_zSum->v, piaz_2.v);
h_dtSum->v = vec_sub(h_dtSum->v, pih_dt.v);
h_dtSum->v = vec_sub(h_dtSum->v, pih_dt_2.v);
v_sigSum->v = vec_fmax(v_sigSum->v, v_sig.v);
v_sigSum->v = vec_fmax(v_sigSum->v, v_sig_2.v);
entropy_dtSum->v = vec_add(entropy_dtSum->v, entropy_dt.v);
entropy_dtSum->v = vec_add(entropy_dtSum->v, entropy_dt_2.v);
}
#else
error(
"The Gadget2 serial version of runner_iact_nonsym_force was called when "
"the vectorised version should have been used.");
#endif
}
#endif
#endif /* SWIFT_GADGET2_HYDRO_IACT_H */
......@@ -438,9 +438,7 @@ static const vector cond = FILL_VEC(0.5f);
/**
* @brief Computes the kernel function and its derivative for two particles
* using vectors.
*
* Return 0 if $u > \\gamma = H/h$
* using vectors. The return value is undefined if $u > \\gamma = H/h$.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
......@@ -518,9 +516,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
/**
* @brief Computes the kernel function and its derivative for two particles
* using interleaved vectors.
*
* Return 0 if $u > \\gamma = H/h$
* using interleaved vectors. The return value is undefined if $u > \\gamma =
* H/h$.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
......@@ -644,9 +641,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
/**
* @brief Computes the kernel function for two particles
* using vectors.
*
* Return 0 if $u > \\gamma = H/h$
* using vectors. The return value is undefined if $u > \\gamma = H/h$.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
......@@ -704,6 +699,65 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
}
/**
* @brief Computes the kernel function derivative for two particles
* using vectors. The return value is undefined if $u > \\gamma = H/h$.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
vector *u, vector *dw_dx) {
/* Go to the range [0,1[ from [0,H[ */
vector x;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
#ifdef WENDLAND_C2_KERNEL
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations */
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
dw_dx->v = vec_mul(dw_dx->v, x.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector dw_dx2;
mask_t mask_reg1, mask_reg2;
/* Form a mask for each part of the kernel. */
vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */
vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);