Commit d3dd5d55 authored by Matthieu Schaller's avatar Matthieu Schaller
Browse files

Merge branch 'master' into test_125

parents 133171b2 62e0011a
......@@ -269,12 +269,6 @@ int main(int argc, char *argv[]) {
/* Temporary abort to handle absence of vectorized functions */
#ifdef WITH_VECTORIZATION
#ifdef GADGET2_SPH
error(
"Vectorized version of Gadget SPH routines not implemented yet. "
"Reconfigure with --disable-vec and recompile or use DEFAULT_SPH.");
#endif
#ifdef MINIMAL_SPH
error(
"Vectorized version of Minimal SPH routines not implemented yet. "
......
......@@ -147,12 +147,11 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force(
/* Compute this particle's sound speed. */
const float u = p->u;
const float fc = p->force.soundspeed =
sqrtf(hydro_gamma * hydro_gamma_minus_one * u);
const float fc = p->force.soundspeed = sqrtf(hydro_gamma * hydro_gamma_minus_one * u);
/* Compute the P/Omega/rho2. */
xp->omega = 1.0f + 0.3333333333f * h * p->rho_dh / p->rho;
p->force.POrho2 = u * hydro_gamma_minus_one / (p->rho * xp->omega);
p->force.P_over_rho2 = u * hydro_gamma_minus_one / (p->rho * xp->omega);
/* Balsara switch */
p->force.balsara = normDiv_v / (normDiv_v + normRot_v + 0.0001f * fc * ih);
......@@ -217,7 +216,7 @@ __attribute__((always_inline)) INLINE static void hydro_predict_extra(
u = p->u *= expf(w);
/* Predict gradient term */
p->force.POrho2 = u * hydro_gamma_minus_one / (p->rho * xp->omega);
p->force.P_over_rho2 = u * hydro_gamma_minus_one / (p->rho * xp->omega);
}
/**
......
......@@ -371,8 +371,8 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
mj = pj->mass;
rhoi = pi->rho;
rhoj = pj->rho;
POrho2i = pi->force.POrho2;
POrho2j = pj->force.POrho2;
POrho2i = pi->force.P_over_rho2;
POrho2j = pj->force.P_over_rho2;
/* Get the kernel for hi. */
hi_inv = 1.0f / hi;
......@@ -480,13 +480,13 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
piPOrho2.v =
vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, pi[2]->force.POrho2,
pi[3]->force.POrho2, pi[4]->force.POrho2, pi[5]->force.POrho2,
pi[6]->force.POrho2, pi[7]->force.POrho2);
vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, pi[2]->force.P_over_rho2,
pi[3]->force.P_over_rho2, pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
pjPOrho2.v =
vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, pj[2]->force.POrho2,
pj[3]->force.POrho2, pj[4]->force.POrho2, pj[5]->force.POrho2,
pj[6]->force.POrho2, pj[7]->force.POrho2);
vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, pj[2]->force.P_over_rho2,
pj[3]->force.P_over_rho2, pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
pi[5]->rho, pi[6]->rho, pi[7]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
......@@ -495,14 +495,12 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
pi[6]->u, pi[7]->u);
pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u, pj[4]->u, pj[5]->u,
pj[6]->u, pj[7]->u);
ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
pi[2]->force.soundspeed, pi[3]->force.soundspeed,
pi[4]->force.soundspeed, pi[5]->force.soundspeed,
pi[6]->force.soundspeed, pi[7]->force.soundspeed);
cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
pj[2]->force.soundspeed, pj[3]->force.soundspeed,
pj[4]->force.soundspeed, pj[5]->force.soundspeed,
pj[6]->force.soundspeed, pj[7]->force.soundspeed);
ci.v =
vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed,
pi[4]->force.soundspeed, pi[5]->force.soundspeed, pi[6]->force.soundspeed, pi[7]->force.soundspeed);
cj.v =
vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed,
pj[4]->force.soundspeed, pj[5]->force.soundspeed, pj[6]->force.soundspeed, pj[7]->force.soundspeed);
vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
pi[3]->force.v_sig, pi[4]->force.v_sig, pi[5]->force.v_sig,
pi[6]->force.v_sig, pi[7]->force.v_sig);
......@@ -532,18 +530,18 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
#elif VEC_SIZE == 4
mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
piPOrho2.v = vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2,
pi[2]->force.POrho2, pi[3]->force.POrho2);
pjPOrho2.v = vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2,
pj[2]->force.POrho2, pj[3]->force.POrho2);
piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
piu.v = vec_set(pi[0]->u, pi[1]->u, pi[2]->u, pi[3]->u);
pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u);
ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
pi[2]->force.soundspeed, pi[3]->force.soundspeed);
cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
pj[2]->force.soundspeed, pj[3]->force.soundspeed);
ci.v =
vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed);
cj.v =
vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed);
vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
pi[3]->force.v_sig);
vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig,
......@@ -639,18 +637,14 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
piu_dt.v += mj.v * tc.v * (piu.v - pju.v);
pju_dt.v += mi.v * tc.v * (pju.v - piu.v);
/* compute the signal velocity (this is always symmetrical). */
vi_sig.v = vec_fmax(vi_sig.v, v_sig.v);
vj_sig.v = vec_fmax(vj_sig.v, v_sig.v);
/* Store the forces back on the particles. */
for (k = 0; k < VEC_SIZE; k++) {
pi[k]->force.u_dt += piu_dt.f[k];
pj[k]->force.u_dt += pju_dt.f[k];
pi[k]->force.h_dt -= pih_dt.f[k];
pj[k]->force.h_dt -= pjh_dt.f[k];
pi[k]->force.v_sig = vi_sig.f[k];
pj[k]->force.v_sig = vj_sig.f[k];
pi[k]->force.v_sig = fmaxf(pi[k]->force.v_sig, v_sig.f[k]);
pj[k]->force.v_sig = fmaxf(pj[k]->force.v_sig, v_sig.f[k]);
for (j = 0; j < 3; j++) {
pi[k]->a_hydro[j] -= pia[j].f[k];
pj[k]->a_hydro[j] += pja[j].f[k];
......@@ -686,8 +680,8 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
mj = pj->mass;
rhoi = pi->rho;
rhoj = pj->rho;
POrho2i = pi->force.POrho2;
POrho2j = pj->force.POrho2;
POrho2i = pi->force.P_over_rho2;
POrho2j = pj->force.P_over_rho2;
/* Get the kernel for hi. */
hi_inv = 1.0f / hi;
......@@ -787,13 +781,13 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
piPOrho2.v =
vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, pi[2]->force.POrho2,
pi[3]->force.POrho2, pi[4]->force.POrho2, pi[5]->force.POrho2,
pi[6]->force.POrho2, pi[7]->force.POrho2);
vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, pi[2]->force.P_over_rho2,
pi[3]->force.P_over_rho2, pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
pjPOrho2.v =
vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, pj[2]->force.POrho2,
pj[3]->force.POrho2, pj[4]->force.POrho2, pj[5]->force.POrho2,
pj[6]->force.POrho2, pj[7]->force.POrho2);
vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, pj[2]->force.P_over_rho2,
pj[3]->force.P_over_rho2, pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
pi[5]->rho, pi[6]->rho, pi[7]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
......@@ -802,14 +796,12 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
pi[6]->u, pi[7]->u);
pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u, pj[4]->u, pj[5]->u,
pj[6]->u, pj[7]->u);
ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
pi[2]->force.soundspeed, pi[3]->force.soundspeed,
pi[4]->force.soundspeed, pi[5]->force.soundspeed,
pi[6]->force.soundspeed, pi[7]->force.soundspeed);
cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
pj[2]->force.soundspeed, pj[3]->force.soundspeed,
pj[4]->force.soundspeed, pj[5]->force.soundspeed,
pj[6]->force.soundspeed, pj[7]->force.soundspeed);
ci.v =
vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed,
pi[4]->force.soundspeed, pi[5]->force.soundspeed, pi[6]->force.soundspeed, pi[7]->force.soundspeed);
cj.v =
vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed,
pj[4]->force.soundspeed, pj[5]->force.soundspeed, pj[6]->force.soundspeed, pj[7]->force.soundspeed);
vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
pi[3]->force.v_sig, pi[4]->force.v_sig, pi[5]->force.v_sig,
pi[6]->force.v_sig, pi[7]->force.v_sig);
......@@ -838,18 +830,18 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
pj[4]->alpha, pj[5]->alpha, pj[6]->alpha, pj[7]->alpha);
#elif VEC_SIZE == 4
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
piPOrho2.v = vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2,
pi[2]->force.POrho2, pi[3]->force.POrho2);
pjPOrho2.v = vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2,
pj[2]->force.POrho2, pj[3]->force.POrho2);
piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
piu.v = vec_set(pi[0]->u, pi[1]->u, pi[2]->u, pi[3]->u);
pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u);
ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
pi[2]->force.soundspeed, pi[3]->force.soundspeed);
cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
pj[2]->force.soundspeed, pj[3]->force.soundspeed);
ci.v =
vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed);
cj.v =
vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed);
vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig,
pi[3]->force.v_sig);
vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig,
......@@ -940,15 +932,11 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
/* Add the thermal conductivity */
piu_dt.v += mj.v * tc.v * (piu.v - pju.v);
/* compute the signal velocity (this is always symmetrical). */
vi_sig.v = vec_fmax(vi_sig.v, v_sig.v);
vj_sig.v = vec_fmax(vj_sig.v, v_sig.v);
/* Store the forces back on the particles. */
for (k = 0; k < VEC_SIZE; k++) {
pi[k]->force.u_dt += piu_dt.f[k];
pi[k]->force.h_dt -= pih_dt.f[k];
pi[k]->force.v_sig = vi_sig.f[k];
pi[k]->force.v_sig = fmaxf(pi[k]->force.v_sig,v_sig.f[k]);
for (j = 0; j < 3; j++) pi[k]->a_hydro[j] -= pia[j].f[k];
}
......
......@@ -90,7 +90,7 @@ struct part {
float balsara;
/* Aggregate quantities. */
float POrho2;
float P_over_rho2;
/* Change in particle energy over time. */
float u_dt;
......
......@@ -251,8 +251,8 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra(
p->entropy *= 0.5f;
/* Do not 'overcool' when timestep increases */
if (p->entropy + 0.5f * p->entropy_dt * dt < 0.5f * p->entropy)
p->entropy_dt = -0.5f * p->entropy / dt;
if (p->entropy + p->entropy_dt * half_dt < 0.5f * p->entropy)
p->entropy_dt = -0.5f * p->entropy / half_dt;
}
/**
......
......@@ -479,9 +479,177 @@ __attribute__((always_inline)) INLINE static void runner_iact_force(
__attribute__((always_inline)) INLINE static void runner_iact_vec_force(
float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
struct part **pj) {
error(
"A vectorised version of the Gadget2 force interaction function does not "
"exist yet!");
#ifdef WITH_VECTORIZATION
vector r, r2, ri;
vector xi, xj;
vector hi, hj, hi_inv, hj_inv;
vector hi2_inv, hj2_inv;
vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
vector piPOrho, pjPOrho, pirho, pjrho;
vector mi, mj;
vector f;
vector dx[3];
vector vi[3], vj[3];
vector pia[3], pja[3];
vector pih_dt, pjh_dt;
vector ci, cj, v_sig;
vector omega_ij, mu_ij, fac_mu, balsara;
vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
int j, k;
fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
/* Load stuff. */
#if VEC_SIZE == 8
mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
piPOrho.v =
vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, pi[2]->force.P_over_rho2,
pi[3]->force.P_over_rho2, pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
pjPOrho.v =
vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, pj[2]->force.P_over_rho2,
pj[3]->force.P_over_rho2, pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
pi[5]->rho, pi[6]->rho, pi[7]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
pj[5]->rho, pj[6]->rho, pj[7]->rho);
ci.v =
vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed,
pi[4]->force.soundspeed, pi[5]->force.soundspeed, pi[6]->force.soundspeed, pi[7]->force.soundspeed);
cj.v =
vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed,
pj[4]->force.soundspeed, pj[5]->force.soundspeed, pj[6]->force.soundspeed, pj[7]->force.soundspeed);
for (k = 0; k < 3; k++) {
vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
}
for (k = 0; k < 3; k++)
dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
Dx[15 + k], Dx[18 + k], Dx[21 + k]);
balsara.v =
vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara,
pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara,
pi[6]->force.balsara, pi[7]->force.balsara) +
vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara,
pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara,
pj[6]->force.balsara, pj[7]->force.balsara);
#elif VEC_SIZE == 4
mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
piPOrho.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
pjPOrho.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
ci.v =
vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed);
cj.v =
vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed);
for (k = 0; k < 3; k++) {
vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
}
for (k = 0; k < 3; k++)
dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara,
pi[2]->force.balsara, pi[3]->force.balsara) +
vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
pj[2]->force.balsara, pj[3]->force.balsara);
#else
error("Unknown vector size.")
#endif
/* Get the radius and inverse radius. */
r2.v = vec_load(R2);
ri.v = vec_rsqrt(r2.v);
ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
r.v = r2.v * ri.v;
/* Get the kernel for hi. */
hi.v = vec_load(Hi);
hi_inv.v = vec_rcp(hi.v);
hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f));
hi2_inv.v = hi_inv.v * hi_inv.v;
xi.v = r.v * hi_inv.v;
kernel_deval_vec(&xi, &wi, &wi_dx);
wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v;
/* Get the kernel for hj. */
hj.v = vec_load(Hj);
hj_inv.v = vec_rcp(hj.v);
hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f));
hj2_inv.v = hj_inv.v * hj_inv.v;
xj.v = r.v * hj_inv.v;
kernel_deval_vec(&xj, &wj, &wj_dx);
wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v;
/* Compute dv dot r. */
dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) +
((vi[2].v - vj[2].v) * dx[2].v);
//dvdr.v = dvdr.v * ri.v;
/* Compute the relative velocity. (This is 0 if the particles move away from
* each other and negative otherwise) */
omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f));
mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */
/* Compute signal velocity */
v_sig.v = ci.v + cj.v - vec_set1(3.0f) * mu_ij.v;
/* Now construct the full viscosity term */
rho_ij.v = vec_set1(0.5f) * (pirho.v + pjrho.v);
visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v * mu_ij.v *
balsara.v / rho_ij.v;
/* Now, convolve with the kernel */
visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
sph_term.v = (piPOrho.v * wi_dr.v + pjPOrho.v * wj_dr.v) * ri.v;
/* Eventually get the acceleration */
acc.v = visc_term.v + sph_term.v;
/* Use the force, Luke! */
for (k = 0; k < 3; k++) {
f.v = dx[k].v * acc.v;
pia[k].v = mj.v * f.v;
pja[k].v = mi.v * f.v;
}
/* Get the time derivative for h. */
pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v;
pjh_dt.v = mi.v * dvdr.v * ri.v / pirho.v * wj_dr.v;
/* Change in entropy */
entropy_dt.v = vec_set1(0.5f) * visc_term.v * dvdr.v;
/* Store the forces back on the particles. */
for (k = 0; k < VEC_SIZE; k++) {
for (j = 0; j < 3; j++) {
pi[k]->a_hydro[j] -= pia[j].f[k];
pj[k]->a_hydro[j] += pja[j].f[k];
}
pi[k]->force.h_dt -= pih_dt.f[k];
pj[k]->force.h_dt -= pjh_dt.f[k];
pi[k]->force.v_sig = fmaxf(pi[k]->force.v_sig, v_sig.f[k]);
pj[k]->force.v_sig = fmaxf(pj[k]->force.v_sig, v_sig.f[k]);
pi[k]->entropy_dt += entropy_dt.f[k] * mj.f[k];
pj[k]->entropy_dt -= entropy_dt.f[k] * mi.f[k];
}
#else
error("The Gadget2 serial version of runner_iact_nonsym_force was called when the vectorised version should have been used.")
#endif
}
/**
......@@ -575,9 +743,166 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
struct part **pj) {
error(
"A vectorised version of the Gadget2 non-symmetric force interaction "
"function does not exist yet!");
#ifdef WITH_VECTORIZATION
vector r, r2, ri;
vector xi, xj;
vector hi, hj, hi_inv, hj_inv;
vector hi2_inv, hj2_inv;
vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
vector piPOrho, pjPOrho, pirho, pjrho;
vector mj;
vector f;
vector dx[3];
vector vi[3], vj[3];
vector pia[3];
vector pih_dt;
vector ci, cj, v_sig;
vector omega_ij, mu_ij, fac_mu, balsara;
vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
int j, k;
fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
/* Load stuff. */
#if VEC_SIZE == 8
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
piPOrho.v =
vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, pi[2]->force.P_over_rho2,
pi[3]->force.P_over_rho2, pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
pjPOrho.v =
vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, pj[2]->force.P_over_rho2,
pj[3]->force.P_over_rho2, pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
pi[5]->rho, pi[6]->rho, pi[7]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
pj[5]->rho, pj[6]->rho, pj[7]->rho);
ci.v =
vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed,
pi[4]->force.soundspeed, pi[5]->force.soundspeed, pi[6]->force.soundspeed, pi[7]->force.soundspeed);
cj.v =
vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed,
pj[4]->force.soundspeed, pj[5]->force.soundspeed, pj[6]->force.soundspeed, pj[7]->force.soundspeed);
for (k = 0; k < 3; k++) {
vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
}
for (k = 0; k < 3; k++)
dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
Dx[15 + k], Dx[18 + k], Dx[21 + k]);
balsara.v =
vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara,
pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara,
pi[6]->force.balsara, pi[7]->force.balsara) +
vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara,
pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara,
pj[6]->force.balsara, pj[7]->force.balsara);
#elif VEC_SIZE == 4
mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
piPOrho.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
pjPOrho.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
ci.v =
vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed);
cj.v =
vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed);
for (k = 0; k < 3; k++) {
vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
}