diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 1c041bf088ed171c427bf057cadfb42519fc2f67..5c977fc80198d197b742789c184a52de270b61bb 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -1206,269 +1206,8 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( #ifdef WITH_VECTORIZATION __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force( - vector *r2, vector *dx, vector *dy, vector *dz, vector hi_inv, vector hj_inv, struct part **pi, - struct part **pj, vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, vector mask) { - -#ifdef WITH_VECTORIZATION - - vector r, ri; - vector xi, xj; - vector hid_inv, hjd_inv; - vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; - vector piPOrho2, pjPOrho2, pirho, pjrho; - vector mj; - vector grad_hi, grad_hj; - vector vi[3], vj[3]; - vector pia[3]; - vector pih_dt; - vector ci, cj, v_sig; - vector omega_ij, mu_ij, fac_mu, balsara; - vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; - int k; - - fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ - -/* Load stuff. */ - mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, - pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); - piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, - pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2, - pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2, - pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2); - pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, - pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2, - pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2, - pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2); - grad_hi.v = - vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f, - pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f); - grad_hj.v = - vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f, - pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f); - pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho, - pi[5]->rho, pi[6]->rho, pi[7]->rho); - pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho, - pj[5]->rho, pj[6]->rho, pj[7]->rho); - ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, - pi[2]->force.soundspeed, pi[3]->force.soundspeed, - pi[4]->force.soundspeed, pi[5]->force.soundspeed, - pi[6]->force.soundspeed, pi[7]->force.soundspeed); - cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, - pj[2]->force.soundspeed, pj[3]->force.soundspeed, - pj[4]->force.soundspeed, pj[5]->force.soundspeed, - pj[6]->force.soundspeed, pj[7]->force.soundspeed); - for (k = 0; k < 3; k++) { - vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], - pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); - vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], - pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); - } - balsara.v = - vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara, - pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara, - pi[6]->force.balsara, pi[7]->force.balsara) + - vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara, - pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara, - pj[6]->force.balsara, pj[7]->force.balsara); - - /* Get the radius and inverse radius. */ - ri = vec_reciprocal_sqrt(*r2); - r.v = r2->v * ri.v; - - /* Get the kernel for hi. */ - hid_inv = pow_dimension_plus_one_vec(hi_inv); - xi.v = r.v * hi_inv.v; - kernel_deval_vec(&xi, &wi, &wi_dx); - wi_dr.v = hid_inv.v * wi_dx.v; - - /* Get the kernel for hj. */ - hjd_inv = pow_dimension_plus_one_vec(hj_inv); - xj.v = r.v * hj_inv.v; - kernel_deval_vec(&xj, &wj, &wj_dx); - wj_dr.v = hjd_inv.v * wj_dx.v; - - /* Compute dv dot r. */ - //dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) + - // ((vi[2].v - vj[2].v) * dx[2].v); - - dvdr.v = ((vi[0].v - vj[0].v) * dx->v) + ((vi[1].v - vj[1].v) * dy->v) + - ((vi[2].v - vj[2].v) * dz->v); - - // dvdr.v = dvdr.v * ri.v; - - /* Compute the relative velocity. (This is 0 if the particles move away from - * each other and negative otherwise) */ - omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f)); - mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */ - - /* Compute signal velocity */ - v_sig.v = ci.v + cj.v - vec_set1(3.0f) * mu_ij.v; - - /* Now construct the full viscosity term */ - rho_ij.v = vec_set1(0.5f) * (pirho.v + pjrho.v); - visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v * - mu_ij.v * balsara.v / rho_ij.v; - - /* Now, convolve with the kernel */ - visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v; - sph_term.v = - (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * - ri.v; - - /* Eventually get the acceleration */ - acc.v = visc_term.v + sph_term.v; - - /* Use the force, Luke! */ - //for (k = 0; k < 3; k++) { - // f.v = dx[k].v * acc.v; - // pia[k].v = mj.v * f.v; - //} - - pia[0].v = mj.v * dx->v * acc.v; - pia[1].v = mj.v * dy->v * acc.v; - pia[2].v = mj.v * dz->v * acc.v; - - /* Get the time derivative for h. */ - pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v; - - /* Change in entropy */ - entropy_dt.v = mj.v * visc_term.v * dvdr.v; - - /* Store the forces back on the particles. */ - //for (k = 0; k < VEC_SIZE; k++) { - // for (j = 0; j < 3; j++) pi[k]->a_hydro[j] -= pia[j].f[k]; - // pi[k]->force.h_dt -= pih_dt.f[k]; - // pi[k]->force.v_sig = max(pi[k]->force.v_sig, v_sig.f[k]); - // pi[k]->entropy_dt += entropy_dt.f[k]; - //} - - a_hydro_xSum->v -= vec_and(pia[0].v, mask.v); - a_hydro_ySum->v -= vec_and(pia[1].v, mask.v); - a_hydro_zSum->v -= vec_and(pia[2].v, mask.v); - h_dtSum->v -= vec_and(pih_dt.v, mask.v); - v_sigSum->v = vec_fmax(v_sigSum->v, vec_and(v_sig.v, mask.v)); - entropy_dtSum->v += vec_and(entropy_dt.v,mask.v); - -#else - - error( - "The Gadget2 serial version of runner_iact_nonsym_force was called when " - "the vectorised version should have been used."); - -#endif -} - -__attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force_2( - vector *r2, vector *dx, vector *dy, vector *dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, vector *vjx, vector *vjy, vector *vjz, vector *pjrho, vector *grad_hj, vector *pjPOrho2, vector *balsara_j, vector *cj, vector *mj, vector hi_inv, vector hj_inv, - vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, vector mask) { - -#ifdef WITH_VECTORIZATION - - vector r, ri; - vector xi, xj; - vector hid_inv, hjd_inv; - vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; - vector pia[3]; - vector pih_dt; - vector v_sig; - vector omega_ij, mu_ij, fac_mu, balsara; - vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; - - fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ - -/* Load stuff. */ - balsara.v = balsara_i->v + balsara_j->v; - - /* Get the radius and inverse radius. */ - ri = vec_reciprocal_sqrt(*r2); - r.v = r2->v * ri.v; - - /* Get the kernel for hi. */ - hid_inv = pow_dimension_plus_one_vec(hi_inv); - xi.v = r.v * hi_inv.v; - kernel_deval_1_vec(&xi, &wi, &wi_dx); - wi_dr.v = hid_inv.v * wi_dx.v; - - /* Get the kernel for hj. */ - hjd_inv = pow_dimension_plus_one_vec(hj_inv); - xj.v = r.v * hj_inv.v; - kernel_deval_1_vec(&xj, &wj, &wj_dx); - wj_dr.v = hjd_inv.v * wj_dx.v; - - /* Compute dv dot r. */ - //dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) + - // ((vi[2].v - vj[2].v) * dx[2].v); - - dvdr.v = ((vix->v - vjx->v) * dx->v) + ((viy->v - vjy->v) * dy->v) + - ((viz->v - vjz->v) * dz->v); - - // dvdr.v = dvdr.v * ri.v; - - /* Compute the relative velocity. (This is 0 if the particles move away from - * each other and negative otherwise) */ - omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f)); - mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */ - - /* Compute signal velocity */ - v_sig.v = ci->v + cj->v - vec_set1(3.0f) * mu_ij.v; - - /* Now construct the full viscosity term */ - rho_ij.v = vec_set1(0.5f) * (pirho->v + pjrho->v); - visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v * - mu_ij.v * balsara.v / rho_ij.v; - - /* Now, convolve with the kernel */ - visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v; - sph_term.v = - (grad_hi->v * piPOrho2->v * wi_dr.v + grad_hj->v * pjPOrho2->v * wj_dr.v) * - ri.v; - - /* Eventually get the acceleration */ - acc.v = visc_term.v + sph_term.v; - - /* Use the force, Luke! */ - //for (k = 0; k < 3; k++) { - // f.v = dx[k].v * acc.v; - // pia[k].v = mj.v * f.v; - //} - - pia[0].v = mj->v * dx->v * acc.v; - pia[1].v = mj->v * dy->v * acc.v; - pia[2].v = mj->v * dz->v * acc.v; - - /* Get the time derivative for h. */ - pih_dt.v = mj->v * dvdr.v * ri.v / pjrho->v * wi_dr.v; - - /* Change in entropy */ - entropy_dt.v = mj->v * visc_term.v * dvdr.v; - - /* Store the forces back on the particles. */ - //for (k = 0; k < VEC_SIZE; k++) { - // for (j = 0; j < 3; j++) pi[k]->a_hydro[j] -= pia[j].f[k]; - // pi[k]->force.h_dt -= pih_dt.f[k]; - // pi[k]->force.v_sig = max(pi[k]->force.v_sig, v_sig.f[k]); - // pi[k]->entropy_dt += entropy_dt.f[k]; - //} - - a_hydro_xSum->v -= vec_and(pia[0].v, mask.v); - a_hydro_ySum->v -= vec_and(pia[1].v, mask.v); - a_hydro_zSum->v -= vec_and(pia[2].v, mask.v); - h_dtSum->v -= vec_and(pih_dt.v, mask.v); - v_sigSum->v = vec_fmax(v_sigSum->v, vec_and(v_sig.v, mask.v)); - entropy_dtSum->v += vec_and(entropy_dt.v,mask.v); - -#else - - error( - "The Gadget2 serial version of runner_iact_nonsym_force was called when " - "the vectorised version should have been used."); - -#endif -} - -__attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force( float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector *hi_inv, float *Hj_inv, - vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, vector mask, vector mask_2) { + vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, vector mask) { #ifdef WITH_VECTORIZATION @@ -1485,19 +1224,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force vector omega_ij, mu_ij, fac_mu, balsara; vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; - vector r_2, r2_2, ri_2; - vector dx_2, dy_2, dz_2; - vector vjx_2, vjy_2, vjz_2; - vector pjrho_2, grad_hj_2, pjPOrho2_2, balsara_j_2, cj_2, mj_2, hj_inv_2; - vector xi_2, xj_2; - vector hjd_inv_2; - vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2; - vector piax_2, piay_2, piaz_2; - vector pih_dt_2; - vector v_sig_2; - vector omega_ij_2, mu_ij_2, balsara_2; - vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2; - /* Fill vectors. */ r2.v = vec_load(R2); dx.v = vec_load(Dx); @@ -1518,123 +1244,72 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ - r2_2.v = vec_load(&R2[VEC_SIZE]); - dx_2.v = vec_load(&Dx[VEC_SIZE]); - dy_2.v = vec_load(&Dy[VEC_SIZE]); - dz_2.v = vec_load(&Dz[VEC_SIZE]); - - vjx_2.v = vec_load(&Vjx[VEC_SIZE]); - vjy_2.v = vec_load(&Vjy[VEC_SIZE]); - vjz_2.v = vec_load(&Vjz[VEC_SIZE]); - mj_2.v = vec_load(&Mj[VEC_SIZE]); - - pjrho_2.v = vec_load(&Pjrho[VEC_SIZE]); - grad_hj_2.v = vec_load(&Grad_hj[VEC_SIZE]); - pjPOrho2_2.v = vec_load(&PjPOrho2[VEC_SIZE]); - balsara_j_2.v = vec_load(&Balsara_j[VEC_SIZE]); - cj_2.v = vec_load(&Cj[VEC_SIZE]); - hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]); - /* Load stuff. */ balsara.v = balsara_i->v + balsara_j.v; - balsara_2.v = balsara_i->v + balsara_j_2.v; /* Get the radius and inverse radius. */ ri = vec_reciprocal_sqrt(r2); - ri_2 = vec_reciprocal_sqrt(r2_2); r.v = r2.v * ri.v; - r_2.v = r2_2.v * ri_2.v; /* Get the kernel for hi. */ hid_inv = pow_dimension_plus_one_vec(*hi_inv); xi.v = r.v * hi_inv->v; - xi_2.v = r_2.v * hi_inv->v; kernel_eval_dWdx_force_vec(&xi, &wi_dx); - kernel_eval_dWdx_force_vec(&xi_2, &wi_dx_2); wi_dr.v = hid_inv.v * wi_dx.v; - wi_dr_2.v = hid_inv.v * wi_dx_2.v; /* Get the kernel for hj. */ hjd_inv = pow_dimension_plus_one_vec(hj_inv); - hjd_inv_2 = pow_dimension_plus_one_vec(hj_inv_2); xj.v = r.v * hj_inv.v; - xj_2.v = r_2.v * hj_inv_2.v; /* Calculate the kernel for two particles. */ - kernel_eval_dWdx_force_vec(&xj, &wj_dx); - kernel_eval_dWdx_force_vec(&xj_2, &wj_dx_2); + kernel_deval_1_vec(&xj, &wj_dx); wj_dr.v = hjd_inv.v * wj_dx.v; - wj_dr_2.v = hjd_inv_2.v * wj_dx_2.v; /* Compute dv dot r. */ dvdr.v = ((vix->v - vjx.v) * dx.v) + ((viy->v - vjy.v) * dy.v) + ((viz->v - vjz.v) * dz.v); - dvdr_2.v = ((vix->v - vjx_2.v) * dx_2.v) + ((viy->v - vjy_2.v) * dy_2.v) + - ((viz->v - vjz_2.v) * dz_2.v); /* Compute the relative velocity. (This is 0 if the particles move away from * each other and negative otherwise) */ omega_ij.v = vec_fmin(dvdr.v, vec_setzero()); - omega_ij_2.v = vec_fmin(dvdr_2.v, vec_setzero()); mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */ - mu_ij_2.v = fac_mu.v * ri_2.v * omega_ij_2.v; /* This is 0 or negative */ /* Compute signal velocity */ v_sig.v = ci->v + cj.v - vec_set1(3.0f) * mu_ij.v; - v_sig_2.v = ci->v + cj_2.v - vec_set1(3.0f) * mu_ij_2.v; /* Now construct the full viscosity term */ rho_ij.v = vec_set1(0.5f) * (pirho->v + pjrho.v); - rho_ij_2.v = vec_set1(0.5f) * (pirho->v + pjrho_2.v); visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v * mu_ij.v * balsara.v / rho_ij.v; - visc_2.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig_2.v * - mu_ij_2.v * balsara_2.v / rho_ij_2.v; /* Now, convolve with the kernel */ visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v; - visc_term_2.v = vec_set1(0.5f) * visc_2.v * (wi_dr_2.v + wj_dr_2.v) * ri_2.v; sph_term.v = (grad_hi->v * piPOrho2->v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * ri.v; - sph_term_2.v = - (grad_hi->v * piPOrho2->v * wi_dr_2.v + grad_hj_2.v * pjPOrho2_2.v * wj_dr_2.v) * - ri_2.v; - + /* Eventually get the acceleration */ acc.v = visc_term.v + sph_term.v; - acc_2.v = visc_term_2.v + sph_term_2.v; /* Use the force, Luke! */ piax.v = mj.v * dx.v * acc.v; - piax_2.v = mj_2.v * dx_2.v * acc_2.v; piay.v = mj.v * dy.v * acc.v; - piay_2.v = mj_2.v * dy_2.v * acc_2.v; piaz.v = mj.v * dz.v * acc.v; - piaz_2.v = mj_2.v * dz_2.v * acc_2.v; /* Get the time derivative for h. */ pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v; - pih_dt_2.v = mj_2.v * dvdr_2.v * ri_2.v / pjrho_2.v * wi_dr_2.v; /* Change in entropy */ entropy_dt.v = mj.v * visc_term.v * dvdr.v; - entropy_dt_2.v = mj_2.v * visc_term_2.v * dvdr_2.v; /* Store the forces back on the particles. */ a_hydro_xSum->v -= vec_and(piax.v, mask.v); - a_hydro_xSum->v -= vec_and(piax_2.v, mask_2.v); a_hydro_ySum->v -= vec_and(piay.v, mask.v); - a_hydro_ySum->v -= vec_and(piay_2.v, mask_2.v); a_hydro_zSum->v -= vec_and(piaz.v, mask.v); - a_hydro_zSum->v -= vec_and(piaz_2.v, mask_2.v); h_dtSum->v -= vec_and(pih_dt.v, mask.v); - h_dtSum->v -= vec_and(pih_dt_2.v, mask_2.v); v_sigSum->v = vec_fmax(v_sigSum->v, vec_and(v_sig.v, mask.v)); - v_sigSum->v = vec_fmax(v_sigSum->v, vec_and(v_sig_2.v, mask_2.v)); entropy_dtSum->v += vec_and(entropy_dt.v,mask.v); - entropy_dtSum->v += vec_and(entropy_dt_2.v,mask_2.v); #else @@ -1645,9 +1320,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force #endif } -__attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force_3( +__attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force_nomask( float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector *hi_inv, float *Hj_inv, - vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, vector mask) { + vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum) { #ifdef WITH_VECTORIZATION @@ -1657,7 +1332,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv; vector xi, xj; vector hid_inv, hjd_inv; - vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; + vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector piax, piay, piaz; vector pih_dt; vector v_sig; @@ -1694,7 +1369,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force /* Get the kernel for hi. */ hid_inv = pow_dimension_plus_one_vec(*hi_inv); xi.v = r.v * hi_inv->v; - kernel_deval_1_vec(&xi, &wi, &wi_dx); + kernel_eval_dWdx_force_vec(&xi, &wi_dx); wi_dr.v = hid_inv.v * wi_dx.v; /* Get the kernel for hj. */ @@ -1702,7 +1377,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force xj.v = r.v * hj_inv.v; /* Calculate the kernel for two particles. */ - kernel_deval_1_vec(&xj, &wj, &wj_dx); + kernel_deval_1_vec(&xj, &wj_dx); wj_dr.v = hjd_inv.v * wj_dx.v; @@ -1744,12 +1419,12 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force entropy_dt.v = mj.v * visc_term.v * dvdr.v; /* Store the forces back on the particles. */ - a_hydro_xSum->v -= vec_and(piax.v, mask.v); - a_hydro_ySum->v -= vec_and(piay.v, mask.v); - a_hydro_zSum->v -= vec_and(piaz.v, mask.v); - h_dtSum->v -= vec_and(pih_dt.v, mask.v); - v_sigSum->v = vec_fmax(v_sigSum->v, vec_and(v_sig.v, mask.v)); - entropy_dtSum->v += vec_and(entropy_dt.v,mask.v); + a_hydro_xSum->v -= piax.v; + a_hydro_ySum->v -= piay.v; + a_hydro_zSum->v -= piaz.v; + h_dtSum->v -= pih_dt.v; + v_sigSum->v = vec_fmax(v_sigSum->v, v_sig.v); + entropy_dtSum->v += entropy_dt.v; #else @@ -1760,9 +1435,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force #endif } -__attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force_nomask( +__attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force( float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector *hi_inv, float *Hj_inv, - vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum) { + vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, vector mask, vector mask_2) { #ifdef WITH_VECTORIZATION @@ -1772,13 +1447,26 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv; vector xi, xj; vector hid_inv, hjd_inv; - vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; + vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector piax, piay, piaz; vector pih_dt; vector v_sig; vector omega_ij, mu_ij, fac_mu, balsara; vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; + vector r_2, r2_2, ri_2; + vector dx_2, dy_2, dz_2; + vector vjx_2, vjy_2, vjz_2; + vector pjrho_2, grad_hj_2, pjPOrho2_2, balsara_j_2, cj_2, mj_2, hj_inv_2; + vector xi_2, xj_2; + vector hjd_inv_2; + vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2; + vector piax_2, piay_2, piaz_2; + vector pih_dt_2; + vector v_sig_2; + vector omega_ij_2, mu_ij_2, balsara_2; + vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2; + /* Fill vectors. */ r2.v = vec_load(R2); dx.v = vec_load(Dx); @@ -1799,72 +1487,123 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ + r2_2.v = vec_load(&R2[VEC_SIZE]); + dx_2.v = vec_load(&Dx[VEC_SIZE]); + dy_2.v = vec_load(&Dy[VEC_SIZE]); + dz_2.v = vec_load(&Dz[VEC_SIZE]); + + vjx_2.v = vec_load(&Vjx[VEC_SIZE]); + vjy_2.v = vec_load(&Vjy[VEC_SIZE]); + vjz_2.v = vec_load(&Vjz[VEC_SIZE]); + mj_2.v = vec_load(&Mj[VEC_SIZE]); + + pjrho_2.v = vec_load(&Pjrho[VEC_SIZE]); + grad_hj_2.v = vec_load(&Grad_hj[VEC_SIZE]); + pjPOrho2_2.v = vec_load(&PjPOrho2[VEC_SIZE]); + balsara_j_2.v = vec_load(&Balsara_j[VEC_SIZE]); + cj_2.v = vec_load(&Cj[VEC_SIZE]); + hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]); + /* Load stuff. */ balsara.v = balsara_i->v + balsara_j.v; + balsara_2.v = balsara_i->v + balsara_j_2.v; /* Get the radius and inverse radius. */ ri = vec_reciprocal_sqrt(r2); + ri_2 = vec_reciprocal_sqrt(r2_2); r.v = r2.v * ri.v; + r_2.v = r2_2.v * ri_2.v; /* Get the kernel for hi. */ hid_inv = pow_dimension_plus_one_vec(*hi_inv); xi.v = r.v * hi_inv->v; - kernel_deval_1_vec(&xi, &wi, &wi_dx); + xi_2.v = r_2.v * hi_inv->v; + kernel_eval_dWdx_force_vec(&xi, &wi_dx); + kernel_eval_dWdx_force_vec(&xi_2, &wi_dx_2); wi_dr.v = hid_inv.v * wi_dx.v; + wi_dr_2.v = hid_inv.v * wi_dx_2.v; /* Get the kernel for hj. */ hjd_inv = pow_dimension_plus_one_vec(hj_inv); + hjd_inv_2 = pow_dimension_plus_one_vec(hj_inv_2); xj.v = r.v * hj_inv.v; + xj_2.v = r_2.v * hj_inv_2.v; /* Calculate the kernel for two particles. */ - kernel_deval_1_vec(&xj, &wj, &wj_dx); + kernel_eval_dWdx_force_vec(&xj, &wj_dx); + kernel_eval_dWdx_force_vec(&xj_2, &wj_dx_2); wj_dr.v = hjd_inv.v * wj_dx.v; + wj_dr_2.v = hjd_inv_2.v * wj_dx_2.v; /* Compute dv dot r. */ dvdr.v = ((vix->v - vjx.v) * dx.v) + ((viy->v - vjy.v) * dy.v) + ((viz->v - vjz.v) * dz.v); + dvdr_2.v = ((vix->v - vjx_2.v) * dx_2.v) + ((viy->v - vjy_2.v) * dy_2.v) + + ((viz->v - vjz_2.v) * dz_2.v); /* Compute the relative velocity. (This is 0 if the particles move away from * each other and negative otherwise) */ omega_ij.v = vec_fmin(dvdr.v, vec_setzero()); + omega_ij_2.v = vec_fmin(dvdr_2.v, vec_setzero()); mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */ + mu_ij_2.v = fac_mu.v * ri_2.v * omega_ij_2.v; /* This is 0 or negative */ /* Compute signal velocity */ v_sig.v = ci->v + cj.v - vec_set1(3.0f) * mu_ij.v; + v_sig_2.v = ci->v + cj_2.v - vec_set1(3.0f) * mu_ij_2.v; /* Now construct the full viscosity term */ rho_ij.v = vec_set1(0.5f) * (pirho->v + pjrho.v); + rho_ij_2.v = vec_set1(0.5f) * (pirho->v + pjrho_2.v); visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v * mu_ij.v * balsara.v / rho_ij.v; + visc_2.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig_2.v * + mu_ij_2.v * balsara_2.v / rho_ij_2.v; /* Now, convolve with the kernel */ visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v; + visc_term_2.v = vec_set1(0.5f) * visc_2.v * (wi_dr_2.v + wj_dr_2.v) * ri_2.v; sph_term.v = (grad_hi->v * piPOrho2->v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * ri.v; - + sph_term_2.v = + (grad_hi->v * piPOrho2->v * wi_dr_2.v + grad_hj_2.v * pjPOrho2_2.v * wj_dr_2.v) * + ri_2.v; + /* Eventually get the acceleration */ acc.v = visc_term.v + sph_term.v; + acc_2.v = visc_term_2.v + sph_term_2.v; /* Use the force, Luke! */ piax.v = mj.v * dx.v * acc.v; + piax_2.v = mj_2.v * dx_2.v * acc_2.v; piay.v = mj.v * dy.v * acc.v; + piay_2.v = mj_2.v * dy_2.v * acc_2.v; piaz.v = mj.v * dz.v * acc.v; + piaz_2.v = mj_2.v * dz_2.v * acc_2.v; /* Get the time derivative for h. */ pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v; + pih_dt_2.v = mj_2.v * dvdr_2.v * ri_2.v / pjrho_2.v * wi_dr_2.v; /* Change in entropy */ entropy_dt.v = mj.v * visc_term.v * dvdr.v; + entropy_dt_2.v = mj_2.v * visc_term_2.v * dvdr_2.v; /* Store the forces back on the particles. */ - a_hydro_xSum->v -= piax.v; - a_hydro_ySum->v -= piay.v; - a_hydro_zSum->v -= piaz.v; - h_dtSum->v -= pih_dt.v; - v_sigSum->v = vec_fmax(v_sigSum->v, v_sig.v); - entropy_dtSum->v += entropy_dt.v; + a_hydro_xSum->v -= vec_and(piax.v, mask.v); + a_hydro_xSum->v -= vec_and(piax_2.v, mask_2.v); + a_hydro_ySum->v -= vec_and(piay.v, mask.v); + a_hydro_ySum->v -= vec_and(piay_2.v, mask_2.v); + a_hydro_zSum->v -= vec_and(piaz.v, mask.v); + a_hydro_zSum->v -= vec_and(piaz_2.v, mask_2.v); + h_dtSum->v -= vec_and(pih_dt.v, mask.v); + h_dtSum->v -= vec_and(pih_dt_2.v, mask_2.v); + v_sigSum->v = vec_fmax(v_sigSum->v, vec_and(v_sig.v, mask.v)); + v_sigSum->v = vec_fmax(v_sigSum->v, vec_and(v_sig_2.v, mask_2.v)); + entropy_dtSum->v += vec_and(entropy_dt.v,mask.v); + entropy_dtSum->v += vec_and(entropy_dt_2.v,mask_2.v); #else