diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index daa6c37681131e197b7e1834d0792238a600eaaa..a740a77625e801a8025c7a243b891bbbedac35d1 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -109,9 +109,121 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( __attribute__((always_inline)) INLINE static void runner_iact_vec_density( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { - error( - "A vectorised version of the Gadget2 density interaction function does " - "not exist yet!"); + +#ifdef WITH_VECTORIZATION + + vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; + vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh; + vector mi, mj; + vector dx[3], dv[3]; + vector vi[3], vj[3]; + vector dvdr, div_vi, div_vj; + vector curlvr[3], curl_vi[3], curl_vj[3]; + int k, j; + +#if VEC_SIZE == 8 + /* Get the masses. */ + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass, + pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, + pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); + /* Get each velocity component. */ + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], + pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], + pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); + } + /* Get each component of particle separation. (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/ + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], + Dx[15 + k], Dx[18 + k], Dx[21 + k]); +#elif VEC_SIZE == 4 + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); +#endif + + /* Get the radius and inverse radius. */ + r2.v = vec_load(R2); + ri.v = vec_rsqrt(r2.v); + /*vec_rsqrt does not have the level of accuracy we need, so an extra term is added below.*/ + ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + r.v = r2.v * ri.v; + + hi.v = vec_load(Hi); + hi_inv.v = vec_rcp(hi.v); + hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f)); + xi.v = r.v * hi_inv.v; + + hj.v = vec_load(Hj); + hj_inv.v = vec_rcp(hj.v); + hj_inv.v = hj_inv.v - hj_inv.v * (hj_inv.v * hj.v - vec_set1(1.0f)); + xj.v = r.v * hj_inv.v; + + /* Compute the kernel function. */ + kernel_deval_vec(&xi, &wi, &wi_dx); + kernel_deval_vec(&xj, &wj, &wj_dx); + + /* Compute dv. */ + dv[0].v = vi[0].v - vj[0].v; + dv[1].v = vi[1].v - vj[1].v; + dv[2].v = vi[2].v - vj[2].v; + + /* Compute dv dot r */ + dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v); + dvdr.v = dvdr.v * ri.v; + + /* Compute dv cross r */ + curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; + curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; + curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; + for (k = 0; k < 3; k++) curlvr[k].v *= ri.v; + + /* Compute density of pi. */ + rhoi.v = mj.v * wi.v; + rhoi_dh.v = mj.v * (vec_set1(3.0f) * wi.v + xi.v * wi_dx.v); + wcounti.v = wi.v; + wcounti_dh.v = xi.v * wi_dx.v; + div_vi.v = mj.v * dvdr.v * wi_dx.v; + for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; + + /* Compute density of pj. */ + rhoj.v = mi.v * wj.v; + rhoj_dh.v = mi.v * (vec_set1(3.0f) * wj.v + xj.v * wj_dx.v); + wcountj.v = wj.v; + wcountj_dh.v = xj.v * wj_dx.v; + div_vj.v = mi.v * dvdr.v * wj_dx.v; + for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v; + + /* Update particles. */ + for (k = 0; k < VEC_SIZE; k++) { + pi[k]->rho += rhoi.f[k]; + pi[k]->rho_dh -= rhoi_dh.f[k]; + pi[k]->density.wcount += wcounti.f[k]; + pi[k]->density.wcount_dh -= wcounti_dh.f[k]; + pi[k]->div_v += div_vi.f[k]; + for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k]; + pj[k]->rho += rhoj.f[k]; + pj[k]->rho_dh -= rhoj_dh.f[k]; + pj[k]->density.wcount += wcountj.f[k]; + pj[k]->density.wcount_dh -= wcountj_dh.f[k]; + pj[k]->div_v += div_vj.f[k]; + for (j = 0; j < 3; j++) pj[k]->density.rot_v[j] += curl_vj[j].f[k]; + } + +#else + + for (int k = 0; k < VEC_SIZE; k++) + error("The Gadget2 serial version of runner_iact_density was called when the vectorised version should have been used.") + +#endif + } /** @@ -168,9 +280,99 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { - error( - "A vectorised version of the Gadget2 non-symmetric density interaction " - "function does not exist yet!"); + +#ifdef WITH_VECTORIZATION + + vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx; + vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi; + vector mj; + vector dx[3], dv[3]; + vector vi[3], vj[3]; + vector dvdr; + vector curlvr[3], curl_vi[3]; + int k, j; + +#if VEC_SIZE == 8 + /* Get the masses. */ + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, + pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); + /* Get each velocity component. */ + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], + pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], + pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); + } + /* Get each component of particle separation. (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/ + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], + Dx[15 + k], Dx[18 + k], Dx[21 + k]); +#elif VEC_SIZE == 4 + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); +#endif + + /* Get the radius and inverse radius. */ + r2.v = vec_load(R2); + ri.v = vec_rsqrt(r2.v); + /*vec_rsqrt does not have the level of accuracy we need, so an extra term is added below.*/ + ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + r.v = r2.v * ri.v; + + hi.v = vec_load(Hi); + hi_inv.v = vec_rcp(hi.v); + hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f)); + xi.v = r.v * hi_inv.v; + + kernel_deval_vec(&xi, &wi, &wi_dx); + + /* Compute dv. */ + dv[0].v = vi[0].v - vj[0].v; + dv[1].v = vi[1].v - vj[1].v; + dv[2].v = vi[2].v - vj[2].v; + + /* Compute dv dot r */ + dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v); + dvdr.v = dvdr.v * ri.v; + + /* Compute dv cross r */ + curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; + curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; + curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; + for (k = 0; k < 3; k++) curlvr[k].v *= ri.v; + + /* Compute density of pi. */ + rhoi.v = mj.v * wi.v; + rhoi_dh.v = mj.v * (vec_set1(3.0f) * wi.v + xi.v * wi_dx.v); + wcounti.v = wi.v; + wcounti_dh.v = xi.v * wi_dx.v; + div_vi.v = mj.v * dvdr.v * wi_dx.v; + for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; + + /* Update particles. */ + for (k = 0; k < VEC_SIZE; k++) { + pi[k]->rho += rhoi.f[k]; + pi[k]->rho_dh -= rhoi_dh.f[k]; + pi[k]->density.wcount += wcounti.f[k]; + pi[k]->density.wcount_dh -= wcounti_dh.f[k]; + pi[k]->div_v += div_vi.f[k]; + for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k]; + } + +#else + + for (int k = 0; k < VEC_SIZE; k++) + error("The Gadget2 serial version of runner_iact_nonsym_density was called when the vectorised version should have been used.") + +#endif + + + } /**