/******************************************************************************* * This file is part of SWIFT. * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) * Matthieu Schaller (matthieu.schaller@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . * ******************************************************************************/ #ifndef SWIFT_GADGET2_HYDRO_IACT_H #define SWIFT_GADGET2_HYDRO_IACT_H /** * @file Gadget2/hydro_iact.h * @brief SPH interaction functions following the Gadget-2 version of SPH. * * The interactions computed here are the ones presented in the Gadget-2 paper * Springel, V., MNRAS, Volume 364, Issue 4, pp. 1105-1134. * We use the same numerical coefficients as the Gadget-2 code. When used with * the Spline-3 kernel, the results should be equivalent to the ones obtained * with Gadget-2 up to the rounding errors and interactions missed by the * Gadget-2 tree-code neighbours search. */ #include "cache.h" #include "minmax.h" /** * @brief Density loop */ __attribute__((always_inline)) INLINE static void runner_iact_density( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { float wi, wi_dx; float wj, wj_dx; float dv[3], curlvr[3]; /* Get the masses. */ const float mi = pi->mass; const float mj = pj->mass; /* Get r and r inverse. */ const float r = sqrtf(r2); const float r_inv = 1.0f / r; /* Compute the kernel function for pi */ const float hi_inv = 1.f / hi; const float ui = r * hi_inv; kernel_deval(ui, &wi, &wi_dx); /* Compute contribution to the density */ pi->rho += mj * wi; pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx); /* Compute contribution to the number of neighbours */ pi->density.wcount += wi; pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx); /* Compute the kernel function for pj */ const float hj_inv = 1.f / hj; const float uj = r * hj_inv; kernel_deval(uj, &wj, &wj_dx); /* Compute contribution to the density */ pj->rho += mi * wj; pj->density.rho_dh -= mi * (hydro_dimension * wj + uj * wj_dx); /* Compute contribution to the number of neighbours */ pj->density.wcount += wj; pj->density.wcount_dh -= (hydro_dimension * wj + uj * wj_dx); const float faci = mj * wi_dx * r_inv; const float facj = mi * wj_dx * r_inv; /* Compute dv dot r */ dv[0] = pi->v[0] - pj->v[0]; dv[1] = pi->v[1] - pj->v[1]; dv[2] = pi->v[2] - pj->v[2]; const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2]; pi->density.div_v -= faci * dvdr; pj->density.div_v -= facj * dvdr; /* Compute dv cross r */ curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1]; curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2]; curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0]; pi->density.rot_v[0] += faci * curlvr[0]; pi->density.rot_v[1] += faci * curlvr[1]; pi->density.rot_v[2] += faci * curlvr[2]; pj->density.rot_v[0] += facj * curlvr[0]; pj->density.rot_v[1] += facj * curlvr[1]; pj->density.rot_v[2] += facj * curlvr[2]; } /** * @brief Density loop (Vectorized version) */ __attribute__((always_inline)) INLINE static void runner_iact_vec_density( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { #ifdef WITH_OLD_VECTORIZATION vector r, ri, r2, ui, uj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh; vector mi, mj; vector dx[3], dv[3]; vector vi[3], vj[3]; vector dvdr, div_vi, div_vj; vector curlvr[3], curl_vi[3], curl_vj[3]; int k, j; #if VEC_SIZE == 8 /* Get the masses. */ mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass, pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass); mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); /* Get each velocity component. */ for (k = 0; k < 3; k++) { vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); } /* Get each component of particle separation. * (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/ for (k = 0; k < 3; k++) dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], Dx[15 + k], Dx[18 + k], Dx[21 + k]); #elif VEC_SIZE == 4 mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass); mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); for (k = 0; k < 3; k++) { vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); } for (k = 0; k < 3; k++) dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); #else error("Unknown vector size."); #endif /* Get the radius and inverse radius. */ r2.v = vec_load(R2); ri = vec_reciprocal_sqrt(r2); r.v = r2.v * ri.v; hi.v = vec_load(Hi); hi_inv = vec_reciprocal(hi); ui.v = r.v * hi_inv.v; hj.v = vec_load(Hj); hj_inv = vec_reciprocal(hj); uj.v = r.v * hj_inv.v; /* Compute the kernel function. */ kernel_deval_vec(&ui, &wi, &wi_dx); kernel_deval_vec(&uj, &wj, &wj_dx); /* Compute dv. */ dv[0].v = vi[0].v - vj[0].v; dv[1].v = vi[1].v - vj[1].v; dv[2].v = vi[2].v - vj[2].v; /* Compute dv dot r */ dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v); dvdr.v = dvdr.v * ri.v; /* Compute dv cross r */ curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; for (k = 0; k < 3; k++) curlvr[k].v *= ri.v; /* Compute density of pi. */ rhoi.v = mj.v * wi.v; rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v); wcounti.v = wi.v; wcounti_dh.v = (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v); div_vi.v = mj.v * dvdr.v * wi_dx.v; for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; /* Compute density of pj. */ rhoj.v = mi.v * wj.v; rhoj_dh.v = mi.v * (vec_set1(hydro_dimension) * wj.v + uj.v * wj_dx.v); wcountj.v = wj.v; wcountj_dh.v = (vec_set1(hydro_dimension) * wj.v + uj.v * wj_dx.v); div_vj.v = mi.v * dvdr.v * wj_dx.v; for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v; /* Update particles. */ for (k = 0; k < VEC_SIZE; k++) { pi[k]->rho += rhoi.f[k]; pi[k]->density.rho_dh -= rhoi_dh.f[k]; pi[k]->density.wcount += wcounti.f[k]; pi[k]->density.wcount_dh -= wcounti_dh.f[k]; pi[k]->density.div_v -= div_vi.f[k]; for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k]; pj[k]->rho += rhoj.f[k]; pj[k]->density.rho_dh -= rhoj_dh.f[k]; pj[k]->density.wcount += wcountj.f[k]; pj[k]->density.wcount_dh -= wcountj_dh.f[k]; pj[k]->density.div_v -= div_vj.f[k]; for (j = 0; j < 3; j++) pj[k]->density.rot_v[j] += curl_vj[j].f[k]; } #else error( "The Gadget2 serial version of runner_iact_density was called when the " "vectorised version should have been used."); #endif } /** * @brief Density loop (non-symmetric version) */ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { float wi, wi_dx; float dv[3], curlvr[3]; /* Get the masses. */ const float mj = pj->mass; /* Get r and r inverse. */ const float r = sqrtf(r2); const float r_inv = 1.0f / r; /* Compute the kernel function */ const float hi_inv = 1.0f / hi; const float ui = r * hi_inv; kernel_deval(ui, &wi, &wi_dx); /* Compute contribution to the density */ pi->rho += mj * wi; pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx); /* Compute contribution to the number of neighbours */ pi->density.wcount += wi; pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx); const float fac = mj * wi_dx * r_inv; /* Compute dv dot r */ dv[0] = pi->v[0] - pj->v[0]; dv[1] = pi->v[1] - pj->v[1]; dv[2] = pi->v[2] - pj->v[2]; const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2]; pi->density.div_v -= fac * dvdr; /* Compute dv cross r */ curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1]; curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2]; curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0]; pi->density.rot_v[0] += fac * curlvr[0]; pi->density.rot_v[1] += fac * curlvr[1]; pi->density.rot_v[2] += fac * curlvr[2]; } /** * @brief Density loop (non-symmetric vectorized version) */ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { #ifdef WITH_OLD_VECTORIZATION vector r, ri, r2, ui, hi, hi_inv, wi, wi_dx; vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi; vector mj; vector dx[3], dv[3]; vector vi[3], vj[3]; vector dvdr; vector curlvr[3], curl_vi[3]; int k, j; #if VEC_SIZE == 8 /* Get the masses. */ mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); /* Get each velocity component. */ for (k = 0; k < 3; k++) { vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); } /* Get each component of particle separation. * (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/ for (k = 0; k < 3; k++) dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], Dx[15 + k], Dx[18 + k], Dx[21 + k]); #elif VEC_SIZE == 4 mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); for (k = 0; k < 3; k++) { vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); } for (k = 0; k < 3; k++) dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); #else error("Unknown vector size."); #endif /* Get the radius and inverse radius. */ r2.v = vec_load(R2); ri = vec_reciprocal_sqrt(r2); r.v = r2.v * ri.v; hi.v = vec_load(Hi); hi_inv = vec_reciprocal(hi); ui.v = r.v * hi_inv.v; kernel_deval_vec(&ui, &wi, &wi_dx); /* Compute dv. */ dv[0].v = vi[0].v - vj[0].v; dv[1].v = vi[1].v - vj[1].v; dv[2].v = vi[2].v - vj[2].v; /* Compute dv dot r */ dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v); dvdr.v = dvdr.v * ri.v; /* Compute dv cross r */ curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; for (k = 0; k < 3; k++) curlvr[k].v *= ri.v; /* Compute density of pi. */ rhoi.v = mj.v * wi.v; rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v); wcounti.v = wi.v; wcounti_dh.v = (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v); div_vi.v = mj.v * dvdr.v * wi_dx.v; for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; /* Update particles. */ for (k = 0; k < VEC_SIZE; k++) { pi[k]->rho += rhoi.f[k]; pi[k]->density.rho_dh -= rhoi_dh.f[k]; pi[k]->density.wcount += wcounti.f[k]; pi[k]->density.wcount_dh -= wcounti_dh.f[k]; pi[k]->density.div_v -= div_vi.f[k]; for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k]; } #else error( "The Gadget2 serial version of runner_iact_nonsym_density was called " "when the vectorised version should have been used."); #endif } #ifdef WITH_VECTORIZATION /** * @brief Density interaction computed using 1 vector * (non-symmetric vectorized version). */ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix, vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, mask_t mask) { vector r, ri, ui, wi, wi_dx; vector mj; vector dvx, dvy, dvz; vector vjx, vjy, vjz; vector dvdr; vector curlvrx, curlvry, curlvrz; /* Fill the vectors. */ mj.v = vec_load(Mj); vjx.v = vec_load(Vjx); vjy.v = vec_load(Vjy); vjz.v = vec_load(Vjz); /* Get the radius and inverse radius. */ ri = vec_reciprocal_sqrt(*r2); r.v = vec_mul(r2->v, ri.v); ui.v = vec_mul(r.v, hi_inv.v); /* Calculate the kernel for two particles. */ kernel_deval_1_vec(&ui, &wi, &wi_dx); /* Compute dv. */ dvx.v = vec_sub(vix.v, vjx.v); dvy.v = vec_sub(viy.v, vjy.v); dvz.v = vec_sub(viz.v, vjz.v); /* Compute dv dot r */ dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v))); dvdr.v = vec_mul(dvdr.v, ri.v); /* Compute dv cross r */ curlvrx.v = vec_fma(dvy.v, dz->v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy->v))); curlvry.v = vec_fma(dvz.v, dx->v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz->v))); curlvrz.v = vec_fma(dvx.v, dy->v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx->v))); curlvrx.v = vec_mul(curlvrx.v, ri.v); curlvry.v = vec_mul(curlvry.v, ri.v); curlvrz.v = vec_mul(curlvrz.v, ri.v); vector wcount_dh_update; wcount_dh_update.v = vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)); /* Mask updates to intermediate vector sums for particle pi. */ rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask); wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask); div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); curlvxSum->v = vec_mask_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); curlvySum->v = vec_mask_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); curlvzSum->v = vec_mask_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); } /** * @brief Density interaction computed using 2 interleaved vectors * (non-symmetric vectorized version). */ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix, vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, mask_t mask, mask_t mask2, short mask_cond) { vector r, ri, r2, ui, wi, wi_dx; vector mj; vector dx, dy, dz, dvx, dvy, dvz; vector vjx, vjy, vjz; vector dvdr; vector curlvrx, curlvry, curlvrz; vector r_2, ri2, r2_2, ui2, wi2, wi_dx2; vector mj2; vector dx2, dy2, dz2, dvx2, dvy2, dvz2; vector vjx2, vjy2, vjz2; vector dvdr2; vector curlvrx2, curlvry2, curlvrz2; /* Fill the vectors. */ mj.v = vec_load(Mj); mj2.v = vec_load(&Mj[VEC_SIZE]); vjx.v = vec_load(Vjx); vjx2.v = vec_load(&Vjx[VEC_SIZE]); vjy.v = vec_load(Vjy); vjy2.v = vec_load(&Vjy[VEC_SIZE]); vjz.v = vec_load(Vjz); vjz2.v = vec_load(&Vjz[VEC_SIZE]); dx.v = vec_load(Dx); dx2.v = vec_load(&Dx[VEC_SIZE]); dy.v = vec_load(Dy); dy2.v = vec_load(&Dy[VEC_SIZE]); dz.v = vec_load(Dz); dz2.v = vec_load(&Dz[VEC_SIZE]); /* Get the radius and inverse radius. */ r2.v = vec_load(R2); r2_2.v = vec_load(&R2[VEC_SIZE]); ri = vec_reciprocal_sqrt(r2); ri2 = vec_reciprocal_sqrt(r2_2); r.v = vec_mul(r2.v, ri.v); r_2.v = vec_mul(r2_2.v, ri2.v); ui.v = vec_mul(r.v, hi_inv.v); ui2.v = vec_mul(r_2.v, hi_inv.v); /* Calculate the kernel for two particles. */ kernel_deval_2_vec(&ui, &wi, &wi_dx, &ui2, &wi2, &wi_dx2); /* Compute dv. */ dvx.v = vec_sub(vix.v, vjx.v); dvx2.v = vec_sub(vix.v, vjx2.v); dvy.v = vec_sub(viy.v, vjy.v); dvy2.v = vec_sub(viy.v, vjy2.v); dvz.v = vec_sub(viz.v, vjz.v); dvz2.v = vec_sub(viz.v, vjz2.v); /* Compute dv dot r */ dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v))); dvdr2.v = vec_fma(dvx2.v, dx2.v, vec_fma(dvy2.v, dy2.v, vec_mul(dvz2.v, dz2.v))); dvdr.v = vec_mul(dvdr.v, ri.v); dvdr2.v = vec_mul(dvdr2.v, ri2.v); /* Compute dv cross r */ curlvrx.v = vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy.v))); curlvrx2.v = vec_fma(dvy2.v, dz2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz2.v, dy2.v))); curlvry.v = vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v))); curlvry2.v = vec_fma(dvz2.v, dx2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx2.v, dz2.v))); curlvrz.v = vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v))); curlvrz2.v = vec_fma(dvx2.v, dy2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy2.v, dx2.v))); curlvrx.v = vec_mul(curlvrx.v, ri.v); curlvrx2.v = vec_mul(curlvrx2.v, ri2.v); curlvry.v = vec_mul(curlvry.v, ri.v); curlvry2.v = vec_mul(curlvry2.v, ri2.v); curlvrz.v = vec_mul(curlvrz.v, ri.v); curlvrz2.v = vec_mul(curlvrz2.v, ri2.v); vector wcount_dh_update, wcount_dh_update2; wcount_dh_update.v = vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)); wcount_dh_update2.v = vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)); /* Mask updates to intermediate vector sums for particle pi. */ /* Mask only when needed. */ if (mask_cond) { rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2); rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask); rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v), mask2); wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2); wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask); wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update2.v, mask2); div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); div_vSum->v = vec_mask_sub( div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2); curlvxSum->v = vec_mask_add( curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask); curlvxSum->v = vec_mask_add( curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2); curlvySum->v = vec_mask_add( curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask); curlvySum->v = vec_mask_add( curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2); curlvzSum->v = vec_mask_add( curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask); curlvzSum->v = vec_mask_add( curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2); } else { rhoSum->v = vec_add(rhoSum->v, vec_mul(mj.v, wi.v)); rhoSum->v = vec_add(rhoSum->v, vec_mul(mj2.v, wi2.v)); rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v)); rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v)); wcountSum->v = vec_add(wcountSum->v, wi.v); wcountSum->v = vec_add(wcountSum->v, wi2.v); wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update.v); wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update2.v); div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v))); curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v))); curlvySum->v = vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v))); curlvySum->v = vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v))); curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v))); curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v))); } } #endif /** * @brief Force loop */ __attribute__((always_inline)) INLINE static void runner_iact_force( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { float wi, wj, wi_dx, wj_dx; const float fac_mu = 1.f; /* Will change with cosmological integration */ const float r = sqrtf(r2); const float r_inv = 1.0f / r; /* Get some values in local variables. */ const float mi = pi->mass; const float mj = pj->mass; const float rhoi = pi->rho; const float rhoj = pj->rho; /* Get the kernel for hi. */ const float hi_inv = 1.0f / hi; const float hid_inv = pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ const float ui = r * hi_inv; kernel_deval(ui, &wi, &wi_dx); const float wi_dr = hid_inv * wi_dx; /* Get the kernel for hj. */ const float hj_inv = 1.0f / hj; const float hjd_inv = pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ const float xj = r * hj_inv; kernel_deval(xj, &wj, &wj_dx); const float wj_dr = hjd_inv * wj_dx; /* Compute h-gradient terms */ const float f_i = pi->force.f; const float f_j = pj->force.f; /* Compute pressure terms */ const float P_over_rho2_i = pi->force.P_over_rho2; const float P_over_rho2_j = pj->force.P_over_rho2; /* Compute sound speeds */ const float ci = pi->force.soundspeed; const float cj = pj->force.soundspeed; /* Compute dv dot r. */ const float dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] + (pi->v[2] - pj->v[2]) * dx[2]; /* Balsara term */ const float balsara_i = pi->force.balsara; const float balsara_j = pj->force.balsara; /* Are the particles moving towards each others ? */ const float omega_ij = (dvdr < 0.f) ? dvdr : 0.f; const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ /* Signal velocity */ const float v_sig = ci + cj - 3.f * mu_ij; /* Now construct the full viscosity term */ const float rho_ij = 0.5f * (rhoi + rhoj); const float visc = -0.25f * const_viscosity_alpha * v_sig * mu_ij * (balsara_i + balsara_j) / rho_ij; /* Now, convolve with the kernel */ const float visc_term = 0.5f * visc * (wi_dr + wj_dr) * r_inv; const float sph_term = (f_i * P_over_rho2_i * wi_dr + f_j * P_over_rho2_j * wj_dr) * r_inv; /* Eventually got the acceleration */ const float acc = visc_term + sph_term; /* Use the force Luke ! */ pi->a_hydro[0] -= mj * acc * dx[0]; pi->a_hydro[1] -= mj * acc * dx[1]; pi->a_hydro[2] -= mj * acc * dx[2]; pj->a_hydro[0] += mi * acc * dx[0]; pj->a_hydro[1] += mi * acc * dx[1]; pj->a_hydro[2] += mi * acc * dx[2]; /* Get the time derivative for h. */ pi->force.h_dt -= mj * dvdr * r_inv / rhoj * wi_dr; pj->force.h_dt -= mi * dvdr * r_inv / rhoi * wj_dr; /* Update the signal velocity. */ pi->force.v_sig = (pi->force.v_sig > v_sig) ? pi->force.v_sig : v_sig; pj->force.v_sig = (pj->force.v_sig > v_sig) ? pj->force.v_sig : v_sig; /* Change in entropy */ pi->entropy_dt += mj * visc_term * dvdr; pj->entropy_dt += mi * visc_term * dvdr; } /** * @brief Force loop (Vectorized version) */ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { #ifdef WITH_OLD_VECTORIZATION vector r, r2, ri; vector xi, xj; vector hi, hj, hi_inv, hj_inv; vector hid_inv, hjd_inv; vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector piPOrho2, pjPOrho2, pirho, pjrho; vector mi, mj; vector f; vector grad_hi, grad_hj; vector dx[3]; vector vi[3], vj[3]; vector pia[3], pja[3]; vector pih_dt, pjh_dt; vector ci, cj, v_sig; vector omega_ij, mu_ij, fac_mu, balsara; vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; int j, k; fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ /* Load stuff. */ #if VEC_SIZE == 8 mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass, pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass); mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2, pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2, pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2); pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2, pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2, pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2); grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f, pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f); grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f, pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f); pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho, pi[5]->rho, pi[6]->rho, pi[7]->rho); pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho, pj[5]->rho, pj[6]->rho, pj[7]->rho); ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed, pi[4]->force.soundspeed, pi[5]->force.soundspeed, pi[6]->force.soundspeed, pi[7]->force.soundspeed); cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed, pj[4]->force.soundspeed, pj[5]->force.soundspeed, pj[6]->force.soundspeed, pj[7]->force.soundspeed); for (k = 0; k < 3; k++) { vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); } for (k = 0; k < 3; k++) dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], Dx[15 + k], Dx[18 + k], Dx[21 + k]); balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara, pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara, pi[6]->force.balsara, pi[7]->force.balsara) + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara, pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara, pj[6]->force.balsara, pj[7]->force.balsara); #elif VEC_SIZE == 4 mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass); mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2); pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2); grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f); grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f); pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho); pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho); ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed); cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed); for (k = 0; k < 3; k++) { vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); } for (k = 0; k < 3; k++) dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara, pi[3]->force.balsara) + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara, pj[3]->force.balsara); #else error("Unknown vector size."); #endif /* Get the radius and inverse radius. */ r2.v = vec_load(R2); ri = vec_reciprocal_sqrt(r2); r.v = r2.v * ri.v; /* Get the kernel for hi. */ hi.v = vec_load(Hi); hi_inv = vec_reciprocal(hi); hid_inv = pow_dimension_plus_one_vec(hi_inv); /* 1/h^(d+1) */ xi.v = r.v * hi_inv.v; kernel_deval_vec(&xi, &wi, &wi_dx); wi_dr.v = hid_inv.v * wi_dx.v; /* Get the kernel for hj. */ hj.v = vec_load(Hj); hj_inv = vec_reciprocal(hj); hjd_inv = pow_dimension_plus_one_vec(hj_inv); /* 1/h^(d+1) */ xj.v = r.v * hj_inv.v; kernel_deval_vec(&xj, &wj, &wj_dx); wj_dr.v = hjd_inv.v * wj_dx.v; /* Compute dv dot r. */ dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) + ((vi[2].v - vj[2].v) * dx[2].v); // dvdr.v = dvdr.v * ri.v; /* Compute the relative velocity. (This is 0 if the particles move away from * each other and negative otherwise) */ omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f)); mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */ /* Compute signal velocity */ v_sig.v = ci.v + cj.v - vec_set1(3.0f) * mu_ij.v; /* Now construct the full viscosity term */ rho_ij.v = vec_set1(0.5f) * (pirho.v + pjrho.v); visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v * mu_ij.v * balsara.v / rho_ij.v; /* Now, convolve with the kernel */ visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v; sph_term.v = (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * ri.v; /* Eventually get the acceleration */ acc.v = visc_term.v + sph_term.v; /* Use the force, Luke! */ for (k = 0; k < 3; k++) { f.v = dx[k].v * acc.v; pia[k].v = mj.v * f.v; pja[k].v = mi.v * f.v; } /* Get the time derivative for h. */ pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v; pjh_dt.v = mi.v * dvdr.v * ri.v / pirho.v * wj_dr.v; /* Change in entropy */ entropy_dt.v = visc_term.v * dvdr.v; /* Store the forces back on the particles. */ for (k = 0; k < VEC_SIZE; k++) { for (j = 0; j < 3; j++) { pi[k]->a_hydro[j] -= pia[j].f[k]; pj[k]->a_hydro[j] += pja[j].f[k]; } pi[k]->force.h_dt -= pih_dt.f[k]; pj[k]->force.h_dt -= pjh_dt.f[k]; pi[k]->force.v_sig = max(pi[k]->force.v_sig, v_sig.f[k]); pj[k]->force.v_sig = max(pj[k]->force.v_sig, v_sig.f[k]); pi[k]->entropy_dt += entropy_dt.f[k] * mj.f[k]; pj[k]->entropy_dt += entropy_dt.f[k] * mi.f[k]; } #else error( "The Gadget2 serial version of runner_iact_nonsym_force was called when " "the vectorised version should have been used."); #endif } /** * @brief Force loop (non-symmetric version) */ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { float wi, wj, wi_dx, wj_dx; const float fac_mu = 1.f; /* Will change with cosmological integration */ const float r = sqrtf(r2); const float r_inv = 1.0f / r; /* Get some values in local variables. */ // const float mi = pi->mass; const float mj = pj->mass; const float rhoi = pi->rho; const float rhoj = pj->rho; /* Get the kernel for hi. */ const float hi_inv = 1.0f / hi; const float hid_inv = pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ const float ui = r * hi_inv; kernel_deval(ui, &wi, &wi_dx); const float wi_dr = hid_inv * wi_dx; /* Get the kernel for hj. */ const float hj_inv = 1.0f / hj; const float hjd_inv = pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ const float xj = r * hj_inv; kernel_deval(xj, &wj, &wj_dx); const float wj_dr = hjd_inv * wj_dx; /* Compute h-gradient terms */ const float f_i = pi->force.f; const float f_j = pj->force.f; /* Compute pressure terms */ const float P_over_rho2_i = pi->force.P_over_rho2; const float P_over_rho2_j = pj->force.P_over_rho2; /* Compute sound speeds */ const float ci = pi->force.soundspeed; const float cj = pj->force.soundspeed; /* Compute dv dot r. */ const float dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] + (pi->v[2] - pj->v[2]) * dx[2]; /* Balsara term */ const float balsara_i = pi->force.balsara; const float balsara_j = pj->force.balsara; /* Are the particles moving towards each others ? */ const float omega_ij = (dvdr < 0.f) ? dvdr : 0.f; const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ /* Signal velocity */ const float v_sig = ci + cj - 3.f * mu_ij; /* Now construct the full viscosity term */ const float rho_ij = 0.5f * (rhoi + rhoj); const float visc = -0.25f * const_viscosity_alpha * v_sig * mu_ij * (balsara_i + balsara_j) / rho_ij; /* Now, convolve with the kernel */ const float visc_term = 0.5f * visc * (wi_dr + wj_dr) * r_inv; const float sph_term = (f_i * P_over_rho2_i * wi_dr + f_j * P_over_rho2_j * wj_dr) * r_inv; /* Eventually got the acceleration */ const float acc = visc_term + sph_term; /* Use the force Luke ! */ pi->a_hydro[0] -= mj * acc * dx[0]; pi->a_hydro[1] -= mj * acc * dx[1]; pi->a_hydro[2] -= mj * acc * dx[2]; /* Get the time derivative for h. */ pi->force.h_dt -= mj * dvdr * r_inv / rhoj * wi_dr; /* Update the signal velocity. */ pi->force.v_sig = (pi->force.v_sig > v_sig) ? pi->force.v_sig : v_sig; /* Change in entropy */ pi->entropy_dt += mj * visc_term * dvdr; } /** * @brief Force loop (Vectorized non-symmetric version) */ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { #ifdef WITH_OLD_VECTORIZATION vector r, r2, ri; vector xi, xj; vector hi, hj, hi_inv, hj_inv; vector hid_inv, hjd_inv; vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector piPOrho2, pjPOrho2, pirho, pjrho; vector mj; vector f; vector grad_hi, grad_hj; vector dx[3]; vector vi[3], vj[3]; vector pia[3]; vector pih_dt; vector ci, cj, v_sig; vector omega_ij, mu_ij, fac_mu, balsara; vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; int j, k; fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ /* Load stuff. */ #if VEC_SIZE == 8 mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2, pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2, pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2); pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2, pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2, pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2); grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f, pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f); grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f, pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f); pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho, pi[5]->rho, pi[6]->rho, pi[7]->rho); pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho, pj[5]->rho, pj[6]->rho, pj[7]->rho); ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed, pi[4]->force.soundspeed, pi[5]->force.soundspeed, pi[6]->force.soundspeed, pi[7]->force.soundspeed); cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed, pj[4]->force.soundspeed, pj[5]->force.soundspeed, pj[6]->force.soundspeed, pj[7]->force.soundspeed); for (k = 0; k < 3; k++) { vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); } for (k = 0; k < 3; k++) dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], Dx[15 + k], Dx[18 + k], Dx[21 + k]); balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara, pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara, pi[6]->force.balsara, pi[7]->force.balsara) + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara, pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara, pj[6]->force.balsara, pj[7]->force.balsara); #elif VEC_SIZE == 4 mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2, pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2); pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2, pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2); grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f); grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f); pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho); pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho); ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed, pi[2]->force.soundspeed, pi[3]->force.soundspeed); cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed, pj[2]->force.soundspeed, pj[3]->force.soundspeed); for (k = 0; k < 3; k++) { vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); } for (k = 0; k < 3; k++) dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara, pi[3]->force.balsara) + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara, pj[3]->force.balsara); #else error("Unknown vector size."); #endif /* Get the radius and inverse radius. */ r2.v = vec_load(R2); ri = vec_reciprocal_sqrt(r2); r.v = r2.v * ri.v; /* Get the kernel for hi. */ hi.v = vec_load(Hi); hi_inv = vec_reciprocal(hi); hid_inv = pow_dimension_plus_one_vec(hi_inv); xi.v = r.v * hi_inv.v; kernel_deval_vec(&xi, &wi, &wi_dx); wi_dr.v = hid_inv.v * wi_dx.v; /* Get the kernel for hj. */ hj.v = vec_load(Hj); hj_inv = vec_reciprocal(hj); hjd_inv = pow_dimension_plus_one_vec(hj_inv); xj.v = r.v * hj_inv.v; kernel_deval_vec(&xj, &wj, &wj_dx); wj_dr.v = hjd_inv.v * wj_dx.v; /* Compute dv dot r. */ dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) + ((vi[2].v - vj[2].v) * dx[2].v); // dvdr.v = dvdr.v * ri.v; /* Compute the relative velocity. (This is 0 if the particles move away from * each other and negative otherwise) */ omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f)); mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */ /* Compute signal velocity */ v_sig.v = ci.v + cj.v - vec_set1(3.0f) * mu_ij.v; /* Now construct the full viscosity term */ rho_ij.v = vec_set1(0.5f) * (pirho.v + pjrho.v); visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v * mu_ij.v * balsara.v / rho_ij.v; /* Now, convolve with the kernel */ visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v; sph_term.v = (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * ri.v; /* Eventually get the acceleration */ acc.v = visc_term.v + sph_term.v; /* Use the force, Luke! */ for (k = 0; k < 3; k++) { f.v = dx[k].v * acc.v; pia[k].v = mj.v * f.v; } /* Get the time derivative for h. */ pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v; /* Change in entropy */ entropy_dt.v = mj.v * visc_term.v * dvdr.v; /* Store the forces back on the particles. */ for (k = 0; k < VEC_SIZE; k++) { for (j = 0; j < 3; j++) pi[k]->a_hydro[j] -= pia[j].f[k]; pi[k]->force.h_dt -= pih_dt.f[k]; pi[k]->force.v_sig = max(pi[k]->force.v_sig, v_sig.f[k]); pi[k]->entropy_dt += entropy_dt.f[k]; } #else error( "The Gadget2 serial version of runner_iact_nonsym_force was called when " "the vectorised version should have been used."); #endif } #ifdef WITH_VECTORIZATION __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force( vector *r2, vector *dx, vector *dy, vector *dz, vector vix, vector viy, vector viz, vector pirho, vector grad_hi, vector piPOrho2, vector balsara_i, vector ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector hi_inv, float *Hj_inv, vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, mask_t mask) { #ifdef WITH_VECTORIZATION vector r, ri; vector vjx, vjy, vjz; vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv; vector xi, xj; vector hid_inv, hjd_inv; vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector piax, piay, piaz; vector pih_dt; vector v_sig; vector omega_ij, mu_ij, fac_mu, balsara; vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; /* Fill vectors. */ vjx.v = vec_load(Vjx); vjy.v = vec_load(Vjy); vjz.v = vec_load(Vjz); mj.v = vec_load(Mj); pjrho.v = vec_load(Pjrho); grad_hj.v = vec_load(Grad_hj); pjPOrho2.v = vec_load(PjPOrho2); balsara_j.v = vec_load(Balsara_j); cj.v = vec_load(Cj); hj_inv.v = vec_load(Hj_inv); fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ /* Load stuff. */ balsara.v = balsara_i.v + balsara_j.v; /* Get the radius and inverse radius. */ ri = vec_reciprocal_sqrt(*r2); r.v = r2->v * ri.v; /* Get the kernel for hi. */ hid_inv = pow_dimension_plus_one_vec(hi_inv); xi.v = r.v * hi_inv.v; kernel_eval_dWdx_force_vec(&xi, &wi_dx); wi_dr.v = hid_inv.v * wi_dx.v; /* Get the kernel for hj. */ hjd_inv = pow_dimension_plus_one_vec(hj_inv); xj.v = r.v * hj_inv.v; /* Calculate the kernel for two particles. */ kernel_eval_dWdx_force_vec(&xj, &wj_dx); wj_dr.v = hjd_inv.v * wj_dx.v; /* Compute dv dot r. */ dvdr.v = ((vix.v - vjx.v) * dx->v) + ((viy.v - vjy.v) * dy->v) + ((viz.v - vjz.v) * dz->v); /* Compute the relative velocity. (This is 0 if the particles move away from * each other and negative otherwise) */ omega_ij.v = vec_fmin(dvdr.v, vec_setzero()); mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */ /* Compute signal velocity */ v_sig.v = ci.v + cj.v - vec_set1(3.0f) * mu_ij.v; /* Now construct the full viscosity term */ rho_ij.v = vec_set1(0.5f) * (pirho.v + pjrho.v); visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v * mu_ij.v * balsara.v / rho_ij.v; /* Now, convolve with the kernel */ visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v; sph_term.v = (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * ri.v; /* Eventually get the acceleration */ acc.v = visc_term.v + sph_term.v; /* Use the force, Luke! */ piax.v = mj.v * dx->v * acc.v; piay.v = mj.v * dy->v * acc.v; piaz.v = mj.v * dz->v * acc.v; /* Get the time derivative for h. */ pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v; /* Change in entropy */ entropy_dt.v = mj.v * visc_term.v * dvdr.v; /* Store the forces back on the particles. */ a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask); a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask); a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz.v, mask); h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt.v, mask); v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig.v, mask)); entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask); #else error( "The Gadget2 serial version of runner_iact_nonsym_force was called when " "the vectorised version should have been used."); #endif } __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force( float *R2, float *Dx, float *Dy, float *Dz, vector vix, vector viy, vector viz, vector pirho, vector grad_hi, vector piPOrho2, vector balsara_i, vector ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector hi_inv, float *Hj_inv, vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, mask_t mask, mask_t mask_2, short mask_cond) { #ifdef WITH_VECTORIZATION vector r, r2, ri; vector dx, dy, dz, dvx, dvy, dvz; vector vjx, vjy, vjz; vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv; vector ui, uj; vector hid_inv, hjd_inv; vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector piax, piay, piaz; vector pih_dt; vector v_sig; vector omega_ij, mu_ij, fac_mu, balsara; vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; vector r_2, r2_2, ri_2; vector dx_2, dy_2, dz_2, dvx_2, dvy_2, dvz_2; vector vjx_2, vjy_2, vjz_2; vector pjrho_2, grad_hj_2, pjPOrho2_2, balsara_j_2, cj_2, mj_2, hj_inv_2; vector ui_2, uj_2; vector hjd_inv_2; vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2; vector piax_2, piay_2, piaz_2; vector pih_dt_2; vector v_sig_2; vector omega_ij_2, mu_ij_2, balsara_2; vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2; /* Fill vectors. */ mj.v = vec_load(Mj); mj_2.v = vec_load(&Mj[VEC_SIZE]); vjx.v = vec_load(Vjx); vjx_2.v = vec_load(&Vjx[VEC_SIZE]); vjy.v = vec_load(Vjy); vjy_2.v = vec_load(&Vjy[VEC_SIZE]); vjz.v = vec_load(Vjz); vjz_2.v = vec_load(&Vjz[VEC_SIZE]); dx.v = vec_load(Dx); dx_2.v = vec_load(&Dx[VEC_SIZE]); dy.v = vec_load(Dy); dy_2.v = vec_load(&Dy[VEC_SIZE]); dz.v = vec_load(Dz); dz_2.v = vec_load(&Dz[VEC_SIZE]); /* Get the radius and inverse radius. */ r2.v = vec_load(R2); r2_2.v = vec_load(&R2[VEC_SIZE]); ri = vec_reciprocal_sqrt(r2); ri_2 = vec_reciprocal_sqrt(r2_2); r.v = vec_mul(r2.v, ri.v); r_2.v = vec_mul(r2_2.v, ri_2.v); /* Get remaining properties. */ pjrho.v = vec_load(Pjrho); pjrho_2.v = vec_load(&Pjrho[VEC_SIZE]); grad_hj.v = vec_load(Grad_hj); grad_hj_2.v = vec_load(&Grad_hj[VEC_SIZE]); pjPOrho2.v = vec_load(PjPOrho2); pjPOrho2_2.v = vec_load(&PjPOrho2[VEC_SIZE]); balsara_j.v = vec_load(Balsara_j); balsara_j_2.v = vec_load(&Balsara_j[VEC_SIZE]); cj.v = vec_load(Cj); cj_2.v = vec_load(&Cj[VEC_SIZE]); hj_inv.v = vec_load(Hj_inv); hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]); fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ /* Find the balsara switch. */ balsara.v = vec_add(balsara_i.v, balsara_j.v); balsara_2.v = vec_add(balsara_i.v, balsara_j_2.v); /* Get the kernel for hi. */ hid_inv = pow_dimension_plus_one_vec(hi_inv); ui.v = vec_mul(r.v, hi_inv.v); ui_2.v = vec_mul(r_2.v, hi_inv.v); kernel_eval_dWdx_force_vec(&ui, &wi_dx); kernel_eval_dWdx_force_vec(&ui_2, &wi_dx_2); wi_dr.v = vec_mul(hid_inv.v, wi_dx.v); wi_dr_2.v = vec_mul(hid_inv.v, wi_dx_2.v); /* Get the kernel for hj. */ hjd_inv = pow_dimension_plus_one_vec(hj_inv); hjd_inv_2 = pow_dimension_plus_one_vec(hj_inv_2); uj.v = vec_mul(r.v, hj_inv.v); uj_2.v = vec_mul(r_2.v, hj_inv_2.v); /* Calculate the kernel for two particles. */ kernel_eval_dWdx_force_vec(&uj, &wj_dx); kernel_eval_dWdx_force_vec(&uj_2, &wj_dx_2); wj_dr.v = vec_mul(hjd_inv.v, wj_dx.v); wj_dr_2.v = vec_mul(hjd_inv_2.v, wj_dx_2.v); /* Compute dv. */ dvx.v = vec_sub(vix.v, vjx.v); dvx_2.v = vec_sub(vix.v, vjx_2.v); dvy.v = vec_sub(viy.v, vjy.v); dvy_2.v = vec_sub(viy.v, vjy_2.v); dvz.v = vec_sub(viz.v, vjz.v); dvz_2.v = vec_sub(viz.v, vjz_2.v); /* Compute dv dot r. */ dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v))); dvdr_2.v = vec_fma(dvx_2.v, dx_2.v, vec_fma(dvy_2.v, dy_2.v, vec_mul(dvz_2.v, dz_2.v))); /* Compute the relative velocity. (This is 0 if the particles move away from * each other and negative otherwise) */ omega_ij.v = vec_fmin(dvdr.v, vec_setzero()); omega_ij_2.v = vec_fmin(dvdr_2.v, vec_setzero()); mu_ij.v = vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v)); /* This is 0 or negative */ mu_ij_2.v = vec_mul(fac_mu.v, vec_mul(ri_2.v, omega_ij_2.v)); /* This is 0 or negative */ /* Compute signal velocity */ v_sig.v = vec_fnma(vec_set1(3.f), mu_ij.v, vec_add(ci.v, cj.v)); v_sig_2.v = vec_fnma(vec_set1(3.f), mu_ij_2.v, vec_add(ci.v, cj_2.v)); /* Now construct the full viscosity term */ rho_ij.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho.v)); rho_ij_2.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho_2.v)); vector const_viscosity_alpha_fac; const_viscosity_alpha_fac.v = vec_set1(-0.25f * const_viscosity_alpha); visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v, vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))), rho_ij.v); visc_2.v = vec_div(vec_mul(const_viscosity_alpha_fac.v, vec_mul(v_sig_2.v, vec_mul(mu_ij_2.v, balsara_2.v))), rho_ij_2.v); /* Now, convolve with the kernel */ visc_term.v = vec_mul(vec_set1(0.5f), vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v))); visc_term_2.v = vec_mul(vec_set1(0.5f), vec_mul(visc_2.v, vec_mul(vec_add(wi_dr_2.v, wj_dr_2.v), ri_2.v))); vector grad_hi_mul_piPOrho2; grad_hi_mul_piPOrho2.v = vec_mul(grad_hi.v, piPOrho2.v); sph_term.v = vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr.v, vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))), ri.v); sph_term_2.v = vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr_2.v, vec_mul(grad_hj_2.v, vec_mul(pjPOrho2_2.v, wj_dr_2.v))), ri_2.v); /* Eventually get the acceleration */ acc.v = vec_add(visc_term.v, sph_term.v); acc_2.v = vec_add(visc_term_2.v, sph_term_2.v); /* Use the force, Luke! */ piax.v = vec_mul(mj.v, vec_mul(dx.v, acc.v)); piax_2.v = vec_mul(mj_2.v, vec_mul(dx_2.v, acc_2.v)); piay.v = vec_mul(mj.v, vec_mul(dy.v, acc.v)); piay_2.v = vec_mul(mj_2.v, vec_mul(dy_2.v, acc_2.v)); piaz.v = vec_mul(mj.v, vec_mul(dz.v, acc.v)); piaz_2.v = vec_mul(mj_2.v, vec_mul(dz_2.v, acc_2.v)); // for(int i=0; iv = vec_mask_sub(a_hydro_xSum->v, piax.v, mask); a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax_2.v, mask_2); a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask); a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay_2.v, mask_2); a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz.v, mask); a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz_2.v, mask_2); h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt.v, mask); h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt_2.v, mask_2); v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig.v, mask)); v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig_2.v, mask_2)); entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask); entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt_2.v, mask_2); } else { a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax.v); a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax_2.v); a_hydro_ySum->v = vec_sub(a_hydro_ySum->v, piay.v); a_hydro_ySum->v = vec_sub(a_hydro_ySum->v, piay_2.v); a_hydro_zSum->v = vec_sub(a_hydro_zSum->v, piaz.v); a_hydro_zSum->v = vec_sub(a_hydro_zSum->v, piaz_2.v); h_dtSum->v = vec_sub(h_dtSum->v, pih_dt.v); h_dtSum->v = vec_sub(h_dtSum->v, pih_dt_2.v); v_sigSum->v = vec_fmax(v_sigSum->v, v_sig.v); v_sigSum->v = vec_fmax(v_sigSum->v, v_sig_2.v); entropy_dtSum->v = vec_add(entropy_dtSum->v, entropy_dt.v); entropy_dtSum->v = vec_add(entropy_dtSum->v, entropy_dt_2.v); } #else error( "The Gadget2 serial version of runner_iact_nonsym_force was called when " "the vectorised version should have been used."); #endif } #endif #endif /* SWIFT_GADGET2_HYDRO_IACT_H */