Skip to content
Snippets Groups Projects
Commit bebc572a authored by James Willis's avatar James Willis
Browse files

Created an inline function to perform vector loads.

parent ea3ac791
No related branches found
No related tags found
1 merge request: !440 Dopair2 vectorisation
...@@ -170,17 +170,15 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, ...@@ -170,17 +170,15 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
mask_t mask) { mask_t mask) {
vector r, ri, ui, wi, wi_dx; vector r, ri, ui, wi, wi_dx;
vector mj;
vector dvx, dvy, dvz; vector dvx, dvy, dvz;
vector vjx, vjy, vjz;
vector dvdr; vector dvdr;
vector curlvrx, curlvry, curlvrz; vector curlvrx, curlvry, curlvrz;
/* Fill the vectors. */ /* Fill the vectors. */
mj.v = vec_load(Mj); const vector mj = vector_load(Mj);
vjx.v = vec_load(Vjx); const vector vjx = vector_load(Vjx);
vjy.v = vec_load(Vjy); const vector vjy = vector_load(Vjy);
vjz.v = vec_load(Vjz); const vector vjz = vector_load(Vjz);
/* Get the radius and inverse radius. */ /* Get the radius and inverse radius. */
ri = vec_reciprocal_sqrt(*r2); ri = vec_reciprocal_sqrt(*r2);
...@@ -245,38 +243,34 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, ...@@ -245,38 +243,34 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
vector *curlvySum, vector *curlvzSum, vector *curlvySum, vector *curlvzSum,
mask_t mask, mask_t mask2, short mask_cond) { mask_t mask, mask_t mask2, short mask_cond) {
vector r, ri, r2, ui, wi, wi_dx; vector r, ri, ui, wi, wi_dx;
vector mj; vector dvx, dvy, dvz;
vector dx, dy, dz, dvx, dvy, dvz;
vector vjx, vjy, vjz;
vector dvdr; vector dvdr;
vector curlvrx, curlvry, curlvrz; vector curlvrx, curlvry, curlvrz;
vector r_2, ri2, r2_2, ui2, wi2, wi_dx2; vector r_2, ri2, ui2, wi2, wi_dx2;
vector mj2; vector dvx2, dvy2, dvz2;
vector dx2, dy2, dz2, dvx2, dvy2, dvz2;
vector vjx2, vjy2, vjz2;
vector dvdr2; vector dvdr2;
vector curlvrx2, curlvry2, curlvrz2; vector curlvrx2, curlvry2, curlvrz2;
/* Fill the vectors. */ /* Fill the vectors. */
mj.v = vec_load(Mj); const vector mj = vector_load(Mj);
mj2.v = vec_load(&Mj[VEC_SIZE]); const vector mj2 = vector_load(&Mj[VEC_SIZE]);
vjx.v = vec_load(Vjx); const vector vjx = vector_load(Vjx);
vjx2.v = vec_load(&Vjx[VEC_SIZE]); const vector vjx2 = vector_load(&Vjx[VEC_SIZE]);
vjy.v = vec_load(Vjy); const vector vjy = vector_load(Vjy);
vjy2.v = vec_load(&Vjy[VEC_SIZE]); const vector vjy2 = vector_load(&Vjy[VEC_SIZE]);
vjz.v = vec_load(Vjz); const vector vjz = vector_load(Vjz);
vjz2.v = vec_load(&Vjz[VEC_SIZE]); const vector vjz2 = vector_load(&Vjz[VEC_SIZE]);
dx.v = vec_load(Dx); const vector dx = vector_load(Dx);
dx2.v = vec_load(&Dx[VEC_SIZE]); const vector dx2 = vector_load(&Dx[VEC_SIZE]);
dy.v = vec_load(Dy); const vector dy = vector_load(Dy);
dy2.v = vec_load(&Dy[VEC_SIZE]); const vector dy2 = vector_load(&Dy[VEC_SIZE]);
dz.v = vec_load(Dz); const vector dz = vector_load(Dz);
dz2.v = vec_load(&Dz[VEC_SIZE]); const vector dz2 = vector_load(&Dz[VEC_SIZE]);
/* Get the radius and inverse radius. */ /* Get the radius and inverse radius. */
r2.v = vec_load(R2); const vector r2 = vector_load(R2);
r2_2.v = vec_load(&R2[VEC_SIZE]); const vector r2_2 = vector_load(&R2[VEC_SIZE]);
ri = vec_reciprocal_sqrt(r2); ri = vec_reciprocal_sqrt(r2);
ri2 = vec_reciprocal_sqrt(r2_2); ri2 = vec_reciprocal_sqrt(r2_2);
r.v = vec_mul(r2.v, ri.v); r.v = vec_mul(r2.v, ri.v);
...@@ -592,8 +586,7 @@ runner_iact_nonsym_1_vec_force( ...@@ -592,8 +586,7 @@ runner_iact_nonsym_1_vec_force(
#ifdef WITH_VECTORIZATION #ifdef WITH_VECTORIZATION
vector r, ri; vector r, ri;
vector vjx, vjy, vjz, dvx, dvy, dvz; vector dvx, dvy, dvz;
vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj;
vector xi, xj; vector xi, xj;
vector hid_inv, hjd_inv; vector hid_inv, hjd_inv;
vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
...@@ -604,16 +597,15 @@ runner_iact_nonsym_1_vec_force( ...@@ -604,16 +597,15 @@ runner_iact_nonsym_1_vec_force(
vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
/* Fill vectors. */ /* Fill vectors. */
vjx.v = vec_load(Vjx); const vector vjx = vector_load(Vjx);
vjy.v = vec_load(Vjy); const vector vjy = vector_load(Vjy);
vjz.v = vec_load(Vjz); const vector vjz = vector_load(Vjz);
mj.v = vec_load(Mj); const vector mj = vector_load(Mj);
const vector pjrho = vector_load(Pjrho);
pjrho.v = vec_load(Pjrho); const vector grad_hj = vector_load(Grad_hj);
grad_hj.v = vec_load(Grad_hj); const vector pjPOrho2 = vector_load(PjPOrho2);
pjPOrho2.v = vec_load(PjPOrho2); const vector balsara_j = vector_load(Balsara_j);
balsara_j.v = vec_load(Balsara_j); const vector cj = vector_load(Cj);
cj.v = vec_load(Cj);
fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
...@@ -720,10 +712,8 @@ runner_iact_nonsym_2_vec_force( ...@@ -720,10 +712,8 @@ runner_iact_nonsym_2_vec_force(
#ifdef WITH_VECTORIZATION #ifdef WITH_VECTORIZATION
vector r, r2, ri; vector r, ri;
vector dx, dy, dz, dvx, dvy, dvz; vector dvx, dvy, dvz;
vector vjx, vjy, vjz;
vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv;
vector ui, uj; vector ui, uj;
vector hid_inv, hjd_inv; vector hid_inv, hjd_inv;
vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
...@@ -733,10 +723,8 @@ runner_iact_nonsym_2_vec_force( ...@@ -733,10 +723,8 @@ runner_iact_nonsym_2_vec_force(
vector omega_ij, mu_ij, fac_mu, balsara; vector omega_ij, mu_ij, fac_mu, balsara;
vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
vector r_2, r2_2, ri_2; vector r_2, ri_2;
vector dx_2, dy_2, dz_2, dvx_2, dvy_2, dvz_2; vector dvx_2, dvy_2, dvz_2;
vector vjx_2, vjy_2, vjz_2;
vector pjrho_2, grad_hj_2, pjPOrho2_2, balsara_j_2, cj_2, mj_2, hj_inv_2;
vector ui_2, uj_2; vector ui_2, uj_2;
vector hjd_inv_2; vector hjd_inv_2;
vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2; vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2;
...@@ -747,42 +735,42 @@ runner_iact_nonsym_2_vec_force( ...@@ -747,42 +735,42 @@ runner_iact_nonsym_2_vec_force(
vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2; vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2;
/* Fill vectors. */ /* Fill vectors. */
mj.v = vec_load(Mj); const vector mj = vector_load(Mj);
mj_2.v = vec_load(&Mj[VEC_SIZE]); const vector mj_2 = vector_load(&Mj[VEC_SIZE]);
vjx.v = vec_load(Vjx); const vector vjx = vector_load(Vjx);
vjx_2.v = vec_load(&Vjx[VEC_SIZE]); const vector vjx_2 = vector_load(&Vjx[VEC_SIZE]);
vjy.v = vec_load(Vjy); const vector vjy = vector_load(Vjy);
vjy_2.v = vec_load(&Vjy[VEC_SIZE]); const vector vjy_2 = vector_load(&Vjy[VEC_SIZE]);
vjz.v = vec_load(Vjz); const vector vjz = vector_load(Vjz);
vjz_2.v = vec_load(&Vjz[VEC_SIZE]); const vector vjz_2 = vector_load(&Vjz[VEC_SIZE]);
dx.v = vec_load(Dx); const vector dx = vector_load(Dx);
dx_2.v = vec_load(&Dx[VEC_SIZE]); const vector dx_2 = vector_load(&Dx[VEC_SIZE]);
dy.v = vec_load(Dy); const vector dy = vector_load(Dy);
dy_2.v = vec_load(&Dy[VEC_SIZE]); const vector dy_2 = vector_load(&Dy[VEC_SIZE]);
dz.v = vec_load(Dz); const vector dz = vector_load(Dz);
dz_2.v = vec_load(&Dz[VEC_SIZE]); const vector dz_2 = vector_load(&Dz[VEC_SIZE]);
/* Get the radius and inverse radius. */ /* Get the radius and inverse radius. */
r2.v = vec_load(R2); const vector r2 = vector_load(R2);
r2_2.v = vec_load(&R2[VEC_SIZE]); const vector r2_2 = vector_load(&R2[VEC_SIZE]);
ri = vec_reciprocal_sqrt(r2); ri = vec_reciprocal_sqrt(r2);
ri_2 = vec_reciprocal_sqrt(r2_2); ri_2 = vec_reciprocal_sqrt(r2_2);
r.v = vec_mul(r2.v, ri.v); r.v = vec_mul(r2.v, ri.v);
r_2.v = vec_mul(r2_2.v, ri_2.v); r_2.v = vec_mul(r2_2.v, ri_2.v);
/* Get remaining properties. */ /* Get remaining properties. */
pjrho.v = vec_load(Pjrho); const vector pjrho = vector_load(Pjrho);
pjrho_2.v = vec_load(&Pjrho[VEC_SIZE]); const vector pjrho_2 = vector_load(&Pjrho[VEC_SIZE]);
grad_hj.v = vec_load(Grad_hj); const vector grad_hj = vector_load(Grad_hj);
grad_hj_2.v = vec_load(&Grad_hj[VEC_SIZE]); const vector grad_hj_2 = vector_load(&Grad_hj[VEC_SIZE]);
pjPOrho2.v = vec_load(PjPOrho2); const vector pjPOrho2 = vector_load(PjPOrho2);
pjPOrho2_2.v = vec_load(&PjPOrho2[VEC_SIZE]); const vector pjPOrho2_2 = vector_load(&PjPOrho2[VEC_SIZE]);
balsara_j.v = vec_load(Balsara_j); const vector balsara_j = vector_load(Balsara_j);
balsara_j_2.v = vec_load(&Balsara_j[VEC_SIZE]); const vector balsara_j_2 = vector_load(&Balsara_j[VEC_SIZE]);
cj.v = vec_load(Cj); const vector cj = vector_load(Cj);
cj_2.v = vec_load(&Cj[VEC_SIZE]); const vector cj_2 = vector_load(&Cj[VEC_SIZE]);
hj_inv.v = vec_load(Hj_inv); const vector hj_inv = vector_load(Hj_inv);
hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]); const vector hj_inv_2 = vector_load(&Hj_inv[VEC_SIZE]);
fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
......
...@@ -615,21 +615,18 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( ...@@ -615,21 +615,18 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
} }
} }
vector v_pjx, v_pjy, v_pjz;
vector v_pjx2, v_pjy2, v_pjz2;
/* Find all of particle pi's interactions and store needed values in the /* Find all of particle pi's interactions and store needed values in the
* secondary cache.*/ * secondary cache.*/
for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
/* Load 2 sets of vectors from the particle cache. */ /* Load 2 sets of vectors from the particle cache. */
v_pjx.v = vec_load(&cell_cache->x[pjd]); const vector v_pjx = vector_load(&cell_cache->x[pjd]);
v_pjy.v = vec_load(&cell_cache->y[pjd]); const vector v_pjy = vector_load(&cell_cache->y[pjd]);
v_pjz.v = vec_load(&cell_cache->z[pjd]); const vector v_pjz = vector_load(&cell_cache->z[pjd]);
v_pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); const vector v_pjx2 = vector_load(&cell_cache->x[pjd + VEC_SIZE]);
v_pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); const vector v_pjy2 = vector_load(&cell_cache->y[pjd + VEC_SIZE]);
v_pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); const vector v_pjz2 = vector_load(&cell_cache->z[pjd + VEC_SIZE]);
/* Compute the pairwise distance. */ /* Compute the pairwise distance. */
vector v_dx, v_dy, v_dz; vector v_dx, v_dy, v_dz;
...@@ -848,11 +845,11 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( ...@@ -848,11 +845,11 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
/* Load 1 set of vectors from the particle cache. */ /* Load 1 set of vectors from the particle cache. */
vector v_pjx, v_pjy, v_pjz, hj, hjg2; vector hjg2;
v_pjx.v = vec_load(&cell_cache->x[pjd]); const vector v_pjx = vector_load(&cell_cache->x[pjd]);
v_pjy.v = vec_load(&cell_cache->y[pjd]); const vector v_pjy = vector_load(&cell_cache->y[pjd]);
v_pjz.v = vec_load(&cell_cache->z[pjd]); const vector v_pjz = vector_load(&cell_cache->z[pjd]);
hj.v = vec_load(&cell_cache->h[pjd]); const vector hj = vector_load(&cell_cache->h[pjd]);
hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v);
/* Compute the pairwise distance. */ /* Compute the pairwise distance. */
...@@ -1094,7 +1091,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, ...@@ -1094,7 +1091,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
/* Get the cache index to the jth particle. */ /* Get the cache index to the jth particle. */
const int cj_cache_idx = pjd; const int cj_cache_idx = pjd;
vector v_pjx, v_pjy, v_pjz;
vector v_dx, v_dy, v_dz, v_r2; vector v_dx, v_dy, v_dz, v_r2;
#ifdef SWIFT_DEBUG_CHECKS #ifdef SWIFT_DEBUG_CHECKS
...@@ -1106,9 +1102,9 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, ...@@ -1106,9 +1102,9 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
#endif #endif
/* Load 1 set of vectors from the particle cache. */ /* Load 1 set of vectors from the particle cache. */
v_pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]);
v_pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]);
v_pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]);
/* Compute the pairwise distance. */ /* Compute the pairwise distance. */
v_dx.v = vec_sub(v_pix.v, v_pjx.v); v_dx.v = vec_sub(v_pix.v, v_pjx.v);
...@@ -1224,13 +1220,12 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, ...@@ -1224,13 +1220,12 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
} }
#endif #endif
vector v_pix, v_piy, v_piz;
vector v_dx, v_dy, v_dz, v_r2; vector v_dx, v_dy, v_dz, v_r2;
/* Load 1 set of vectors from the particle cache. */ /* Load 1 set of vectors from the particle cache. */
v_pix.v = vec_load(&ci_cache->x[ci_cache_idx]); const vector v_pix = vector_load(&ci_cache->x[ci_cache_idx]);
v_piy.v = vec_load(&ci_cache->y[ci_cache_idx]); const vector v_piy = vector_load(&ci_cache->y[ci_cache_idx]);
v_piz.v = vec_load(&ci_cache->z[ci_cache_idx]); const vector v_piz = vector_load(&ci_cache->z[ci_cache_idx]);
/* Compute the pairwise distance. */ /* Compute the pairwise distance. */
v_dx.v = vec_sub(v_pjx.v, v_pix.v); v_dx.v = vec_sub(v_pjx.v, v_pix.v);
...@@ -1469,8 +1464,8 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, ...@@ -1469,8 +1464,8 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
/* Get the cache index to the jth particle. */ /* Get the cache index to the jth particle. */
const int cj_cache_idx = pjd; const int cj_cache_idx = pjd;
vector v_dx, v_dy, v_dz; vector v_dx, v_dy, v_dz, v_r2;
vector v_pjx, v_pjy, v_pjz, v_hj, v_hjg2, v_r2; vector v_hjg2;
#ifdef SWIFT_DEBUG_CHECKS #ifdef SWIFT_DEBUG_CHECKS
if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 || if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 ||
...@@ -1481,10 +1476,10 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, ...@@ -1481,10 +1476,10 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
#endif #endif
/* Load 1 set of vectors from the particle cache. */ /* Load 1 set of vectors from the particle cache. */
v_pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]);
v_pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]);
v_pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]);
v_hj.v = vec_load(&cj_cache->h[cj_cache_idx]); const vector v_hj = vector_load(&cj_cache->h[cj_cache_idx]);
v_hjg2.v = vec_mul(vec_mul(v_hj.v, v_hj.v), kernel_gamma2_vec.v); v_hjg2.v = vec_mul(vec_mul(v_hj.v, v_hj.v), kernel_gamma2_vec.v);
/* Compute the pairwise distance. */ /* Compute the pairwise distance. */
...@@ -1610,14 +1605,14 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, ...@@ -1610,14 +1605,14 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
} }
#endif #endif
vector v_pix, v_piy, v_piz, v_hi, v_hig2; vector v_hig2;
vector v_dx, v_dy, v_dz, v_r2; vector v_dx, v_dy, v_dz, v_r2;
/* Load 1 set of vectors from the particle cache. */ /* Load 1 set of vectors from the particle cache. */
v_pix.v = vec_load(&ci_cache->x[ci_cache_idx]); const vector v_pix = vector_load(&ci_cache->x[ci_cache_idx]);
v_piy.v = vec_load(&ci_cache->y[ci_cache_idx]); const vector v_piy = vector_load(&ci_cache->y[ci_cache_idx]);
v_piz.v = vec_load(&ci_cache->z[ci_cache_idx]); const vector v_piz = vector_load(&ci_cache->z[ci_cache_idx]);
v_hi.v = vec_load(&ci_cache->h[ci_cache_idx]); const vector v_hi = vector_load(&ci_cache->h[ci_cache_idx]);
v_hig2.v = vec_mul(vec_mul(v_hi.v, v_hi.v), kernel_gamma2_vec.v); v_hig2.v = vec_mul(vec_mul(v_hi.v, v_hi.v), kernel_gamma2_vec.v);
/* Compute the pairwise distance. */ /* Compute the pairwise distance. */
......
...@@ -431,6 +431,21 @@ __attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) { ...@@ -431,6 +431,21 @@ __attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) {
return x_inv; return x_inv;
} }
/**
 * @brief Loads a #vector from memory.
 *
 * Thin wrapper around the vec_load() macro that returns the result by value,
 * so a #vector can be declared and initialised in one (const) statement at
 * the call site instead of being declared first and filled in afterwards.
 *
 * @param x Pointer to the floats to load. The memory is only read, never
 * written, hence the pointer-to-const.
 * NOTE(review): vec_load() presumably maps to an aligned SIMD load of
 * VEC_SIZE floats, in which case x must be suitably aligned and point to at
 * least VEC_SIZE readable floats -- confirm against the vec_load definition.
 * @return The loaded #vector.
 */
__attribute__((always_inline)) INLINE vector vector_load(
    const float *const x) {
  vector temp;
  temp.v = vec_load(x);
  return temp;
}
#else #else
/* Needed for cache alignment. */ /* Needed for cache alignment. */
#define VEC_SIZE 8 #define VEC_SIZE 8
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment