From bebc572ac181bbb17eab76ee9404a4be4106e7bf Mon Sep 17 00:00:00 2001 From: James Willis <james.s.willis@durham.ac.uk> Date: Tue, 24 Oct 2017 11:02:18 +0100 Subject: [PATCH] Created an inline function to perform vector loads. --- src/hydro/Gadget2/hydro_iact.h | 144 +++++++++++++++------------------ src/runner_doiact_vec.c | 61 +++++++------- src/vector.h | 15 ++++ 3 files changed, 109 insertions(+), 111 deletions(-) diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 45ee89ceb1..aaeb23a55d 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -170,17 +170,15 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, mask_t mask) { vector r, ri, ui, wi, wi_dx; - vector mj; vector dvx, dvy, dvz; - vector vjx, vjy, vjz; vector dvdr; vector curlvrx, curlvry, curlvrz; /* Fill the vectors. */ - mj.v = vec_load(Mj); - vjx.v = vec_load(Vjx); - vjy.v = vec_load(Vjy); - vjz.v = vec_load(Vjz); + const vector mj = vector_load(Mj); + const vector vjx = vector_load(Vjx); + const vector vjy = vector_load(Vjy); + const vector vjz = vector_load(Vjz); /* Get the radius and inverse radius. */ ri = vec_reciprocal_sqrt(*r2); @@ -245,38 +243,34 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, vector *curlvySum, vector *curlvzSum, mask_t mask, mask_t mask2, short mask_cond) { - vector r, ri, r2, ui, wi, wi_dx; - vector mj; - vector dx, dy, dz, dvx, dvy, dvz; - vector vjx, vjy, vjz; + vector r, ri, ui, wi, wi_dx; + vector dvx, dvy, dvz; vector dvdr; vector curlvrx, curlvry, curlvrz; - vector r_2, ri2, r2_2, ui2, wi2, wi_dx2; - vector mj2; - vector dx2, dy2, dz2, dvx2, dvy2, dvz2; - vector vjx2, vjy2, vjz2; + vector r_2, ri2, ui2, wi2, wi_dx2; + vector dvx2, dvy2, dvz2; vector dvdr2; vector curlvrx2, curlvry2, curlvrz2; /* Fill the vectors. */ - mj.v = vec_load(Mj); - mj2.v = vec_load(&Mj[VEC_SIZE]); - vjx.v = vec_load(Vjx); - vjx2.v = vec_load(&Vjx[VEC_SIZE]); - vjy.v = vec_load(Vjy); - vjy2.v = vec_load(&Vjy[VEC_SIZE]); - vjz.v = vec_load(Vjz); - vjz2.v = vec_load(&Vjz[VEC_SIZE]); - dx.v = vec_load(Dx); - dx2.v = vec_load(&Dx[VEC_SIZE]); - dy.v = vec_load(Dy); - dy2.v = vec_load(&Dy[VEC_SIZE]); - dz.v = vec_load(Dz); - dz2.v = vec_load(&Dz[VEC_SIZE]); + const vector mj = vector_load(Mj); + const vector mj2 = vector_load(&Mj[VEC_SIZE]); + const vector vjx = vector_load(Vjx); + const vector vjx2 = vector_load(&Vjx[VEC_SIZE]); + const vector vjy = vector_load(Vjy); + const vector vjy2 = vector_load(&Vjy[VEC_SIZE]); + const vector vjz = vector_load(Vjz); + const vector vjz2 = vector_load(&Vjz[VEC_SIZE]); + const vector dx = vector_load(Dx); + const vector dx2 = vector_load(&Dx[VEC_SIZE]); + const vector dy = vector_load(Dy); + const vector dy2 = vector_load(&Dy[VEC_SIZE]); + const vector dz = vector_load(Dz); + const vector dz2 = vector_load(&Dz[VEC_SIZE]); /* Get the radius and inverse radius. */ - r2.v = vec_load(R2); - r2_2.v = vec_load(&R2[VEC_SIZE]); + const vector r2 = vector_load(R2); + const vector r2_2 = vector_load(&R2[VEC_SIZE]); ri = vec_reciprocal_sqrt(r2); ri2 = vec_reciprocal_sqrt(r2_2); r.v = vec_mul(r2.v, ri.v); @@ -592,8 +586,7 @@ runner_iact_nonsym_1_vec_force( #ifdef WITH_VECTORIZATION vector r, ri; - vector vjx, vjy, vjz, dvx, dvy, dvz; - vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj; + vector dvx, dvy, dvz; vector xi, xj; vector hid_inv, hjd_inv; vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; @@ -604,16 +597,15 @@ runner_iact_nonsym_1_vec_force( vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; /* Fill vectors. */ - vjx.v = vec_load(Vjx); - vjy.v = vec_load(Vjy); - vjz.v = vec_load(Vjz); - mj.v = vec_load(Mj); - - pjrho.v = vec_load(Pjrho); - grad_hj.v = vec_load(Grad_hj); - pjPOrho2.v = vec_load(PjPOrho2); - balsara_j.v = vec_load(Balsara_j); - cj.v = vec_load(Cj); + const vector vjx = vector_load(Vjx); + const vector vjy = vector_load(Vjy); + const vector vjz = vector_load(Vjz); + const vector mj = vector_load(Mj); + const vector pjrho = vector_load(Pjrho); + const vector grad_hj = vector_load(Grad_hj); + const vector pjPOrho2 = vector_load(PjPOrho2); + const vector balsara_j = vector_load(Balsara_j); + const vector cj = vector_load(Cj); fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ @@ -720,10 +712,8 @@ runner_iact_nonsym_2_vec_force( #ifdef WITH_VECTORIZATION - vector r, r2, ri; - vector dx, dy, dz, dvx, dvy, dvz; - vector vjx, vjy, vjz; - vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv; + vector r, ri; + vector dvx, dvy, dvz; vector ui, uj; vector hid_inv, hjd_inv; vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; @@ -733,10 +723,8 @@ runner_iact_nonsym_2_vec_force( vector omega_ij, mu_ij, fac_mu, balsara; vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; - vector r_2, r2_2, ri_2; - vector dx_2, dy_2, dz_2, dvx_2, dvy_2, dvz_2; - vector vjx_2, vjy_2, vjz_2; - vector pjrho_2, grad_hj_2, pjPOrho2_2, balsara_j_2, cj_2, mj_2, hj_inv_2; + vector r_2, ri_2; + vector dvx_2, dvy_2, dvz_2; vector ui_2, uj_2; vector hjd_inv_2; vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2; @@ -747,42 +735,42 @@ runner_iact_nonsym_2_vec_force( vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2; /* Fill vectors. */ - mj.v = vec_load(Mj); - mj_2.v = vec_load(&Mj[VEC_SIZE]); - vjx.v = vec_load(Vjx); - vjx_2.v = vec_load(&Vjx[VEC_SIZE]); - vjy.v = vec_load(Vjy); - vjy_2.v = vec_load(&Vjy[VEC_SIZE]); - vjz.v = vec_load(Vjz); - vjz_2.v = vec_load(&Vjz[VEC_SIZE]); - dx.v = vec_load(Dx); - dx_2.v = vec_load(&Dx[VEC_SIZE]); - dy.v = vec_load(Dy); - dy_2.v = vec_load(&Dy[VEC_SIZE]); - dz.v = vec_load(Dz); - dz_2.v = vec_load(&Dz[VEC_SIZE]); + const vector mj = vector_load(Mj); + const vector mj_2 = vector_load(&Mj[VEC_SIZE]); + const vector vjx = vector_load(Vjx); + const vector vjx_2 = vector_load(&Vjx[VEC_SIZE]); + const vector vjy = vector_load(Vjy); + const vector vjy_2 = vector_load(&Vjy[VEC_SIZE]); + const vector vjz = vector_load(Vjz); + const vector vjz_2 = vector_load(&Vjz[VEC_SIZE]); + const vector dx = vector_load(Dx); + const vector dx_2 = vector_load(&Dx[VEC_SIZE]); + const vector dy = vector_load(Dy); + const vector dy_2 = vector_load(&Dy[VEC_SIZE]); + const vector dz = vector_load(Dz); + const vector dz_2 = vector_load(&Dz[VEC_SIZE]); /* Get the radius and inverse radius. */ - r2.v = vec_load(R2); - r2_2.v = vec_load(&R2[VEC_SIZE]); + const vector r2 = vector_load(R2); + const vector r2_2 = vector_load(&R2[VEC_SIZE]); ri = vec_reciprocal_sqrt(r2); ri_2 = vec_reciprocal_sqrt(r2_2); r.v = vec_mul(r2.v, ri.v); r_2.v = vec_mul(r2_2.v, ri_2.v); /* Get remaining properties. */ - pjrho.v = vec_load(Pjrho); - pjrho_2.v = vec_load(&Pjrho[VEC_SIZE]); - grad_hj.v = vec_load(Grad_hj); - grad_hj_2.v = vec_load(&Grad_hj[VEC_SIZE]); - pjPOrho2.v = vec_load(PjPOrho2); - pjPOrho2_2.v = vec_load(&PjPOrho2[VEC_SIZE]); - balsara_j.v = vec_load(Balsara_j); - balsara_j_2.v = vec_load(&Balsara_j[VEC_SIZE]); - cj.v = vec_load(Cj); - cj_2.v = vec_load(&Cj[VEC_SIZE]); - hj_inv.v = vec_load(Hj_inv); - hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]); + const vector pjrho = vector_load(Pjrho); + const vector pjrho_2 = vector_load(&Pjrho[VEC_SIZE]); + const vector grad_hj = vector_load(Grad_hj); + const vector grad_hj_2 = vector_load(&Grad_hj[VEC_SIZE]); + const vector pjPOrho2 = vector_load(PjPOrho2); + const vector pjPOrho2_2 = vector_load(&PjPOrho2[VEC_SIZE]); + const vector balsara_j = vector_load(Balsara_j); + const vector balsara_j_2 = vector_load(&Balsara_j[VEC_SIZE]); + const vector cj = vector_load(Cj); + const vector cj_2 = vector_load(&Cj[VEC_SIZE]); + const vector hj_inv = vector_load(Hj_inv); + const vector hj_inv_2 = vector_load(&Hj_inv[VEC_SIZE]); fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index c09c8bc319..3ffc59d55b 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -615,21 +615,18 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( } } - vector v_pjx, v_pjy, v_pjz; - vector v_pjx2, v_pjy2, v_pjz2; - /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { /* Load 2 sets of vectors from the particle cache. */ - v_pjx.v = vec_load(&cell_cache->x[pjd]); - v_pjy.v = vec_load(&cell_cache->y[pjd]); - v_pjz.v = vec_load(&cell_cache->z[pjd]); + const vector v_pjx = vector_load(&cell_cache->x[pjd]); + const vector v_pjy = vector_load(&cell_cache->y[pjd]); + const vector v_pjz = vector_load(&cell_cache->z[pjd]); - v_pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); - v_pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); - v_pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); + const vector v_pjx2 = vector_load(&cell_cache->x[pjd + VEC_SIZE]); + const vector v_pjy2 = vector_load(&cell_cache->y[pjd + VEC_SIZE]); + const vector v_pjz2 = vector_load(&cell_cache->z[pjd + VEC_SIZE]); /* Compute the pairwise distance. */ vector v_dx, v_dy, v_dz; @@ -848,11 +845,11 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { /* Load 1 set of vectors from the particle cache. */ - vector v_pjx, v_pjy, v_pjz, hj, hjg2; - v_pjx.v = vec_load(&cell_cache->x[pjd]); - v_pjy.v = vec_load(&cell_cache->y[pjd]); - v_pjz.v = vec_load(&cell_cache->z[pjd]); - hj.v = vec_load(&cell_cache->h[pjd]); + vector hjg2; + const vector v_pjx = vector_load(&cell_cache->x[pjd]); + const vector v_pjy = vector_load(&cell_cache->y[pjd]); + const vector v_pjz = vector_load(&cell_cache->z[pjd]); + const vector hj = vector_load(&cell_cache->h[pjd]); hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); /* Compute the pairwise distance. */ @@ -1094,7 +1091,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Get the cache index to the jth particle. */ const int cj_cache_idx = pjd; - vector v_pjx, v_pjy, v_pjz; vector v_dx, v_dy, v_dz, v_r2; #ifdef SWIFT_DEBUG_CHECKS @@ -1106,9 +1102,9 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #endif /* Load 2 sets of vectors from the particle cache. */ - v_pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); - v_pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); - v_pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); + const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]); + const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]); + const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]); /* Compute the pairwise distance. */ v_dx.v = vec_sub(v_pix.v, v_pjx.v); @@ -1224,13 +1220,12 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, } #endif - vector v_pix, v_piy, v_piz; vector v_dx, v_dy, v_dz, v_r2; /* Load 2 sets of vectors from the particle cache. */ - v_pix.v = vec_load(&ci_cache->x[ci_cache_idx]); - v_piy.v = vec_load(&ci_cache->y[ci_cache_idx]); - v_piz.v = vec_load(&ci_cache->z[ci_cache_idx]); + const vector v_pix = vector_load(&ci_cache->x[ci_cache_idx]); + const vector v_piy = vector_load(&ci_cache->y[ci_cache_idx]); + const vector v_piz = vector_load(&ci_cache->z[ci_cache_idx]); /* Compute the pairwise distance. */ v_dx.v = vec_sub(v_pjx.v, v_pix.v); @@ -1469,8 +1464,8 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, /* Get the cache index to the jth particle. */ const int cj_cache_idx = pjd; - vector v_dx, v_dy, v_dz; - vector v_pjx, v_pjy, v_pjz, v_hj, v_hjg2, v_r2; + vector v_dx, v_dy, v_dz, v_r2; + vector v_hjg2; #ifdef SWIFT_DEBUG_CHECKS if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 || @@ -1481,10 +1476,10 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, #endif /* Load 2 sets of vectors from the particle cache. */ - v_pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); - v_pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); - v_pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); - v_hj.v = vec_load(&cj_cache->h[cj_cache_idx]); + const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]); + const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]); + const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]); + const vector v_hj = vector_load(&cj_cache->h[cj_cache_idx]); v_hjg2.v = vec_mul(vec_mul(v_hj.v, v_hj.v), kernel_gamma2_vec.v); /* Compute the pairwise distance. */ @@ -1610,14 +1605,14 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, } #endif - vector v_pix, v_piy, v_piz, v_hi, v_hig2; + vector v_hig2; vector v_dx, v_dy, v_dz, v_r2; /* Load 2 sets of vectors from the particle cache. */ - v_pix.v = vec_load(&ci_cache->x[ci_cache_idx]); - v_piy.v = vec_load(&ci_cache->y[ci_cache_idx]); - v_piz.v = vec_load(&ci_cache->z[ci_cache_idx]); - v_hi.v = vec_load(&ci_cache->h[ci_cache_idx]); + const vector v_pix = vector_load(&ci_cache->x[ci_cache_idx]); + const vector v_piy = vector_load(&ci_cache->y[ci_cache_idx]); + const vector v_piz = vector_load(&ci_cache->z[ci_cache_idx]); + const vector v_hi = vector_load(&ci_cache->h[ci_cache_idx]); v_hig2.v = vec_mul(vec_mul(v_hi.v, v_hi.v), kernel_gamma2_vec.v); /* Compute the pairwise distance. */ diff --git a/src/vector.h b/src/vector.h index 91ce508754..6e9ab5a417 100644 --- a/src/vector.h +++ b/src/vector.h @@ -431,6 +431,21 @@ __attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) { return x_inv; } +/** + * @brief Loads a vector from memory. + * + * @param *x memory to load from. + * @return temp loaded #vector. + */ +__attribute__((always_inline)) INLINE vector vector_load(float *const x) { + + vector temp; + + temp.v = vec_load(x); + + return temp; +} + #else /* Needed for cache alignment. */ #define VEC_SIZE 8 -- GitLab