From f7f6b528da73cdf3c58d951ad612f7ced793a0e4 Mon Sep 17 00:00:00 2001 From: James Willis <james.s.willis@durham.ac.uk> Date: Fri, 2 Dec 2016 16:31:30 +0000 Subject: [PATCH] Comments. --- src/runner_doiact_vec.c | 108 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 97 insertions(+), 11 deletions(-) diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 720cba2c83..a39947266f 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -28,13 +28,40 @@ #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE) #ifdef WITH_VECTORIZATION -__attribute__((always_inline)) INLINE static void calcRemInteractions(const struct cache *const cell_cache, float *r2q, float *dxq, float *dyq, float *dzq, float *mq, float *vxq, float *vyq, float *vzq, const int icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,vector *curlvySum, vector *curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz, int *icount_align) { +/** + * @brief Compute the vector remainder interactions from the secondary cache. + * + * @param (return) r2q C2 cache of the separation between two particles squared. + * @param (return) dxq C2 cache of the x separation between two particles. + * @param (return) dyq C2 cache of the y separation between two particles. + * @param (return) dzq C2 cache of the z separation between two particles. + * @param (return) mq C2 cache of the mass of particle pj. + * @param (return) vxq C2 cache of the x velocity of particle pj. + * @param (return) vyq C2 cache of the y velocity of particle pj. + * @param (return) vzq C2 cache of the z velocity of particle pj. + * @param icount Interaction count. + * @param (return) rhoSum #vector holding the cumulative sum of the density update on pi. + * @param (return) rho_dhSum #vector holding the cumulative sum of the density gradient update on pi. + * @param (return) wcountSum #vector holding the cumulative sum of the wcount update on pi. + * @param (return) wcount_dhSum #vector holding the cumulative sum of the wcount gradient update on pi. + * @param (return) div_vSum #vector holding the cumulative sum of the divergence update on pi. + * @param (return) curlvxSum #vector holding the cumulative sum of the curl of vx update on pi. + * @param (return) curlvySum #vector holding the cumulative sum of the curl of vy update on pi. + * @param (return) curlvzSum #vector holding the cumulative sum of the curl of vz update on pi. + * @param v_hi_inv #vector of 1/h for pi. + * @param v_vix #vector of x velocity of pi. + * @param v_viy #vector of y velocity of pi. + * @param v_viz #vector of z velocity of pi. + * @param (return) icount_align Interaction count after the remainder interactions have been performed, should be a multiple of the vector length. + */ +/__attribute__((always_inline)) INLINE static void calcRemInteractions(const struct cache *const cell_cache, float *r2q, float *dxq, float *dyq, float *dzq, float *mq, float *vxq, float *vyq, float *vzq, const int icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,vector *curlvySum, vector *curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz, int *icount_align) { #ifdef HAVE_AVX512_F KNL_MASK_16 knl_mask, knl_mask2; #endif vector int_mask, int_mask2; - + + /* Work out the number of remainder interactions and pad secondary cache. */ *icount_align = icount; int rem = icount % (NUM_VEC_PROC * VEC_SIZE); if (rem != 0) { @@ -51,7 +78,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(const stru int_mask.m = vec_setint1(0xFFFFFFFF); int_mask2.m = vec_setint1(0xFFFFFFFF); #endif - /* Pad secondary cache */ + /* Pad secondary cache so that there are no contributions in the interaction function. */ for(int i=icount; i<*icount_align; i++) { mq[i] = 0.f; r2q[i] = 1.f; @@ -92,8 +119,44 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(const stru } } +/** + * @brief Left-packs the values needed by an interaction into the secondary cache (Supports AVX, AVX2 and AVX512 instruction sets). + * + * @param mask Contains which particles need to interact. + * @param v_r2 #vector of the separation between two particles squared. + * @param v_dx #vector of the x separation between two particles. + * @param v_dy #vector of the y separation between two particles. + * @param v_dz #vector of the z separation between two particles. + * @param v_mj #vector of the mass of particle pj. + * @param v_vjx #vector of x velocity of pj. + * @param v_vjy #vector of y velocity of pj. + * @param v_vjz #vector of z velocity of pj. + * @param cell_cache #cache of all particles in the cell. + * @param r2q C2 cache of the separation between two particles squared. + * @param dxq C2 cache of the x separation between two particles. + * @param dyq C2 cache of the y separation between two particles. + * @param dzq C2 cache of the z separation between two particles. + * @param mq C2 cache of the mass of particle pj. + * @param vxq C2 cache of the x velocity of particle pj. + * @param vyq C2 cache of the y velocity of particle pj. + * @param vzq C2 cache of the z velocity of particle pj. + * @param icount Interaction count. + * @param rhoSum #vector holding the cumulative sum of the density update on pi. + * @param rho_dhSum #vector holding the cumulative sum of the density gradient update on pi. + * @param wcountSum #vector holding the cumulative sum of the wcount update on pi. + * @param wcount_dhSum #vector holding the cumulative sum of the wcount gradient update on pi. + * @param div_vSum #vector holding the cumulative sum of the divergence update on pi. + * @param curlvxSum #vector holding the cumulative sum of the curl of vx update on pi. + * @param curlvySum #vector holding the cumulative sum of the curl of vy update on pi. + * @param curlvzSum #vector holding the cumulative sum of the curl of vz update on pi. + * @param v_hi_inv #vector of 1/h for pi. + * @param v_vix #vector of x velocity of pi. + * @param v_viy #vector of y velocity of pi. + * @param v_viz #vector of z velocity of pi. + */ __attribute__((always_inline)) INLINE static void storeInteractions(const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, vector *v_dz, vector *v_mj, vector *v_vjx, vector *v_vjy, vector *v_vjz, const struct cache *const cell_cache, float *r2q, float *dxq, float *dyq, float *dzq, float *mq, float *vxq, float *vyq, float *vzq, int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,vector *curlvySum, vector *curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) { +/* Left-pack values needed into the secondary cache using the interaction mask. */ #if defined(HAVE_AVX2) || defined(HAVE_AVX512_F) int pack = 0; @@ -123,6 +186,7 @@ __attribute__((always_inline)) INLINE static void storeInteractions(const int ma (*icount) += pack; #else + /* Quicker to do it serially in AVX rather than use intrinsics. */ for(int bit_index = 0; bit_index<VEC_SIZE; bit_index++) { if (mask & (1 << bit_index)) { /* Add this interaction to the queue. */ @@ -138,17 +202,24 @@ __attribute__((always_inline)) INLINE static void storeInteractions(const int ma (*icount)++; } } + /* Flush the c2 cache if it has reached capacity. */ if(*icount >= (C2_CACHE_SIZE - (NUM_VEC_PROC * VEC_SIZE))) { int icount_align = *icount; + + /* Peform remainder interactions. */ calcRemInteractions(cell_cache, r2q, dxq, dyq, dzq, mq, vxq, vyq, vzq, *icount, rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, &icount_align); vector int_mask, int_mask2; int_mask.m = vec_setint1(0xFFFFFFFF); int_mask2.m = vec_setint1(0xFFFFFFFF); + + /* Perform interactions. */ for (int pjd = 0; pjd < icount_align; pjd+=(NUM_VEC_PROC * VEC_SIZE)) { runner_iact_nonsym_2_vec_density(&r2q[pjd], &dxq[pjd], &dyq[pjd], &dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, &vxq[pjd], &vyq[pjd], &vzq[pjd], &mq[pjd], rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0, 0); } + + /* Reset interaction count. */ *icount = 0; } @@ -157,7 +228,7 @@ __attribute__((always_inline)) INLINE static void storeInteractions(const int ma #endif /** - * @brief Compute the cell self-interaction (non-symmetric) vec. + * @brief Compute the cell self-interaction (non-symmetric) using vector intrinsics with one particle pi at a time. * * @param r The #runner. * @param c The #cell. @@ -173,6 +244,9 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(struct run struct part *restrict parts = c->parts; const int count = c->count; + + /* Get the particle cache from the runner and re-allocate + * the cache if it is not big enough for the cell. */ struct cache *restrict cell_cache = &r->par_cache; int icount = 0, icount_align = 0; @@ -196,6 +270,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(struct run cache_init(cell_cache,count); } + /* Read the particles from the cell and store them locally in the cache. */ cache_read_particles(c,cell_cache); /* Loop over the particles in the cell. */ @@ -211,7 +286,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(struct run const float hi = cell_cache->h[pid]; - /* Fill pi position vector. */ + /* Fill particle pi vectors. */ pix.v = vec_set1(cell_cache->x[pid]); piy.v = vec_set1(cell_cache->y[pid]); piz.v = vec_set1(cell_cache->z[pid]); @@ -223,8 +298,10 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(struct run const float hig2 = hi * hi * kernel_gamma2; v_hig2.v = vec_set1(hig2); + /* Reset cumulative sums of update vectors. */ vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum; + /* Get the inverse of hi. */ vector v_hi_inv; VEC_RECIPROCAL(v_hi.v, v_hi_inv.v); @@ -258,7 +335,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(struct run vector pjx2, pjy2, pjz2; vector pjvx2, pjvy2, pjvz2, mj2; - /* Find all of particle pi's interacions and store needed values in secondary cache.*/ + /* Find all of particle pi's interacions and store needed values in the secondary cache.*/ for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { /* Load 2 sets of vectors from the particle cache. */ @@ -314,21 +391,23 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(struct run vector v_doi_mask, v_doi_mask_check, v_doi_mask2, v_doi_mask2_check; int doi_mask2; + /* Form r2 > 0 mask and r2 < hig2 mask. */ v_doi_mask_check.v = vec_cmp_gt(v_r2.v,vec_setzero()); v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); + /* Form r2 > 0 mask and r2 < hig2 mask. */ v_doi_mask2_check.v = vec_cmp_gt(v_r2_2.v,vec_setzero()); v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); + /* Combine two masks and form integer mask. */ doi_mask = vec_cmp_result(vec_and(v_doi_mask.v, v_doi_mask_check.v)); doi_mask2 = vec_cmp_result(vec_and(v_doi_mask2.v, v_doi_mask2_check.v)); #endif - /* Hit or miss? */ + /* If there are any interactions left pack interaction values into c2 cache. */ if (doi_mask) { storeInteractions(doi_mask,pjd, &v_r2, &v_dx_tmp,&v_dy_tmp, &v_dz_tmp, &mj, &pjvx, &pjvy, &pjvz, cell_cache, &r2q[0], &dxq[0], &dyq[0], &dzq[0], &mq[0], &vxq[0], &vyq[0], &vzq[0], &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); } - /* Hit or miss? */ if (doi_mask2) { storeInteractions(doi_mask2,pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2,&v_dy_tmp2, &v_dz_tmp2, &mj2, &pjvx2, &pjvy2, &pjvz2, cell_cache, &r2q[0], &dxq[0], &dyq[0], &dzq[0], &mq[0], &vxq[0], &vyq[0], &vzq[0], &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); } @@ -337,7 +416,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(struct run /* Perform padded vector remainder interactions if any are present. */ calcRemInteractions(cell_cache, &r2q[0], &dxq[0], &dyq[0], &dzq[0], &mq[0], &vxq[0], &vyq[0], &vzq[0], icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, &icount_align); - /* Initialise masks to true incase remainder interactions have been performed. */ + /* Initialise masks to true in case remainder interactions have been performed. */ vector int_mask, int_mask2; #ifdef HAVE_AVX512_F KNL_MASK_16 knl_mask = 0xFFFF; @@ -378,7 +457,8 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(struct run } /** - * @brief Compute the cell self-interaction (non-symmetric) vec. + * @brief Compute the cell self-interaction (non-symmetric) using vector intrinsics with two particle pis at a time. + * CURRENTLY BROKEN DO NOT USE. * * @param r The #runner. * @param c The #cell. @@ -393,6 +473,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec_2(struct r struct part *restrict pi2; int count_align; + /* Create two secondary caches. */ int icount = 0, icount_align = 0; float r2q[C2_CACHE_SIZE] __attribute__((aligned(array_align))); float dxq[C2_CACHE_SIZE] __attribute__((aligned(array_align))); @@ -418,23 +499,28 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec_2(struct r //TIMER_TIC + /* TODO: Need to find two active particles, not just one. */ if (c->ti_end_min > ti_current) return; if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone"); struct part *restrict parts = c->parts; const int count = c->count; + + /* Get the particle cache from the runner and re-allocate + * the cache if it is not big enough for the cell. */ struct cache *restrict cell_cache = &r->par_cache; if(cell_cache->count < count) { cache_init(cell_cache,count); } + /* Read the particles from the cell and store them locally in the cache. */ cache_read_particles(c,&r->par_cache); /* Loop over the particles in the cell. */ for (int pid = 0; pid < count; pid+=2) { - /* Get a pointer to the ith particle. */ + /* Get a pointer to the ith particle and next i particle. */ pi = &parts[pid]; pi2 = &parts[pid + 1]; -- GitLab