diff --git a/src/cache.h b/src/cache.h index 49b654990cfa494dd2256d3489efd5495c9660ac..e7c7703ed36242b4be0baeeb30a195893eead7f0 100644 --- a/src/cache.h +++ b/src/cache.h @@ -154,8 +154,8 @@ __attribute__((always_inline)) INLINE void cache_read_particles( #if defined(GADGET2_SPH) - /* Shift the particles positions to a local frame so single precision can be - * used instead of double precision. */ +/* Shift the particles positions to a local frame so single precision can be + * used instead of double precision. */ #if defined(WITH_VECTORIZATION) && defined(__ICC) #pragma vector aligned #endif @@ -367,7 +367,8 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2]; } - /* Pad cache with fake particles that exist outside the cell so will not interact.*/ + /* Pad cache with fake particles that exist outside the cell so will not + * interact.*/ float fake_pix = 2.0f * ci_cache->x[ci->count - 1]; for (int i = ci->count - first_pi_align; i < ci->count - first_pi_align + VEC_SIZE; i++) @@ -389,7 +390,8 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( cj_cache->vz[i] = cj->parts[idx].v[2]; } - /* Pad cache with fake particles that exist outside the cell so will not interact.*/ + /* Pad cache with fake particles that exist outside the cell so will not + * interact.*/ float fake_pjx = 2.0f * cj_cache->x[last_pj_align]; for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) cj_cache->x[i] = fake_pjx; diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index fb19fb2c2acfafd34f187ba5a2c28e61b9da8514..4ce9902c138c3ac8ff6edb1e66e36c91aeb668ab 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -381,12 +381,14 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, * (non-symmetric vectorized version). */ __attribute__((always_inline)) INLINE static void -runner_iact_nonsym_1_vec_density( - vector *r2, vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix, - vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, - vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, - vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector mask, int knlMask) { +runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, + vector hi_inv, vector vix, vector viy, + vector viz, float *Vjx, float *Vjy, float *Vjz, + float *Mj, vector *rhoSum, vector *rho_dhSum, + vector *wcountSum, vector *wcount_dhSum, + vector *div_vSum, vector *curlvxSum, + vector *curlvySum, vector *curlvzSum, + vector mask, int knlMask) { vector r, ri, xi, wi, wi_dx; vector mj; diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index 585655940cf6f0587fd7dd1efae1bf546476b1da..f634a59d7ee769951e6560d46a92053c144cc766 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -327,7 +327,8 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx( const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; /* First two terms of the polynomial ... */ - float dw_dx = ((float)kernel_degree * coeffs[0] * x) + (float)(kernel_degree - 1) * coeffs[1]; + float dw_dx = ((float)kernel_degree * coeffs[0] * x) + + (float)(kernel_degree - 1) * coeffs[1]; /* ... and the rest of them */ for (int k = 2; k < kernel_degree; k++) { @@ -396,7 +397,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v; } -/* Define constant vectors for the Wendland C2 and Cubic Spline kernel coefficients. */ +/* Define constant vectors for the Wendland C2 and Cubic Spline kernel + * coefficients. */ #ifdef WENDLAND_C2_KERNEL static const vector wendland_const_c0 = FILL_VEC(4.f); static const vector wendland_const_c1 = FILL_VEC(-15.f); @@ -469,37 +471,39 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( vector mask_reg1, mask_reg2; /* Form a mask for each part of the kernel. */ - mask_reg1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */ - mask_reg2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */; + mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ + mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ + ; - /* Work out w for both regions of the kernel and combine the results together using masks. */ + /* Work out w for both regions of the kernel and combine the results together + * using masks. */ /* Init the iteration for Horner's scheme. */ w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v); w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v); dw_dx->v = cubic_1_const_c0.v; dw_dx2.v = cubic_2_const_c0.v; - + /* Calculate the polynomial interleaving vector operations. */ dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v); w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */ w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v); - + dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v); w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v); w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v); - + /* Mask out unneeded values. */ - w->v = vec_and(w->v,mask_reg1.v); - w2.v = vec_and(w2.v,mask_reg2.v); - dw_dx->v = vec_and(dw_dx->v,mask_reg1.v); - dw_dx2.v = vec_and(dw_dx2.v,mask_reg2.v); + w->v = vec_and(w->v, mask_reg1.v); + w2.v = vec_and(w2.v, mask_reg2.v); + dw_dx->v = vec_and(dw_dx->v, mask_reg1.v); + dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v); /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v,w2.v); - dw_dx->v = vec_add(dw_dx->v,dw_dx2.v); + w->v = vec_add(w->v, w2.v); + dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); #else #error #endif @@ -556,7 +560,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); - w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */ + w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */ w2->v = vec_mul(x2.v, w2->v); /* wendland_const_c4 is zero. */ dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); @@ -579,12 +583,15 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2; /* Form a mask for each part of the kernel. */ - mask_reg1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */ - mask_reg1_v2.v = vec_cmp_lt(x2.v,cond.v); /* 0 < x < 0.5 */ - mask_reg2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */; - mask_reg2_v2.v = vec_cmp_gte(x2.v,cond.v); /* 0.5 < x < 1 */; + mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ + mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */ + mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ + ; + mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */ + ; - /* Work out w for both regions of the kernel and combine the results together using masks. */ + /* Work out w for both regions of the kernel and combine the results together + * using masks. */ /* Init the iteration for Horner's scheme. */ w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v); @@ -595,13 +602,13 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( dw_dx2->v = cubic_1_const_c0.v; dw_dx_2.v = cubic_2_const_c0.v; dw_dx2_2.v = cubic_2_const_c0.v; - + /* Calculate the polynomial interleaving vector operations. */ dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v); dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v); - w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */ + w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */ w2->v = vec_mul(x2.v, w2->v); /* cubic_1_const_c2 is zero. */ w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c2.v); w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c2.v); @@ -616,20 +623,20 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v); /* Mask out unneeded values. */ - w->v = vec_and(w->v,mask_reg1.v); - w2->v = vec_and(w2->v,mask_reg1_v2.v); - w_2.v = vec_and(w_2.v,mask_reg2.v); - w2_2.v = vec_and(w2_2.v,mask_reg2_v2.v); - dw_dx->v = vec_and(dw_dx->v,mask_reg1.v); - dw_dx2->v = vec_and(dw_dx2->v,mask_reg1_v2.v); - dw_dx_2.v = vec_and(dw_dx_2.v,mask_reg2.v); - dw_dx2_2.v = vec_and(dw_dx2_2.v,mask_reg2_v2.v); + w->v = vec_and(w->v, mask_reg1.v); + w2->v = vec_and(w2->v, mask_reg1_v2.v); + w_2.v = vec_and(w_2.v, mask_reg2.v); + w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v); + dw_dx->v = vec_and(dw_dx->v, mask_reg1.v); + dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v); + dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v); + dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v); /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v,w_2.v); - w2->v = vec_add(w2->v,w2_2.v); - dw_dx->v = vec_add(dw_dx->v,dw_dx_2.v); - dw_dx2->v = vec_add(dw_dx2->v,dw_dx2_2.v); + w->v = vec_add(w->v, w_2.v); + w2->v = vec_add(w2->v, w2_2.v); + dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v); + dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v); /* Return everything */ w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; @@ -651,8 +658,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( * @param u The ratio of the distance to the smoothing length $u = x/h$. * @param w (return) The value of the kernel function $W(x,h)$. */ -__attribute__((always_inline)) INLINE static void kernel_eval_W_vec( - vector *u, vector *w) { +__attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, + vector *w) { /* Go to the range [0,1[ from [0,H[ */ vector x; @@ -672,28 +679,30 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec( vector mask1, mask2; /* Form a mask for each part of the kernel. */ - mask1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */ - mask2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */; + mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ + mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ + ; - /* Work out w for both regions of the kernel and combine the results together using masks. */ + /* Work out w for both regions of the kernel and combine the results together + * using masks. */ /* Init the iteration for Horner's scheme. */ w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v); w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v); - + /* Calculate the polynomial interleaving vector operations. */ w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero */ w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v); - + w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v); w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v); - + /* Mask out unneeded values. */ - w->v = vec_and(w->v,mask1.v); - w2.v = vec_and(w2.v,mask2.v); + w->v = vec_and(w->v, mask1.v); + w2.v = vec_and(w2.v, mask2.v); /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v,w2.v); + w->v = vec_add(w->v, w2.v); #else #error #endif @@ -721,39 +730,41 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec( #ifdef WENDLAND_C2_KERNEL /* Init the iteration for Horner's scheme. */ - dw_dx->v = vec_fma(wendland_dwdx_const_c0.v,x.v,wendland_dwdx_const_c1.v); + dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v); /* Calculate the polynomial interleaving vector operations */ - dw_dx->v = vec_fma(dw_dx->v,x.v,wendland_dwdx_const_c2.v); + dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v); - dw_dx->v = vec_fma(dw_dx->v,x.v,wendland_dwdx_const_c3.v); + dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v); - dw_dx->v = vec_mul(dw_dx->v,x.v); + dw_dx->v = vec_mul(dw_dx->v, x.v); #elif defined(CUBIC_SPLINE_KERNEL) vector dw_dx2; vector mask1, mask2; /* Form a mask for each part of the kernel. */ - mask1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */ - mask2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */; + mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ + mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ + ; - /* Work out w for both regions of the kernel and combine the results together using masks. */ + /* Work out w for both regions of the kernel and combine the results together + * using masks. */ /* Init the iteration for Horner's scheme. */ - dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v,x.v,cubic_1_dwdx_const_c1.v); - dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v,x.v,cubic_2_dwdx_const_c1.v); - + dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v); + dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v); + /* Calculate the polynomial interleaving vector operations. */ - dw_dx->v = vec_mul(dw_dx->v,x.v); /* cubic_1_dwdx_const_c2 is zero. */ - dw_dx2.v = vec_fma(dw_dx2.v,x.v,cubic_2_dwdx_const_c2.v); - + dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */ + dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v); + /* Mask out unneeded values. */ - dw_dx->v = vec_and(dw_dx->v,mask1.v); - dw_dx2.v = vec_and(dw_dx2.v,mask2.v); + dw_dx->v = vec_and(dw_dx->v, mask1.v); + dw_dx2.v = vec_and(dw_dx2.v, mask2.v); /* Added both dwdx and dwdx2 together to form complete result. */ - dw_dx->v = vec_add(dw_dx->v,dw_dx2.v); + dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); #else #error #endif diff --git a/src/runner_doiact.h b/src/runner_doiact.h index b91a949e0e0910f61bc0b8d5fd8a283dcb24835d..ecd360b9d9898803c479b996f73709bad069e41b 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -3191,7 +3191,9 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, } /** - * @brief Determine which version of DOPAIR1 needs to be called depending on MPI, vectorisation and orientation of the cells or whether DOPAIR1 needs to be called at all. + * @brief Determine which version of DOPAIR1 needs to be called depending on + * MPI, vectorisation and orientation of the cells or whether DOPAIR1 needs to + * be called at all. * * @param r #runner * @param ci #cell ci @@ -3224,12 +3226,13 @@ void DOPAIR1_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) { if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells."); -#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) && (DOPAIR1_BRANCH == runner_dopair1_density_branch) - if(!sort_is_corner(sid)) - runner_dopair1_density_vec(r, ci, cj); - else +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) && \ + (DOPAIR1_BRANCH == runner_dopair1_density_branch) + if (!sort_is_corner(sid)) + runner_dopair1_density_vec(r, ci, cj); + else + DOPAIR1(r, ci, cj); +#else DOPAIR1(r, ci, cj); -#else - DOPAIR1(r, ci, cj); #endif } diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 19ed3247f7075e90460d63cbc7a72d828e12d533..4148b2693b3f457711fcc8f5732850e126bb5e8d 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -299,7 +299,8 @@ __attribute__((always_inline)) INLINE static void populate_max_d_no_cache( int first_pi = 0, last_pj = cj->count - 1; - /* Find the first active particle in ci to interact with any particle in cj. */ + /* Find the first active particle in ci to interact with any particle in cj. + */ /* Populate max_di with distances. */ int active_id = ci->count - 1; for (int k = ci->count - 1; k >= 0; k--) { @@ -309,20 +310,19 @@ __attribute__((always_inline)) INLINE static void populate_max_d_no_cache( max_di[k] = d; - /* If the particle is out of range set the index to + /* If the particle is out of range set the index to * the last active particle within range. */ if (d < dj_min) { first_pi = active_id; break; - } - else { - if(part_is_active(p,e)) active_id = k; + } else { + if (part_is_active(p, e)) active_id = k; } } /* Find the maximum distance of pi particles into cj.*/ for (int k = first_pi; k < ci->count; k++) { - max_di[k] = fmaxf(max_di[k - 1],max_di[k]); + max_di[k] = fmaxf(max_di[k - 1], max_di[k]); } /* Find the last particle in cj to interact with any particle in ci. */ @@ -332,17 +332,16 @@ __attribute__((always_inline)) INLINE static void populate_max_d_no_cache( p = &parts_j[sort_j[k].i]; h = p->h; d = sort_j[k].d - h * kernel_gamma - dx_max - rshift; - + max_dj[k] = d; - - /* If the particle is out of range set the index to + + /* If the particle is out of range set the index to * the last active particle within range. */ if (d > di_max) { last_pj = active_id; break; - } - else { - if(part_is_active(p,e)) active_id = k; + } else { + if (part_is_active(p, e)) active_id = k; } } @@ -1021,28 +1020,28 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, const float dx_max = (ci->dx_max + cj->dx_max); /* Check if any particles are active and return if there are not. */ - int numActive = 0; - for (int pid = count_i - 1; - pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) { + int numActive = 0; + for (int pid = count_i - 1; + pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) { struct part *restrict pi = &parts_i[sort_i[pid].i]; if (part_is_active(pi, e)) { numActive++; break; - } + } } - if(!numActive) { + if (!numActive) { for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; - pjd++) { + pjd++) { struct part *restrict pj = &parts_j[sort_j[pjd].i]; if (part_is_active(pj, e)) { numActive++; break; - } - } + } + } } - if(numActive == 0) return; + if (numActive == 0) return; /* Get both particle caches from the runner and re-allocate * them if they are not big enough for the cells. */ @@ -1067,7 +1066,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Also find the first pi that interacts with any particle in cj and the last * pj that interacts with any particle in ci. */ populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di, - max_dj, &first_pi, &last_pj, e); + max_dj, &first_pi, &last_pj, e); /* Find the maximum index into cj that is required by a particle in ci. */ /* Find the maximum index into ci that is required by a particle in cj. */ @@ -1102,8 +1101,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, int first_pi_align = first_pi; int last_pj_align = last_pj; cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i, - sort_j, shift, &first_pi_align, - &last_pj_align, 1); + sort_j, shift, &first_pi_align, + &last_pj_align, 1); /* Get the number of particles read into the ci cache. */ int ci_cache_count = count_i - first_pi_align; @@ -1147,7 +1146,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Reset cumulative sums of update vectors. */ vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; + curlvySum, curlvzSum; /* Get the inverse of hi. */ vector v_hi_inv; @@ -1184,8 +1183,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, vector v_dx, v_dy, v_dz, v_r2; #ifdef SWIFT_DEBUG_CHECKS - if (cj_cache_idx%VEC_SIZE != 0 || cj_cache_idx < 0) { - error("Unaligned read!!! cj_cache_idx=%d",cj_cache_idx); + if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0) { + error("Unaligned read!!! cj_cache_idx=%d", cj_cache_idx); } #endif @@ -1223,7 +1222,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #ifdef HAVE_AVX512_F knl_mask); #else - 0); + 0); #endif } /* loop over the parts in cj. */ @@ -1281,7 +1280,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Reset cumulative sums of update vectors. */ vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; + curlvySum, curlvzSum; /* Get the inverse of hj. */ vector v_hj_inv; @@ -1304,17 +1303,18 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Pad the exit iteration align so cache reads are aligned. */ int rem = exit_iteration_align % VEC_SIZE; - if ( exit_iteration_align < VEC_SIZE) { + if (exit_iteration_align < VEC_SIZE) { exit_iteration_align = 0; - } - else exit_iteration_align -= rem; + } else + exit_iteration_align -= rem; /* Loop over the parts in ci. */ - for (int ci_cache_idx = exit_iteration_align; ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) { + for (int ci_cache_idx = exit_iteration_align; + ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) { #ifdef SWIFT_DEBUG_CHECKS - if (ci_cache_idx%VEC_SIZE != 0 || ci_cache_idx < 0) { - error("Unaligned read!!! ci_cache_idx=%d",ci_cache_idx); + if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0) { + error("Unaligned read!!! ci_cache_idx=%d", ci_cache_idx); } #endif @@ -1354,12 +1354,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #ifdef HAVE_AVX512_F knl_mask); #else - 0); + 0); #endif } /* loop over the parts in ci. */ - /* Perform horizontal adds on vector sums and store result in particle pj. */ + /* Perform horizontal adds on vector sums and store result in particle pj. + */ VEC_HADD(rhoSum, pj->rho); VEC_HADD(rho_dhSum, pj->density.rho_dh); VEC_HADD(wcountSum, pj->density.wcount); @@ -1370,7 +1371,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, VEC_HADD(curlvzSum, pj->density.rot_v[2]); } /* loop over the parts in cj. */ - + TIMER_TOC(timer_dopair_density); } diff --git a/tests/test27cells.c b/tests/test27cells.c index 30767fd5038e5df3f1f8b1a717a3e54c728c351c..bd827b68e90ea5f4e9d5577612e6cecda2edf83a 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -553,8 +553,7 @@ int main(int argc, char *argv[]) { dump_particle_fields(outputFileName, main_cell, cells); /* Check serial results against the vectorised results. */ - if (check_results(main_cell->parts, vec_parts, main_cell->count, - threshold)) + if (check_results(main_cell->parts, vec_parts, main_cell->count, threshold)) message("Differences found..."); /* Output timing */