GitLab has been upgraded to version 13. Please report any issues, and enjoy!

Commit ecdaae35 authored by Matthieu Schaller
Browse files

Code formatting

parent f01a4e0c
......@@ -154,8 +154,8 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
#if defined(GADGET2_SPH)
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma vector aligned
#endif
......@@ -367,7 +367,8 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
}
/* Pad cache with fake particles that exist outside the cell so will not interact.*/
/* Pad cache with fake particles that exist outside the cell so will not
* interact.*/
float fake_pix = 2.0f * ci_cache->x[ci->count - 1];
for (int i = ci->count - first_pi_align;
i < ci->count - first_pi_align + VEC_SIZE; i++)
......@@ -389,7 +390,8 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
cj_cache->vz[i] = cj->parts[idx].v[2];
}
/* Pad cache with fake particles that exist outside the cell so will not interact.*/
/* Pad cache with fake particles that exist outside the cell so will not
* interact.*/
float fake_pjx = 2.0f * cj_cache->x[last_pj_align];
for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++)
cj_cache->x[i] = fake_pjx;
......
......@@ -381,12 +381,14 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
* (non-symmetric vectorized version).
*/
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_1_vec_density(
vector *r2, vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix,
vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
vector mask, int knlMask) {
runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
vector hi_inv, vector vix, vector viy,
vector viz, float *Vjx, float *Vjy, float *Vjz,
float *Mj, vector *rhoSum, vector *rho_dhSum,
vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum,
vector *curlvySum, vector *curlvzSum,
vector mask, int knlMask) {
vector r, ri, xi, wi, wi_dx;
vector mj;
......
......@@ -327,7 +327,8 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
/* First two terms of the polynomial ... */
float dw_dx = ((float)kernel_degree * coeffs[0] * x) + (float)(kernel_degree - 1) * coeffs[1];
float dw_dx = ((float)kernel_degree * coeffs[0] * x) +
(float)(kernel_degree - 1) * coeffs[1];
/* ... and the rest of them */
for (int k = 2; k < kernel_degree; k++) {
......@@ -396,7 +397,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
}
/* Define constant vectors for the Wendland C2 and Cubic Spline kernel coefficients. */
/* Define constant vectors for the Wendland C2 and Cubic Spline kernel
* coefficients. */
#ifdef WENDLAND_C2_KERNEL
static const vector wendland_const_c0 = FILL_VEC(4.f);
static const vector wendland_const_c1 = FILL_VEC(-15.f);
......@@ -469,37 +471,39 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
vector mask_reg1, mask_reg2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
/* Work out w for both regions of the kernel and combine the results together using masks. */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
dw_dx->v = cubic_1_const_c0.v;
dw_dx2.v = cubic_2_const_c0.v;
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
w->v = vec_and(w->v,mask_reg1.v);
w2.v = vec_and(w2.v,mask_reg2.v);
dw_dx->v = vec_and(dw_dx->v,mask_reg1.v);
dw_dx2.v = vec_and(dw_dx2.v,mask_reg2.v);
w->v = vec_and(w->v, mask_reg1.v);
w2.v = vec_and(w2.v, mask_reg2.v);
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v);
/* Added both w and w2 together to form complete result. */
w->v = vec_add(w->v,w2.v);
dw_dx->v = vec_add(dw_dx->v,dw_dx2.v);
w->v = vec_add(w->v, w2.v);
dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
#else
#error
#endif
......@@ -556,7 +560,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */
w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */
w2->v = vec_mul(x2.v, w2->v); /* wendland_const_c4 is zero. */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
......@@ -579,12 +583,15 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
mask_reg1_v2.v = vec_cmp_lt(x2.v,cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
mask_reg2_v2.v = vec_cmp_gte(x2.v,cond.v); /* 0.5 < x < 1 */;
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */
;
/* Work out w for both regions of the kernel and combine the results together using masks. */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
......@@ -595,13 +602,13 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
dw_dx2->v = cubic_1_const_c0.v;
dw_dx_2.v = cubic_2_const_c0.v;
dw_dx2_2.v = cubic_2_const_c0.v;
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v);
dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v);
w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
w2->v = vec_mul(x2.v, w2->v); /* cubic_1_const_c2 is zero. */
w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c2.v);
w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c2.v);
......@@ -616,20 +623,20 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
w->v = vec_and(w->v,mask_reg1.v);
w2->v = vec_and(w2->v,mask_reg1_v2.v);
w_2.v = vec_and(w_2.v,mask_reg2.v);
w2_2.v = vec_and(w2_2.v,mask_reg2_v2.v);
dw_dx->v = vec_and(dw_dx->v,mask_reg1.v);
dw_dx2->v = vec_and(dw_dx2->v,mask_reg1_v2.v);
dw_dx_2.v = vec_and(dw_dx_2.v,mask_reg2.v);
dw_dx2_2.v = vec_and(dw_dx2_2.v,mask_reg2_v2.v);
w->v = vec_and(w->v, mask_reg1.v);
w2->v = vec_and(w2->v, mask_reg1_v2.v);
w_2.v = vec_and(w_2.v, mask_reg2.v);
w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v);
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v);
dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v);
dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v);
/* Added both w and w2 together to form complete result. */
w->v = vec_add(w->v,w_2.v);
w2->v = vec_add(w2->v,w2_2.v);
dw_dx->v = vec_add(dw_dx->v,dw_dx_2.v);
dw_dx2->v = vec_add(dw_dx2->v,dw_dx2_2.v);
w->v = vec_add(w->v, w_2.v);
w2->v = vec_add(w2->v, w2_2.v);
dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v);
dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v);
/* Return everything */
w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
......@@ -651,8 +658,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_W_vec(
vector *u, vector *w) {
__attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
vector *w) {
/* Go to the range [0,1[ from [0,H[ */
vector x;
......@@ -672,28 +679,30 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(
vector mask1, mask2;
/* Form a mask for each part of the kernel. */
mask1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
mask2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
/* Work out w for both regions of the kernel and combine the results together using masks. */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
/* Calculate the polynomial interleaving vector operations. */
w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero */
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
w->v = vec_and(w->v,mask1.v);
w2.v = vec_and(w2.v,mask2.v);
w->v = vec_and(w->v, mask1.v);
w2.v = vec_and(w2.v, mask2.v);
/* Added both w and w2 together to form complete result. */
w->v = vec_add(w->v,w2.v);
w->v = vec_add(w->v, w2.v);
#else
#error
#endif
......@@ -721,39 +730,41 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
#ifdef WENDLAND_C2_KERNEL
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(wendland_dwdx_const_c0.v,x.v,wendland_dwdx_const_c1.v);
dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations */
dw_dx->v = vec_fma(dw_dx->v,x.v,wendland_dwdx_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v,x.v,wendland_dwdx_const_c3.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
dw_dx->v = vec_mul(dw_dx->v,x.v);
dw_dx->v = vec_mul(dw_dx->v, x.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector dw_dx2;
vector mask1, mask2;
/* Form a mask for each part of the kernel. */
mask1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
mask2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
/* Work out w for both regions of the kernel and combine the results together using masks. */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v,x.v,cubic_1_dwdx_const_c1.v);
dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v,x.v,cubic_2_dwdx_const_c1.v);
dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_mul(dw_dx->v,x.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx2.v = vec_fma(dw_dx2.v,x.v,cubic_2_dwdx_const_c2.v);
dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
/* Mask out unneeded values. */
dw_dx->v = vec_and(dw_dx->v,mask1.v);
dw_dx2.v = vec_and(dw_dx2.v,mask2.v);
dw_dx->v = vec_and(dw_dx->v, mask1.v);
dw_dx2.v = vec_and(dw_dx2.v, mask2.v);
/* Added both dwdx and dwdx2 together to form complete result. */
dw_dx->v = vec_add(dw_dx->v,dw_dx2.v);
dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
#else
#error
#endif
......
......@@ -3191,7 +3191,9 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts,
}
/**
* @brief Determine which version of DOPAIR1 needs to be called depending on MPI, vectorisation and orientation of the cells or whether DOPAIR1 needs to be called at all.
* @brief Determine which version of DOPAIR1 needs to be called depending on
* MPI, vectorisation and orientation of the cells or whether DOPAIR1 needs to
* be called at all.
*
* @param r #runner
* @param ci #cell ci
......@@ -3224,12 +3226,13 @@ void DOPAIR1_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) {
if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
error("Trying to interact unsorted cells.");
#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) && (DOPAIR1_BRANCH == runner_dopair1_density_branch)
if(!sort_is_corner(sid))
runner_dopair1_density_vec(r, ci, cj);
else
#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) && \
(DOPAIR1_BRANCH == runner_dopair1_density_branch)
if (!sort_is_corner(sid))
runner_dopair1_density_vec(r, ci, cj);
else
DOPAIR1(r, ci, cj);
#else
DOPAIR1(r, ci, cj);
#else
DOPAIR1(r, ci, cj);
#endif
}
......@@ -299,7 +299,8 @@ __attribute__((always_inline)) INLINE static void populate_max_d_no_cache(
int first_pi = 0, last_pj = cj->count - 1;
/* Find the first active particle in ci to interact with any particle in cj. */
/* Find the first active particle in ci to interact with any particle in cj.
*/
/* Populate max_di with distances. */
int active_id = ci->count - 1;
for (int k = ci->count - 1; k >= 0; k--) {
......@@ -309,20 +310,19 @@ __attribute__((always_inline)) INLINE static void populate_max_d_no_cache(
max_di[k] = d;
/* If the particle is out of range set the index to
/* If the particle is out of range set the index to
* the last active particle within range. */
if (d < dj_min) {
first_pi = active_id;
break;
}
else {
if(part_is_active(p,e)) active_id = k;
} else {
if (part_is_active(p, e)) active_id = k;
}
}
/* Find the maximum distance of pi particles into cj.*/
for (int k = first_pi; k < ci->count; k++) {
max_di[k] = fmaxf(max_di[k - 1],max_di[k]);
max_di[k] = fmaxf(max_di[k - 1], max_di[k]);
}
/* Find the last particle in cj to interact with any particle in ci. */
......@@ -332,17 +332,16 @@ __attribute__((always_inline)) INLINE static void populate_max_d_no_cache(
p = &parts_j[sort_j[k].i];
h = p->h;
d = sort_j[k].d - h * kernel_gamma - dx_max - rshift;
max_dj[k] = d;
/* If the particle is out of range set the index to
/* If the particle is out of range set the index to
* the last active particle within range. */
if (d > di_max) {
last_pj = active_id;
break;
}
else {
if(part_is_active(p,e)) active_id = k;
} else {
if (part_is_active(p, e)) active_id = k;
}
}
......@@ -1021,28 +1020,28 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
const float dx_max = (ci->dx_max + cj->dx_max);
/* Check if any particles are active and return if there are not. */
int numActive = 0;
for (int pid = count_i - 1;
pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
int numActive = 0;
for (int pid = count_i - 1;
pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
struct part *restrict pi = &parts_i[sort_i[pid].i];
if (part_is_active(pi, e)) {
numActive++;
break;
}
}
}
if(!numActive) {
if (!numActive) {
for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
pjd++) {
pjd++) {
struct part *restrict pj = &parts_j[sort_j[pjd].i];
if (part_is_active(pj, e)) {
numActive++;
break;
}
}
}
}
}
if(numActive == 0) return;
if (numActive == 0) return;
/* Get both particle caches from the runner and re-allocate
* them if they are not big enough for the cells. */
......@@ -1067,7 +1066,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
/* Also find the first pi that interacts with any particle in cj and the last
* pj that interacts with any particle in ci. */
populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di,
max_dj, &first_pi, &last_pj, e);
max_dj, &first_pi, &last_pj, e);
/* Find the maximum index into cj that is required by a particle in ci. */
/* Find the maximum index into ci that is required by a particle in cj. */
......@@ -1102,8 +1101,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
int first_pi_align = first_pi;
int last_pj_align = last_pj;
cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i,
sort_j, shift, &first_pi_align,
&last_pj_align, 1);
sort_j, shift, &first_pi_align,
&last_pj_align, 1);
/* Get the number of particles read into the ci cache. */
int ci_cache_count = count_i - first_pi_align;
......@@ -1147,7 +1146,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
/* Reset cumulative sums of update vectors. */
vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
curlvySum, curlvzSum;
curlvySum, curlvzSum;
/* Get the inverse of hi. */
vector v_hi_inv;
......@@ -1184,8 +1183,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
vector v_dx, v_dy, v_dz, v_r2;
#ifdef SWIFT_DEBUG_CHECKS
if (cj_cache_idx%VEC_SIZE != 0 || cj_cache_idx < 0) {
error("Unaligned read!!! cj_cache_idx=%d",cj_cache_idx);
if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0) {
error("Unaligned read!!! cj_cache_idx=%d", cj_cache_idx);
}
#endif
......@@ -1223,7 +1222,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
#ifdef HAVE_AVX512_F
knl_mask);
#else
0);
0);
#endif
} /* loop over the parts in cj. */
......@@ -1281,7 +1280,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
/* Reset cumulative sums of update vectors. */
vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
curlvySum, curlvzSum;
curlvySum, curlvzSum;
/* Get the inverse of hj. */
vector v_hj_inv;
......@@ -1304,17 +1303,18 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
/* Pad the exit iteration align so cache reads are aligned. */
int rem = exit_iteration_align % VEC_SIZE;
if ( exit_iteration_align < VEC_SIZE) {
if (exit_iteration_align < VEC_SIZE) {
exit_iteration_align = 0;
}
else exit_iteration_align -= rem;
} else
exit_iteration_align -= rem;
/* Loop over the parts in ci. */
for (int ci_cache_idx = exit_iteration_align; ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) {
for (int ci_cache_idx = exit_iteration_align;
ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) {
#ifdef SWIFT_DEBUG_CHECKS
if (ci_cache_idx%VEC_SIZE != 0 || ci_cache_idx < 0) {
error("Unaligned read!!! ci_cache_idx=%d",ci_cache_idx);
if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0) {
error("Unaligned read!!! ci_cache_idx=%d", ci_cache_idx);
}
#endif
......@@ -1354,12 +1354,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
#ifdef HAVE_AVX512_F
knl_mask);
#else
0);
0);
#endif
} /* loop over the parts in ci. */
/* Perform horizontal adds on vector sums and store result in particle pj. */
/* Perform horizontal adds on vector sums and store result in particle pj.
*/
VEC_HADD(rhoSum, pj->rho);
VEC_HADD(rho_dhSum, pj->density.rho_dh);
VEC_HADD(wcountSum, pj->density.wcount);
......@@ -1370,7 +1371,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
VEC_HADD(curlvzSum, pj->density.rot_v[2]);
} /* loop over the parts in cj. */
TIMER_TOC(timer_dopair_density);
}
......
......@@ -553,8 +553,7 @@ int main(int argc, char *argv[]) {
dump_particle_fields(outputFileName, main_cell, cells);
/* Check serial results against the vectorised results. */
if (check_results(main_cell->parts, vec_parts, main_cell->count,
threshold))
if (check_results(main_cell->parts, vec_parts, main_cell->count, threshold))
message("Differences found...");
/* Output timing */
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment