Commit 9a0ff76d authored by James Willis's avatar James Willis
Browse files

Merge branch 'doself2-vectorisation' into dopair2-vectorisation

parents 33476206 5c9fd282
......@@ -63,6 +63,21 @@ struct cache {
/* Particle z velocity. */
float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
/* Particle density. */
float *restrict rho __attribute__((aligned(CACHE_ALIGN)));
/* Particle smoothing length gradient. */
float *restrict grad_h __attribute__((aligned(CACHE_ALIGN)));
/* Pressure over density squared. */
float *restrict pOrho2 __attribute__((aligned(CACHE_ALIGN)));
/* Balsara switch. */
float *restrict balsara __attribute__((aligned(CACHE_ALIGN)));
/* Particle sound speed. */
float *restrict soundspeed __attribute__((aligned(CACHE_ALIGN)));
/* Maximum distance of particles into neighbouring cell. */
float *restrict max_d __attribute__((aligned(CACHE_ALIGN)));
......@@ -97,6 +112,24 @@ struct c2_cache {
/* z velocity of particle pj. */
float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Density of particle pj. */
float rhoq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Smoothing length gradient of particle pj. */
float grad_hq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Pressure over density squared of particle pj. */
float pOrho2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Balsara switch of particle pj. */
float balsaraq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Sound speed of particle pj. */
float soundspeedq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
/* Inverse smoothing length of particle pj. */
float h_invq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
};
/**
......@@ -126,6 +159,11 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
free(c->vy);
free(c->vz);
free(c->h);
free(c->rho);
free(c->grad_h);
free(c->pOrho2);
free(c->balsara);
free(c->soundspeed);
free(c->max_d);
}
......@@ -138,6 +176,11 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->rho, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->grad_h, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->pOrho2, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->balsara, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->soundspeed, CACHE_ALIGN, sizeBytes);
if (error != 0)
error("Couldn't allocate cache, no. of particles: %d", (int)count);
......@@ -170,6 +213,12 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
ci_cache->vx[i] = ci->parts[i].v[0];
ci_cache->vy[i] = ci->parts[i].v[1];
ci_cache->vz[i] = ci->parts[i].v[2];
ci_cache->rho[i] = ci->parts[i].rho;
ci_cache->grad_h[i] = ci->parts[i].force.f;
ci_cache->pOrho2[i] = ci->parts[i].force.P_over_rho2;
ci_cache->balsara[i] = ci->parts[i].force.balsara;
ci_cache->soundspeed[i] = ci->parts[i].force.soundspeed;
}
#endif
......
This diff is collapsed.
......@@ -432,11 +432,12 @@ static const vector cubic_2_dwdx_const_c2 = FILL_VEC(-3.f);
static const vector cond = FILL_VEC(0.5f);
#endif
/*TODO: Comment kernels for each region */
/**
* @brief Computes the kernel function and its derivative for two particles
* using vectors.
*
* Return 0 if $u > \\gamma = H/h$
* using vectors. Does not return zero if $u > \\gamma = H/h$, should only
* be called if particles are known to interact.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
......@@ -473,7 +474,6 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
......@@ -517,9 +517,9 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
/**
* @brief Computes the kernel function and its derivative for two particles
* using interleaved vectors.
*
* Return 0 if $u > \\gamma = H/h$
* using interleaved vectors. Does not return zero if $u > \\gamma = H/h$,
* should only
* be called if particles are known to interact.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
......@@ -583,12 +583,10 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */
;
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
......@@ -651,9 +649,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
/**
* @brief Computes the kernel function for two particles
* using vectors.
*
* Return 0 if $u > \\gamma = H/h$
* using vectors. Does not return zero if $u > \\gamma = H/h$, should only
* be called if particles are known to interact.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
......@@ -676,12 +673,11 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector w2;
vector mask1, mask2;
vector mask_reg1, mask_reg2;
/* Form a mask for each part of the kernel. */
mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
......@@ -698,8 +694,8 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
w->v = vec_and(w->v, mask1.v);
w2.v = vec_and(w2.v, mask2.v);
w->v = vec_and(w->v, mask_reg1.v);
w2.v = vec_and(w2.v, mask_reg2.v);
/* Added both w and w2 together to form complete result. */
w->v = vec_add(w->v, w2.v);
......@@ -712,6 +708,66 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
}
/**
 * @brief Computes the kernel function derivative for two particles
 * using vectors. Does not return zero if $u > \\gamma = H/h$, should only
 * be called if particles are known to interact.
 *
 * Unlike kernel_eval_dWdx_force_vec() below, no final out-of-range mask is
 * applied here, so callers must guarantee $u < \\gamma$ for every lane.
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
 * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
 */
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
vector *u, vector *dw_dx) {
/* Go to the range [0,1[ from [0,H[ */
vector x;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
#ifdef WENDLAND_C2_KERNEL
/* The Wendland C2 derivative is a single polynomial over the whole support,
 * so no region masks are needed in this branch. */
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations */
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
/* Final bare multiply by x: the derivative polynomial has no constant
 * term (a vec_fma with a c4 constant would be used otherwise). */
dw_dx->v = vec_mul(dw_dx->v, x.v);
#elif defined(CUBIC_SPLINE_KERNEL)
/* The cubic spline is piecewise: evaluate both pieces for every lane and
 * blend them with comparison masks rather than branching. */
vector dw_dx2;
vector mask_reg1, mask_reg2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
 * using masks. */
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
/* Mask out unneeded values. */
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v);
/* Added both dwdx and dwdx2 together to form complete result. */
dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
#else
/* No other kernel shapes are vectorised here; fail the build explicitly. */
#error
#endif
/* Return everything */
/* Apply the kernel normalisation: presumably the dimensional constant and
 * the 1/h^(d+1) gamma factor, going by the vector names — confirm against
 * the scalar kernel definitions. */
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
}
/**
* @brief Computes the kernel function derivative for two particles
* using vectors.
......@@ -721,7 +777,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec(
vector *u, vector *dw_dx) {
/* Go to the range [0,1[ from [0,H[ */
......@@ -741,12 +797,11 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
#elif defined(CUBIC_SPLINE_KERNEL)
vector dw_dx2;
vector mask1, mask2;
vector mask_reg1, mask_reg2;
/* Form a mask for each part of the kernel. */
mask1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
......@@ -760,18 +815,112 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
/* Mask out unneeded values. */
dw_dx->v = vec_and(dw_dx->v, mask1.v);
dw_dx2.v = vec_and(dw_dx2.v, mask2.v);
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v);
/* Added both dwdx and dwdx2 together to form complete result. */
dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
#else
#error
#endif
/* Mask out result for particles that lie outside of the kernel function. */
vector mask;
mask.v = vec_cmp_lt(x.v, vec_set1(1.f));
dw_dx->v = vec_and(dw_dx->v, mask.v);
/* Return everything */
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
}
/**
 * @brief Computes the kernel function derivative for two particles
 * using interleaved vectors.
 *
 * Return 0 if $u > \\gamma = H/h$
 *
 * Processes two independent vectors of distance ratios per call (the *_2
 * variants) so the polynomial evaluations can be interleaved and pipelined.
 * Unlike kernel_eval_dWdx_vec(), lanes with $u \\ge \\gamma$ are explicitly
 * zeroed at the end, so out-of-range pairs are safe here.
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
 * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
 */
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
vector *u, vector *dw_dx, vector *u_2, vector *dw_dx_2) {
/* Go to the range [0,1[ from [0,H[ */
vector x, x_2;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
x_2.v = vec_mul(u_2->v, kernel_gamma_inv_vec.v);
#ifdef WENDLAND_C2_KERNEL
/* Single polynomial over the whole support; evaluate both input vectors in
 * lock-step so the FMA chains of the two interleave. */
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
dw_dx_2->v =
vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations */
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c3.v);
dw_dx->v = vec_mul(dw_dx->v, x.v);
dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v);
#elif defined(CUBIC_SPLINE_KERNEL)
/* Piecewise cubic spline: evaluate both pieces for both input vectors and
 * blend each with its own region masks. */
vector dw_dx2, dw_dx2_2;
vector mask_reg1, mask_reg2;
vector mask_reg1_2, mask_reg2_2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg1_2.v = vec_cmp_lt(x_2.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
mask_reg2_2.v = vec_cmp_gte(x_2.v, cond.v); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
 * using masks. */
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
dw_dx_2->v = vec_fma(cubic_1_dwdx_const_c0.v, x_2.v, cubic_1_dwdx_const_c1.v);
dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
dw_dx2_2.v = vec_fma(cubic_2_dwdx_const_c0.v, x_2.v, cubic_2_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v);
/* Mask out unneeded values. */
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
dw_dx_2->v = vec_and(dw_dx_2->v, mask_reg1_2.v);
dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v);
dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_2.v);
/* Added both dwdx and dwdx2 together to form complete result. */
dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
dw_dx_2->v = vec_add(dw_dx_2->v, dw_dx2_2.v);
#else
/* No other kernel shapes are vectorised here; fail the build explicitly. */
#error
#endif
/* Mask out result for particles that lie outside of the kernel function. */
/* x >= 1 means u >= gamma: zero those lanes so callers get dW/dx = 0 for
 * non-interacting pairs (this is what makes the "_force" variant safe). */
vector mask, mask_2;
mask.v = vec_cmp_lt(x.v, vec_set1(1.f));
mask_2.v = vec_cmp_lt(x_2.v, vec_set1(1.f));
dw_dx->v = vec_and(dw_dx->v, mask.v);
dw_dx_2->v = vec_and(dw_dx_2->v, mask_2.v);
/* Return everything */
/* Apply the kernel normalisation: presumably the dimensional constant and
 * the 1/h^(d+1) gamma factor, going by the vector names — confirm against
 * the scalar kernel definitions. */
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx_2->v = vec_mul(
dw_dx_2->v,
vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
}
#endif /* WITH_VECTORIZATION */
......
......@@ -1787,9 +1787,13 @@ void *runner_main(void *data) {
else if (t->subtype == task_subtype_gradient)
runner_doself1_gradient(r, ci);
#endif
else if (t->subtype == task_subtype_force)
else if (t->subtype == task_subtype_force) {
#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
runner_doself2_force_vec(r, ci);
#else
runner_doself2_force(r, ci);
else if (t->subtype == task_subtype_grav)
#endif
} else if (t->subtype == task_subtype_grav)
runner_doself_grav(r, ci, 1);
else if (t->subtype == task_subtype_external_grav)
runner_do_grav_external(r, ci, 1);
......
......@@ -2633,9 +2633,14 @@ void DOSUB_SELF2(struct runner *r, struct cell *ci, int gettimer) {
}
/* Otherwise, compute self-interaction. */
else
else {
#if (DOSELF2 == runner_doself2_force) && defined(WITH_VECTORIZATION) && \
defined(GADGET2_SPH)
runner_doself2_force_vec(r, ci);
#else
DOSELF2(r, ci);
#endif
}
if (gettimer) TIMER_TOC(TIMER_DOSUB_SELF);
}
......
This diff is collapsed.
......@@ -35,7 +35,7 @@
/* Function prototypes. */
void runner_doself1_density_vec(struct runner *r, struct cell *restrict c);
void runner_doself1_density_vec_2(struct runner *r, struct cell *restrict c);
void runner_doself2_force_vec(struct runner *r, struct cell *restrict c);
void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci,
struct cell *restrict cj);
......
......@@ -60,7 +60,9 @@
#define vec_dbl_set(a, b, c, d, e, f, g, h) \
_mm512_set_pd(h, g, f, e, d, c, b, a)
#define vec_add(a, b) _mm512_add_ps(a, b)
#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a)
#define vec_sub(a, b) _mm512_sub_ps(a, b)
#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b)
#define vec_mul(a, b) _mm512_mul_ps(a, b)
#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
#define vec_sqrt(a) _mm512_sqrt_ps(a)
......@@ -75,7 +77,15 @@
#define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
#define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
#define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
#define vec_cmp_result(a) a
#define vec_form_int_mask(a) a
#define vec_and(a, b) _mm512_and_ps(a, b)
#define vec_mask_and(a, b) a &b
#define vec_and_mask(a, mask) _mm512_maskz_expand_ps(mask, a)
#define vec_init_mask(mask) mask = 0xFFFF
#define vec_zero_mask(mask) mask = 0
#define vec_create_mask(mask, cond) mask = cond
#define vec_pad_mask(mask, pad) mask = mask >> (pad)
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
......@@ -116,10 +126,13 @@
for (int i = 0; i < VEC_SIZE; i++) b += a.f[i]; \
}
#endif
/* Calculates the number of set bits in the mask and adds the result to an int.
*/
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
pack += __builtin_popcount(mask);
/* Do nothing in the case of AVX-512 as there are already
* instructions for left-packing.*/
#define VEC_FORM_PACKED_MASK(mask, packed_mask)
/* Finds the horizontal maximum of vector b and returns a float. */
#define VEC_HMAX(a, b) a = _mm512_reduce_max_ps(b.v)
/* Performs a left-pack on a vector based upon a mask and returns the result. */
#define VEC_LEFT_PACK(a, mask, result) \
......@@ -142,7 +155,9 @@
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
#define vec_add(a, b) _mm256_add_ps(a, b)
#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b, mask.v))
#define vec_sub(a, b) _mm256_sub_ps(a, b)
#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask.v))
#define vec_mul(a, b) _mm256_mul_ps(a, b)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
......@@ -157,7 +172,15 @@
#define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#define vec_cmp_result(a) _mm256_movemask_ps(a)
#define vec_form_int_mask(a) _mm256_movemask_ps(a.v)
#define vec_and(a, b) _mm256_and_ps(a, b)
#define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v)
#define vec_and_mask(a, mask) vec_mask_and(a, mask)
#define vec_init_mask(mask) mask.m = vec_setint1(0xFFFFFFFF)
#define vec_create_mask(mask, cond) mask.v = cond
#define vec_zero_mask(mask) mask.v = vec_setzero()
#define vec_pad_mask(mask, pad) \
for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
......@@ -183,6 +206,13 @@
a.v = _mm256_hadd_ps(a.v, a.v); \
b += a.f[0] + a.f[4];
/* Performs a horizontal maximum on the vector and takes the maximum of the
* result with a float, b. */
#define VEC_HMAX(a, b) \
{ \
for (int k = 0; k < VEC_SIZE; k++) b = max(b, a.f[k]); \
}
/* Returns the lower 128-bits of the 256-bit vector. */
#define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
......@@ -201,19 +231,18 @@
/* Takes an integer mask and forms a left-packed integer vector
* containing indices of the set bits in the integer mask.
* Also returns the total number of bits set in the mask. */
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
#define VEC_FORM_PACKED_MASK(mask, packed_mask) \
{ \
unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101); \
expanded_mask *= 0xFF; \
unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask); \
__m128i bytevec = _mm_cvtsi64_si128(wanted_indices); \
v_mask = _mm256_cvtepu8_epi32(bytevec); \
pack += __builtin_popcount(mask); \
packed_mask.m = _mm256_cvtepu8_epi32(bytevec); \
}
/* Performs a left-pack on a vector based upon a mask and returns the result. */
#define VEC_LEFT_PACK(a, mask, result) \
vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask), result)
vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask.m), result)
#endif /* HAVE_AVX2 */
/* Create an FMA using vec_add and vec_mul if AVX2 is not present. */
......@@ -336,6 +365,13 @@ typedef union {
int i[VEC_SIZE];
} vector;
/* Define the mask type depending on the instruction set used. */
#ifdef HAVE_AVX512_F
typedef __mmask16 mask_t;
#else
typedef vector mask_t;
#endif
/**
* @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a
* Newton iteration to obtain the correct level of accuracy.
......
......@@ -22,7 +22,7 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(FFTW_LIBS
# List of programs and scripts to run in the test suite
TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetry \
testPair.sh testPairPerturbed.sh test27cells.sh test27cellsPerturbed.sh \
testParser.sh testSPHStep test125cells.sh testFFT \
testParser.sh testSPHStep test125cells.sh test125cellsPerturbed.sh testFFT \
testAdiabaticIndex testRiemannExact testRiemannTRRS testRiemannHLLC \
testMatrixInversion testThreadpool testDump testLogger \
testVoronoi1D testVoronoi2D testVoronoi3D
......@@ -92,6 +92,6 @@ testLogger_SOURCES = testLogger.c
# Files necessary for distribution
EXTRA_DIST = testReading.sh makeInput.py testPair.sh testPairPerturbed.sh \
test27cells.sh test27cellsPerturbed.sh testParser.sh \
test125cells.sh testParserInput.yaml difffloat.py \
test125cells.sh test125cellsPerturbed.sh testParserInput.yaml difffloat.py \
tolerance_125.dat tolerance_27_normal.dat tolerance_27_perturbed.dat \
tolerance_pair_normal.dat tolerance_pair_perturbed.dat
......@@ -368,7 +368,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
/* Perform vector interaction. */
#ifdef WITH_VECTORIZATION
vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec, mask, mask2;
vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec;
vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
curlvySum, curlvzSum;
......@@ -387,14 +387,10 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
viz_vec.v = vec_load(&vizq[0]);
hi_inv_vec = vec_reciprocal(hi_vec);
mask.m = vec_setint1(0xFFFFFFFF);
mask2.m = vec_setint1(0xFFFFFFFF);
#ifdef HAVE_AVX512_F
KNL_MASK_16 knl_mask, knl_mask2;
knl_mask = 0xFFFF;
knl_mask2 = 0xFFFF;
#endif
mask_t mask, mask2;
vec_init_mask(mask);
vec_init_mask(mask2);
const ticks vec_tic = getticks();
......@@ -404,12 +400,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count,
(vix_vec), (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]),
&(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, &wcountSum,
&wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum,
mask, mask2,
#ifdef HAVE_AVX512_F
knl_mask, knl_mask2);
#else
0, 0);
#endif
mask, mask2, 0);
}
VEC_HADD(rhoSum, piq[0]->rho);
......
......@@ -30,6 +30,19 @@
/* Local headers. */
#include "swift.h"
#if defined(WITH_VECTORIZATION)
#define DOSELF2 runner_doself2_force_vec
//#define DOPAIR2 runner_dopair2_force_vec
#define DOSELF2_NAME "runner_doself2_force_vec"
#define DOPAIR2_NAME "runner_dopair2_force"
#endif
#ifndef DOSELF2
#define DOSELF2 runner_doself2_force
#define DOSELF2_NAME "runner_doself2_force"
#define DOPAIR2_NAME "runner_dopair2_force"
#endif
enum velocity_field {
velocity_zero,
velocity_const,
......@@ -240,7 +253,7 @@ void reset_particles(struct cell *c, struct hydro_space *hs,
* @param press The type of pressure field.
*/
</