Commit 447bc7a1 authored by James Willis's avatar James Willis
Browse files

Added mask_t to left-packing with AVX2 and AVX-512 instruction sets.

parent 58e78fbd
......@@ -152,7 +152,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
* @param v_viz #vector of z velocity of pi.
*/
__attribute__((always_inline)) INLINE static void storeInteractions(
const short mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
vector *v_dz, const struct cache *const cell_cache, struct c2_cache *const int_cache,
int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum,
vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,
......@@ -162,34 +162,20 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
/* Left-pack values needed into the secondary cache using the interaction mask.
*/
#if defined(HAVE_AVX2) || defined(HAVE_AVX512_F)
int pack = 0;
#ifdef HAVE_AVX512_F
pack += __builtin_popcount(mask);
VEC_LEFT_PACK(v_r2->v, mask, &int_cache->r2q[*icount]);
VEC_LEFT_PACK(v_dx->v, mask, &int_cache->dxq[*icount]);
VEC_LEFT_PACK(v_dy->v, mask, &int_cache->dyq[*icount]);
VEC_LEFT_PACK(v_dz->v, mask, &int_cache->dzq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), mask, &int_cache->mq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), mask, &int_cache->vxq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), mask, &int_cache->vyq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), mask, &int_cache->vzq[*icount]);
#else
vector v_mask;
VEC_FORM_PACKED_MASK(mask, v_mask.m, pack);
VEC_LEFT_PACK(v_r2->v, v_mask.m, &int_cache->r2q[*icount]);
VEC_LEFT_PACK(v_dx->v, v_mask.m, &int_cache->dxq[*icount]);
VEC_LEFT_PACK(v_dy->v, v_mask.m, &int_cache->dyq[*icount]);
VEC_LEFT_PACK(v_dz->v, v_mask.m, &int_cache->dzq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), v_mask.m, &int_cache->mq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), v_mask.m, &int_cache->vxq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), v_mask.m, &int_cache->vyq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), v_mask.m, &int_cache->vzq[*icount]);
#endif /* HAVE_AVX512_F */
(*icount) += pack;
mask_t packed_mask;
VEC_FORM_PACKED_MASK(mask, packed_mask);
VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]);
VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]);
VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]);
VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask, &int_cache->mq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask, &int_cache->vxq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask, &int_cache->vyq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask, &int_cache->vzq[*icount]);
/* Increment interaction count by number of bits set in mask. */
(*icount) += __builtin_popcount(mask);
#else
/* Quicker to do it serially in AVX rather than use intrinsics. */
for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
......@@ -375,51 +361,31 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions(
/* Left-pack values needed into the secondary cache using the interaction mask.
*/
#if defined(HAVE_AVX2) || defined(HAVE_AVX512_F)
int pack = 0;
/* Invert hj. */
vector v_hj, v_hj_inv;
v_hj = vec_load(&cell_cache->h[pjd]);
v_hj_inv = vec_reciprocal(v_hj);
#ifdef HAVE_AVX512_F
pack += __builtin_popcount(mask);
VEC_LEFT_PACK(v_r2->v, mask, &int_cache->r2q[*icount]);
VEC_LEFT_PACK(v_dx->v, mask, &int_cache->dxq[*icount]);
VEC_LEFT_PACK(v_dy->v, mask, &int_cache->dyq[*icount]);
VEC_LEFT_PACK(v_dz->v, mask, &int_cache->dzq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), mask, &int_cache->mq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), mask, &int_cache->vxq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), mask, &int_cache->vyq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), mask, &int_cache->vzq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->rho[pjd]), mask, &int_cache->rhoq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->grad_h[pjd]), mask, &int_cache->grad_hq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->pOrho2[pjd]), mask, &int_cache->pOrho2q[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->balsara[pjd]), mask, &int_cache->balsaraq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->soundspeed[pjd]), mask, &int_cache->soundspeedq[*icount]);
VEC_LEFT_PACK(v_hj_inv->v, mask, &int_cache->h_invq[*icount]);
#else
vector v_mask;
VEC_FORM_PACKED_MASK(mask, v_mask.m, pack);
VEC_LEFT_PACK(v_r2->v, v_mask.m, &int_cache->r2q[*icount]);
VEC_LEFT_PACK(v_dx->v, v_mask.m, &int_cache->dxq[*icount]);
VEC_LEFT_PACK(v_dy->v, v_mask.m, &int_cache->dyq[*icount]);
VEC_LEFT_PACK(v_dz->v, v_mask.m, &int_cache->dzq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), v_mask.m, &int_cache->mq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), v_mask.m, &int_cache->vxq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), v_mask.m, &int_cache->vyq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), v_mask.m, &int_cache->vzq[*icount]);
VEC_LEFT_PACK(v_rhoj->v, v_mask.m, &int_cache->rhoq[*icount]);
VEC_LEFT_PACK(v_grad_hj->v, v_mask.m, &int_cache->grad_hq[*icount]);
VEC_LEFT_PACK(v_pOrhoj2->v, v_mask.m, &int_cache->pOrho2q[*icount]);
VEC_LEFT_PACK(v_balsara_j->v, v_mask.m, &int_cache->balsaraq[*icount]);
VEC_LEFT_PACK(v_cj->v, v_mask.m, &int_cache->soundspeedq[*icount]);
VEC_LEFT_PACK(v_hj_inv->v, v_mask.m, &int_cache->h_invq[*icount]);
#endif /* HAVE_AVX512_F */
(*icount) += pack;
mask_t packed_mask;
VEC_FORM_PACKED_MASK(mask, packed_mask);
VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]);
VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]);
VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]);
VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask, &int_cache->mq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask, &int_cache->vxq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask, &int_cache->vyq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask, &int_cache->vzq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->rho[pjd]), packed_mask, &int_cache->rhoq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->grad_h[pjd]), packed_mask, &int_cache->grad_hq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->pOrho2[pjd]), packed_mask, &int_cache->pOrho2q[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->balsara[pjd]), packed_mask, &int_cache->balsaraq[*icount]);
VEC_LEFT_PACK(vec_load(&cell_cache->soundspeed[pjd]), packed_mask, &int_cache->soundspeedq[*icount]);
VEC_LEFT_PACK(v_hj_inv->v, packed_mask, &int_cache->h_invq[*icount]);
/* Increment interaction count by number of bits set in mask. */
(*icount) += __builtin_popcount(mask);
#else
/* Quicker to do it serially in AVX rather than use intrinsics. */
for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
......
......@@ -126,10 +126,10 @@
for (int i = 0; i < VEC_SIZE; i++) b += a.f[i]; \
}
#endif
/* Calculates the number of set bits in the mask and adds the result to an int.
*/
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
pack += __builtin_popcount(mask);
/* Do nothing in the case of AVX-512 as there are already
* instructions for left-packing.*/
#define VEC_FORM_PACKED_MASK(mask, packed_mask)
/* Finds the horizontal maximum of vector b and returns a float. */
#define VEC_HMAX(a, b) a = _mm512_reduce_max_ps(b.v)
......@@ -230,14 +230,13 @@
/* Takes an integer mask and forms a left-packed integer vector
* containing indices of the set bits in the integer mask.
* Also returns the total number of bits set in the mask. */
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
#define VEC_FORM_PACKED_MASK(mask, packed_mask) \
{ \
unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101); \
expanded_mask *= 0xFF; \
unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask); \
__m128i bytevec = _mm_cvtsi64_si128(wanted_indices); \
v_mask = _mm256_cvtepu8_epi32(bytevec); \
pack += __builtin_popcount(mask); \
packed_mask.m = _mm256_cvtepu8_epi32(bytevec); \
}
/* Performs a left-pack on a vector based upon a mask and returns the result. */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment