diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index f7c628d05344b90a8ba9348550423988710e353c..2fa6051892ef0d43c7b11f392d2650475375c71d 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -152,7 +152,7 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( * @param v_viz #vector of z velocity of pi. */ __attribute__((always_inline)) INLINE static void storeInteractions( - const short mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, + const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, vector *v_dz, const struct cache *const cell_cache, struct c2_cache *const int_cache, int *icount, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, @@ -162,34 +162,20 @@ __attribute__((always_inline)) INLINE static void storeInteractions( /* Left-pack values needed into the secondary cache using the interaction mask. */ #if defined(HAVE_AVX2) || defined(HAVE_AVX512_F) - int pack = 0; - -#ifdef HAVE_AVX512_F - pack += __builtin_popcount(mask); - VEC_LEFT_PACK(v_r2->v, mask, &int_cache->r2q[*icount]); - VEC_LEFT_PACK(v_dx->v, mask, &int_cache->dxq[*icount]); - VEC_LEFT_PACK(v_dy->v, mask, &int_cache->dyq[*icount]); - VEC_LEFT_PACK(v_dz->v, mask, &int_cache->dzq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), mask, &int_cache->mq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), mask, &int_cache->vxq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), mask, &int_cache->vyq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), mask, &int_cache->vzq[*icount]); -#else - vector v_mask; - VEC_FORM_PACKED_MASK(mask, v_mask.m, pack); - - VEC_LEFT_PACK(v_r2->v, v_mask.m, &int_cache->r2q[*icount]); - VEC_LEFT_PACK(v_dx->v, v_mask.m, &int_cache->dxq[*icount]); - VEC_LEFT_PACK(v_dy->v, v_mask.m, &int_cache->dyq[*icount]); - VEC_LEFT_PACK(v_dz->v, v_mask.m, &int_cache->dzq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), v_mask.m, &int_cache->mq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), v_mask.m, &int_cache->vxq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), v_mask.m, &int_cache->vyq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), v_mask.m, &int_cache->vzq[*icount]); - -#endif /* HAVE_AVX512_F */ - - (*icount) += pack; + mask_t packed_mask; + VEC_FORM_PACKED_MASK(mask, packed_mask); + + VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]); + VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]); + VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]); + VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask, &int_cache->mq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask, &int_cache->vxq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask, &int_cache->vyq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask, &int_cache->vzq[*icount]); + + /* Increment interaction count by number of bits set in mask. */ + (*icount) += __builtin_popcount(mask); #else /* Quicker to do it serially in AVX rather than use intrinsics. */ for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) { @@ -375,51 +361,31 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions( /* Left-pack values needed into the secondary cache using the interaction mask. */ #if defined(HAVE_AVX2) || defined(HAVE_AVX512_F) - int pack = 0; - /* Invert hj. */ vector v_hj, v_hj_inv; v_hj = vec_load(&cell_cache->h[pjd]); v_hj_inv = vec_reciprocal(v_hj); -#ifdef HAVE_AVX512_F - pack += __builtin_popcount(mask); - VEC_LEFT_PACK(v_r2->v, mask, &int_cache->r2q[*icount]); - VEC_LEFT_PACK(v_dx->v, mask, &int_cache->dxq[*icount]); - VEC_LEFT_PACK(v_dy->v, mask, &int_cache->dyq[*icount]); - VEC_LEFT_PACK(v_dz->v, mask, &int_cache->dzq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), mask, &int_cache->mq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), mask, &int_cache->vxq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), mask, &int_cache->vyq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), mask, &int_cache->vzq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->rho[pjd]), mask, &int_cache->rhoq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->grad_h[pjd]), mask, &int_cache->grad_hq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->pOrho2[pjd]), mask, &int_cache->pOrho2q[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->balsara[pjd]), mask, &int_cache->balsaraq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->soundspeed[pjd]), mask, &int_cache->soundspeedq[*icount]); - VEC_LEFT_PACK(v_hj_inv->v, mask, &int_cache->h_invq[*icount]); -#else - vector v_mask; - VEC_FORM_PACKED_MASK(mask, v_mask.m, pack); - - VEC_LEFT_PACK(v_r2->v, v_mask.m, &int_cache->r2q[*icount]); - VEC_LEFT_PACK(v_dx->v, v_mask.m, &int_cache->dxq[*icount]); - VEC_LEFT_PACK(v_dy->v, v_mask.m, &int_cache->dyq[*icount]); - VEC_LEFT_PACK(v_dz->v, v_mask.m, &int_cache->dzq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), v_mask.m, &int_cache->mq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), v_mask.m, &int_cache->vxq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), v_mask.m, &int_cache->vyq[*icount]); - VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), v_mask.m, &int_cache->vzq[*icount]); - VEC_LEFT_PACK(v_rhoj->v, v_mask.m, &int_cache->rhoq[*icount]); - VEC_LEFT_PACK(v_grad_hj->v, v_mask.m, &int_cache->grad_hq[*icount]); - VEC_LEFT_PACK(v_pOrhoj2->v, v_mask.m, &int_cache->pOrho2q[*icount]); - VEC_LEFT_PACK(v_balsara_j->v, v_mask.m, &int_cache->balsaraq[*icount]); - VEC_LEFT_PACK(v_cj->v, v_mask.m, &int_cache->soundspeedq[*icount]); - VEC_LEFT_PACK(v_hj_inv->v, v_mask.m, &int_cache->h_invq[*icount]); - -#endif /* HAVE_AVX512_F */ - - (*icount) += pack; + mask_t packed_mask; + VEC_FORM_PACKED_MASK(mask, packed_mask); + + VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]); + VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]); + VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]); + VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask, &int_cache->mq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask, &int_cache->vxq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask, &int_cache->vyq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask, &int_cache->vzq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->rho[pjd]), packed_mask, &int_cache->rhoq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->grad_h[pjd]), packed_mask, &int_cache->grad_hq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->pOrho2[pjd]), packed_mask, &int_cache->pOrho2q[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->balsara[pjd]), packed_mask, &int_cache->balsaraq[*icount]); + VEC_LEFT_PACK(vec_load(&cell_cache->soundspeed[pjd]), packed_mask, &int_cache->soundspeedq[*icount]); + VEC_LEFT_PACK(v_hj_inv->v, packed_mask, &int_cache->h_invq[*icount]); + + /* Increment interaction count by number of bits set in mask. */ + (*icount) += __builtin_popcount(mask); #else /* Quicker to do it serially in AVX rather than use intrinsics. */ for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) { diff --git a/src/vector.h b/src/vector.h index 15dd0d1d83ba24345ab92b2c6c6522327d18731e..d50bbd80e8dd23f2be06afcc4dca1545bed1e582 100644 --- a/src/vector.h +++ b/src/vector.h @@ -126,10 +126,10 @@ for (int i = 0; i < VEC_SIZE; i++) b += a.f[i]; \ } #endif -/* Calculates the number of set bits in the mask and adds the result to an int. - */ -#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \ - pack += __builtin_popcount(mask); + +/* Do nothing in the case of AVX-512 as there are already + * instructions for left-packing.*/ +#define VEC_FORM_PACKED_MASK(mask, packed_mask) /* Finds the horizontal maximum of vector b and returns a float. */ #define VEC_HMAX(a, b) a = _mm512_reduce_max_ps(b.v) @@ -230,14 +230,13 @@ /* Takes an integer mask and forms a left-packed integer vector * containing indices of the set bits in the integer mask. * Also returns the total number of bits set in the mask. */ -#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \ +#define VEC_FORM_PACKED_MASK(mask, packed_mask) \ { \ unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101); \ expanded_mask *= 0xFF; \ unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask); \ __m128i bytevec = _mm_cvtsi64_si128(wanted_indices); \ - v_mask = _mm256_cvtepu8_epi32(bytevec); \ - pack += __builtin_popcount(mask); \ + packed_mask.m = _mm256_cvtepu8_epi32(bytevec); \ } /* Performs a left-pack on a vector based upon a mask and returns the result. */