Commit 35069140 authored by James Willis's avatar James Willis
Browse files

Added more vector intrinsics for AVX512 and support for left-packing vectors...

Added more vector intrinsics for AVX512 and support for left-packing vectors in multiple instruction sets.
parent a121ffd4
...@@ -39,6 +39,16 @@ ...@@ -39,6 +39,16 @@
#define VEC_MACRO(elcount, type) \ #define VEC_MACRO(elcount, type) \
__attribute__((vector_size((elcount) * sizeof(type)))) type __attribute__((vector_size((elcount) * sizeof(type)))) type
/* Define vector reciprocals. vec_rcp and vec_rsqrt do not have the level of
* accuracy we need, so an extra two terms are added. */
#define VEC_RECIPROCAL(x, x_inv) \
x_inv = vec_rcp(x); \
x_inv = vec_sub(x_inv, vec_mul(x_inv, (vec_fma(x, x_inv, vec_set1(-1.0f))) ) )
#define VEC_RECIPROCAL_SQRT(x, x_inv) \
x_inv = vec_rsqrt(x); \
x_inv = vec_sub(x_inv, vec_mul(vec_mul(vec_set1(0.5f), x_inv),(vec_fma(x, vec_mul(x_inv, x_inv), vec_set1(-1.0f)))))
/* So what will the vector size be? */ /* So what will the vector size be? */
#ifdef HAVE_AVX512_F #ifdef HAVE_AVX512_F
#define VEC_HAVE_GATHER #define VEC_HAVE_GATHER
...@@ -46,19 +56,33 @@ ...@@ -46,19 +56,33 @@
#define VEC_FLOAT __m512 #define VEC_FLOAT __m512
#define VEC_DBL __m512d #define VEC_DBL __m512d
#define VEC_INT __m512i #define VEC_INT __m512i
#define KNL_MASK_16 __mmask16
#define vec_load(a) _mm512_load_ps(a) #define vec_load(a) _mm512_load_ps(a)
/* Aligned 64-byte store and zero-initialisation helpers (AVX-512F). */
#define vec_store(a,addr) _mm512_store_ps(addr,a)
#define vec_setzero() _mm512_setzero_ps()
#define vec_setintzero() _mm512_setzero_epi32()
#define vec_set1(a) _mm512_set1_ps(a) #define vec_set1(a) _mm512_set1_ps(a)
#define vec_setint1(a) _mm512_set1_epi32(a)
#define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ #define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
_mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a) _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d, e, f, g, h) \ #define vec_dbl_set(a, b, c, d, e, f, g, h) \
_mm512_set_pd(h, g, f, e, d, c, b, a) _mm512_set_pd(h, g, f, e, d, c, b, a)
/* Element-wise arithmetic (AVX-512F); vec_fma(a, b, c) computes a * b + c
 * with a single fused instruction. */
#define vec_add(a, b) _mm512_add_ps(a, b)
#define vec_sub(a, b) _mm512_sub_ps(a, b)
#define vec_mul(a, b) _mm512_mul_ps(a, b)
#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
#define vec_sqrt(a) _mm512_sqrt_ps(a) #define vec_sqrt(a) _mm512_sqrt_ps(a)
#define vec_rcp(a) _mm512_rcp_ps(a) #define vec_rcp(a) _mm512_rcp14_ps(a)
#define vec_rsqrt(a) _mm512_rsqrt_ps(a) #define vec_rsqrt(a) _mm512_rsqrt14_ps(a)
#define vec_ftoi(a) _mm512_cvttps_epi32(a) #define vec_ftoi(a) _mm512_cvttps_epi32(a)
#define vec_fmin(a, b) _mm512_min_ps(a, b) #define vec_fmin(a, b) _mm512_min_ps(a, b)
#define vec_fmax(a, b) _mm512_max_ps(a, b) #define vec_fmax(a, b) _mm512_max_ps(a, b)
#define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a) #define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
/* Rounding and ordered, non-signalling (_OQ) comparisons. On AVX-512 the
 * comparisons return a 16-bit __mmask16 bit-mask, not a float vector.
 * NOTE(review): vec_and operates on float vectors, so its result is not
 * directly composable with the __mmask16 the comparisons return here --
 * verify call sites do not mix the two. */
#define vec_floor(a) _mm512_floor_ps(a)
#define vec_cmp_gt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ)
#define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b,_CMP_LT_OQ)
#define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b,_CMP_LE_OQ)
#define vec_and(a,b) _mm512_and_ps(a, b)
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1)) #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1) #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
...@@ -81,20 +105,30 @@ ...@@ -81,20 +105,30 @@
1) 1)
#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1) #define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
#define FILL_VEC(a) \ #define FILL_VEC(a) \
{ \ { \
.f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \ .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
.f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \ .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
.f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a \ .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a \
} }
/* Horizontal add: sums the 16 float lanes of a.v into the scalar b. */
#define VEC_HADD(a,b) b += _mm512_reduce_add_ps(a.v)
/* On AVX-512 the compress store consumes the bit-mask directly, so no
 * permutation vector is needed; v_mask is intentionally unused and only the
 * number of set bits is accumulated into pack. No trailing semicolon: the
 * caller supplies it, like every other macro here (a trailing ';' plus the
 * caller's ';' would break `if (...) MACRO(...); else ...`). */
#define VEC_FORM_PACKED_MASK(mask,v_mask,pack) pack += __builtin_popcount(mask)
/* Store the lanes of a selected by mask, left-packed, at address result. */
#define VEC_LEFT_PACK(a,mask,result) _mm512_mask_compressstoreu_ps(result, mask, a)
#elif defined(HAVE_AVX) #elif defined(HAVE_AVX)
#define VEC_SIZE 8 #define VEC_SIZE 8
#define VEC_FLOAT __m256 #define VEC_FLOAT __m256
#define VEC_DBL __m256d #define VEC_DBL __m256d
#define VEC_INT __m256i #define VEC_INT __m256i
#define vec_load(a) _mm256_load_ps(a) #define vec_load(a) _mm256_load_ps(a)
/* Aligned 32-byte store and zero-initialisation helpers (AVX). */
#define vec_store(a,addr) _mm256_store_ps(addr,a)
#define vec_setzero() _mm256_setzero_ps()
#define vec_setintzero() _mm256_setzero_si256()
#define vec_set1(a) _mm256_set1_ps(a) #define vec_set1(a) _mm256_set1_ps(a)
#define vec_setint1(a) _mm256_set1_epi32(a)
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a) #define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a) #define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
/* Element-wise arithmetic (AVX). */
#define vec_add(a, b) _mm256_add_ps(a, b)
#define vec_sub(a, b) _mm256_sub_ps(a, b)
#define vec_mul(a, b) _mm256_mul_ps(a, b)
#define vec_sqrt(a) _mm256_sqrt_ps(a) #define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a) #define vec_rcp(a) _mm256_rcp_ps(a)
#define vec_rsqrt(a) _mm256_rsqrt_ps(a) #define vec_rsqrt(a) _mm256_rsqrt_ps(a)
...@@ -102,6 +136,12 @@ ...@@ -102,6 +136,12 @@
#define vec_fmin(a, b) _mm256_min_ps(a, b) #define vec_fmin(a, b) _mm256_min_ps(a, b)
#define vec_fmax(a, b) _mm256_max_ps(a, b) #define vec_fmax(a, b) _mm256_max_ps(a, b)
#define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a) #define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
/* Rounding and ordered, non-signalling (_OQ) comparisons. On AVX the
 * comparisons return a float vector whose lanes are all-ones / all-zeros;
 * vec_cmp_result collapses that into an 8-bit integer mask (one bit per
 * lane) via the sign bits. */
#define vec_floor(a) _mm256_floor_ps(a)
#define vec_cmp_lt(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define vec_cmp_gt(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
#define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define vec_cmp_result(a) _mm256_movemask_ps(a)
#define vec_and(a,b) _mm256_and_ps(a, b)
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0)) #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1)) #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1) #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
...@@ -118,9 +158,61 @@ ...@@ -118,9 +158,61 @@
.f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \ .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
.f[6] = a, .f[7] = a \ .f[6] = a, .f[7] = a \
} }
/* Horizontal add (AVX): after two hadd passes each 128-bit half holds its
 * own partial sum replicated, so the grand total is f[0] + f[4].
 * NOTE: clobbers a.v. Wrapped in do { } while (0) so the three statements
 * act as one (the AVX-512 variant of VEC_HADD already requires the caller
 * to supply the terminating ';'). */
#define VEC_HADD(a,b)                \
  do {                               \
    a.v = _mm256_hadd_ps(a.v, a.v);  \
    a.v = _mm256_hadd_ps(a.v, a.v);  \
    b += a.f[0] + a.f[4];            \
  } while (0)
/* Extract the low / high 128-bit half of a 256-bit vector. */
#define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
#define VEC_GET_HIGH(a) _mm256_extractf128_ps(a,1)
#ifdef HAVE_AVX2 #ifdef HAVE_AVX2
/* Hardware fused multiply-add (available alongside AVX2 on supported CPUs).
 * identity_indices is the byte sequence 7..0 packed little-endian into a
 * 64-bit constant; it is used below to build left-packing permutations. */
#define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c)
#define identity_indices 0x0706050403020100
#define VEC_HAVE_GATHER #define VEC_HAVE_GATHER
#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1) #define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
/* Build the permutation vector v_mask that left-packs the lanes selected by
 * the 8-bit integer mask, and add the number of selected lanes to pack.
 * Uses BMI2: _pdep_u64 spreads the 8 mask bits over 8 bytes, and _pext_u64
 * then gathers the byte indices of the set lanes out of identity_indices.
 * 64-bit types must be spelled explicitly: `unsigned long` is only 32 bits
 * on LLP64 platforms (e.g. 64-bit Windows) and would truncate the masks. */
#define VEC_FORM_PACKED_MASK(mask,v_mask,pack)                                  \
  do {                                                                          \
    unsigned long long expanded_mask = _pdep_u64(mask, 0x0101010101010101ULL);  \
    expanded_mask *= 0xFFULL;                                                   \
    unsigned long long wanted_indices =                                         \
        _pext_u64(identity_indices, expanded_mask);                             \
    __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);                        \
    v_mask = _mm256_cvtepu8_epi32(bytevec);                                     \
    pack += __builtin_popcount(mask);                                           \
  } while (0)
/* Store the lanes of a selected by the permutation vector mask, left-packed,
 * to the 32-byte-aligned address result. */
#define VEC_LEFT_PACK(a,mask,result) *((__m256 *)(result)) = _mm256_permutevar8x32_ps(a,mask)
#endif
/* Fallback fused multiply-add: a * b + c via a separate multiply and add.
 * Not actually fused, so rounding can differ slightly from true FMA. */
#ifndef vec_fma
#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
#endif
#ifndef VEC_FORM_PACKED_MASK
/* Generic fallback: build the packed-index vector by scanning the bit-mask.
 * For every set bit i the lane index i is appended to v_mask.i and pack is
 * incremented; v_mask/pack then drive VEC_LEFT_PACK. Wrapped in
 * do { } while (0) so the macro acts as a single statement. */
#define VEC_FORM_PACKED_MASK(mask,v_mask,pack)       \
  do {                                               \
    for (int i = 0; i < VEC_SIZE; i++) {             \
      if ((mask) & (1 << i)) v_mask.i[pack++] = i;   \
    }                                                \
  } while (0)
/* Same as above but processes two independent (mask, v_mask, pack) triples
 * in a single scan over the lanes. */
#define VEC_FORM_PACKED_MASK_2(mask,v_mask,pack,mask2,v_mask2,pack2) \
  do {                                                               \
    for (int i = 0; i < VEC_SIZE; i++) {                             \
      if ((mask) & (1 << i)) v_mask.i[pack++] = i;                   \
      if ((mask2) & (1 << i)) v_mask2.i[pack2++] = i;                \
    }                                                                \
  } while (0)
#endif
#ifndef VEC_LEFT_PACK
/* AVX (no AVX2) emulation of a cross-lane left-pack store.
 * _mm256_permutevar_ps can only permute within each 128-bit half, so:
 *  - t2 is `a` with its two 128-bit halves swapped;
 *  - r0 permutes the original, r1 permutes the swapped copy;
 *  - k0/k1 shift bit 2 of each lane index into the sign-bit position
 *    (shift by 29), XOR-ing the high half with 4 so the sign bit ends up
 *    set exactly when the wanted element lives in the *other* half;
 *  - blendv then picks, per lane, whichever of r0/r1 holds the element.
 * NOTE(review): assumes `mask` is the permutation-index vector produced by
 * VEC_FORM_PACKED_MASK above (lane indices 0..7), not a bit-mask, and that
 * `result` is 32-byte aligned -- confirm at call sites. */
#define VEC_LEFT_PACK(a,mask,result) \
{ \
__m256 t1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1)); \
__m256 t2 = _mm256_insertf128_ps(t1, _mm256_castps256_ps128(a), 1); \
__m256 r0 = _mm256_permutevar_ps(a, mask); \
__m256 r1 = _mm256_permutevar_ps(t2, mask); \
__m128i k1 = _mm_slli_epi32((__m128i)(_mm_xor_si128((__m128i)VEC_GET_HIGH((__m256)mask),(__m128i)_mm_set1_epi32(4))), 29); \
__m128i k0 = _mm_slli_epi32((__m128i)(VEC_GET_LOW((__m256)mask)), 29); \
__m256 kk = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(k0)), \
_mm_castsi128_ps(k1), 1); \
*((__m256 *)(result)) = _mm256_blendv_ps(r0, r1, kk); \
}
#endif #endif
#elif defined(HAVE_SSE2) #elif defined(HAVE_SSE2)
#define VEC_SIZE 4 #define VEC_SIZE 4
...@@ -128,6 +220,7 @@ ...@@ -128,6 +220,7 @@
#define VEC_DBL __m128d #define VEC_DBL __m128d
#define VEC_INT __m128i #define VEC_INT __m128i
#define vec_load(a) _mm_load_ps(a) #define vec_load(a) _mm_load_ps(a)
#define vec_store(a,addr) _mm_store_ps(addr,a)
#define vec_set1(a) _mm_set1_ps(a) #define vec_set1(a) _mm_set1_ps(a)
#define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a) #define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a)
#define vec_dbl_set(a, b) _mm_set_pd(b, a) #define vec_dbl_set(a, b) _mm_set_pd(b, a)
...@@ -138,6 +231,10 @@ ...@@ -138,6 +231,10 @@
#define vec_fmin(a, b) _mm_min_ps(a, b) #define vec_fmin(a, b) _mm_min_ps(a, b)
#define vec_fmax(a, b) _mm_max_ps(a, b) #define vec_fmax(a, b) _mm_max_ps(a, b)
#define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a) #define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
/* Rounding and comparison helpers (SSE branch).
 * NOTE(review): _mm_floor_ps is an SSE4.1 intrinsic although this branch is
 * guarded by HAVE_SSE2 only -- confirm the build really targets SSE4.1.
 * The comparisons use the classic SSE encodings (_mm_cmplt_ps etc.) rather
 * than the AVX-only _mm_cmp_ps(a, b, imm) form, which does not assemble on
 * plain SSE targets; the result mask is identical (the legacy forms differ
 * only in raising the invalid flag on quiet NaNs, which this code ignores).
 * vec_cmp_gt is provided for parity with the AVX/AVX-512 branches. */
#define vec_floor(a) _mm_floor_ps(a)
#define vec_cmp_lt(a, b) _mm_cmplt_ps(a, b)
#define vec_cmp_gt(a, b) _mm_cmpgt_ps(a, b)
#define vec_cmp_lte(a, b) _mm_cmple_ps(a, b)
#define vec_cmp_result(a) _mm_movemask_ps(a)
#define vec_todbl_lo(a) _mm_cvtps_pd(a) #define vec_todbl_lo(a) _mm_cvtps_pd(a)
#define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a)) #define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a))
#define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)) #define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b))
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment