Commit 240b36fb authored by James Willis's avatar James Willis
Browse files

Improved documentation.

parent d200065c
......@@ -23,6 +23,8 @@
/* Have I already read this file? */
#ifndef VEC_MACRO
/*TODO: Tidy this file up with comments.*/
#include "../config.h"
#ifdef WITH_VECTORIZATION
......@@ -40,6 +42,8 @@
__attribute__((vector_size((elcount) * sizeof(type)))) type
/* So what will the vector size be? */
/* AVX-512 intrinsics*/
#ifdef HAVE_AVX512_F
#define VEC_HAVE_GATHER
#define VEC_SIZE 16
......@@ -94,17 +98,27 @@
_mm512_set1_epi64(ptrs[0])), \
1)
#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
/* Initialises a vector struct with a default value. */
#define FILL_VEC(a) \
{ \
.f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
.f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
.f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a \
}
/* Performs a horizontal add on the vector and adds the result to a float. */
#define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v)
/* Calculates the number of set bits in the mask and adds the result to an int. */
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
pack += __builtin_popcount(mask);
/* Performs a left-pack on a vector based upon a mask and returns the result. */
#define VEC_LEFT_PACK(a, mask, result) \
_mm512_mask_compressstoreu_ps(result, mask, a)
/* AVX intrinsics */
#elif defined(HAVE_AVX)
#define VEC_SIZE 8
#define VEC_FLOAT __m256
......@@ -146,22 +160,38 @@
#define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm256_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm256_max_pd(a, b)
/* Initialises a vector struct with a default value. */
#define FILL_VEC(a) \
{ \
.f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
.f[6] = a, .f[7] = a \
}
/* Performs a horizontal add on the vector and adds the result to a float. */
#define VEC_HADD(a, b) \
a.v = _mm256_hadd_ps(a.v, a.v); \
a.v = _mm256_hadd_ps(a.v, a.v); \
b += a.f[0] + a.f[4];
/* Returns the lower 128-bits of the 256-bit vector. */
#define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
/* Returns the higher 128-bits of the 256-bit vector. */
#define VEC_GET_HIGH(a) _mm256_extractf128_ps(a, 1)
/* Check if we have AVX2 intrinsics alongside AVX */
#ifdef HAVE_AVX2
#define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c)
/* Used in VEC_FORM_PACKED_MASK */
#define identity_indices 0x0706050403020100
#define VEC_HAVE_GATHER
#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
/* Takes an integer mask and forms a left-packed integer vector
* containing indices of the set bits in the integer mask.
* Also returns the total number of bits set in the mask. */
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
{ \
unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101); \
......@@ -171,18 +201,32 @@
v_mask = _mm256_cvtepu8_epi32(bytevec); \
pack += __builtin_popcount(mask); \
}
/* Performs a left-pack on a vector based upon a mask and returns the result. */
#define VEC_LEFT_PACK(a, mask, result) \
vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask), result)
#endif
#endif /* HAVE_AVX2 */
/* Create an FMA using vec_add and vec_mul if AVX2 is not present. */
#ifndef vec_fma
#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
#endif
/* Form a packed mask without intrinsics if AVX2 is not present. */
#ifndef VEC_FORM_PACKED_MASK
/* Takes an integer mask and forms a left-packed integer vector
* containing indices of the set bits in the integer mask.
* Also returns the total number of bits set in the mask. */
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
{ \
for (int i = 0; i < VEC_SIZE; i++) \
if ((mask & (1 << i))) v_mask.i[pack++] = i; \
}
/* Takes two integer masks and forms two left-packed integer vectors
* containing indices of the set bits in each corresponding integer mask.
* Also returns the total number of bits set in the mask. */
#define VEC_FORM_PACKED_MASK_2(mask, v_mask, pack, mask2, v_mask2, pack2) \
{ \
for (int i = 0; i < VEC_SIZE; i++) { \
......@@ -191,6 +235,10 @@
} \
}
#endif
/* Performs a left-pack on a vector based upon a mask and returns the result. */
/* This uses AVX intrinsics, but this is slower than performing the left-pack
* manually by looping over the vectors. */
#ifndef VEC_LEFT_PACK
#define VEC_LEFT_PACK(a, mask, result) \
{ \
......@@ -208,7 +256,9 @@
_mm_castsi128_ps(k1), 1); \
*((__m256 *)(result)) = _mm256_blendv_ps(r0, r1, kk); \
}
#endif
#endif /* HAVE_AVX2 */
/* SSE intrinsics*/
#elif defined(HAVE_SSE2)
#define VEC_SIZE 4
#define VEC_FLOAT __m128
......@@ -249,17 +299,23 @@
#define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm_max_pd(a, b)
/* Initialises a vector struct with a default value. */
#define FILL_VEC(a) \
{ .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a }
/* Performs a horizontal add on the vector and adds the result to a float. */
#define VEC_HADD(a, b) \
a.v = _mm_hadd_ps(a.v, a.v); \
b += a.f[0] + a.f[1];
/* Create an FMA using vec_add and vec_mul if AVX2 is not present. */
#ifndef vec_fma
#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
#endif
#else
#define VEC_SIZE 4
#endif
#endif /* HAVE_SSE2 */
/* Define the composite types for element access. */
typedef union {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment