diff --git a/src/vector.h b/src/vector.h index 1a1190978e6bb7319d892c10c0bcb3ff62a2f85a..03fa4875574e28e555f02b32ea00c82f1bf64abd 100644 --- a/src/vector.h +++ b/src/vector.h @@ -39,18 +39,6 @@ #define VEC_MACRO(elcount, type) \ __attribute__((vector_size((elcount) * sizeof(type)))) type -/* Define vector reciprocals. vec_rcp and vec_rsqrt do not have the level of - * accuracy we need, so an extra two terms are added. */ -#define VEC_RECIPROCAL(x, x_inv) \ - x_inv = vec_rcp(x); \ - x_inv = vec_sub(x_inv, vec_mul(x_inv, (vec_fma(x, x_inv, vec_set1(-1.0f))))) - -#define VEC_RECIPROCAL_SQRT(x, x_inv) \ - x_inv = vec_rsqrt(x); \ - x_inv = vec_sub( \ - x_inv, vec_mul(vec_mul(vec_set1(0.5f), x_inv), \ - (vec_fma(x, vec_mul(x_inv, x_inv), vec_set1(-1.0f))))) - /* So what will the vector size be? */ #ifdef HAVE_AVX512_F #define VEC_HAVE_GATHER @@ -268,6 +256,38 @@ typedef union { int i[VEC_SIZE]; } vector; +/** + * @brief Computes the reciprocal ($1/x$) of a vector: the vec_rcp estimate is refined with one Newton iteration to reach the required accuracy. + * + * @param x #vector to be inverted. + * @return x_inv #vector holding the reciprocal of x. + */ +__attribute__((always_inline)) INLINE vector vec_reciprocal(vector x) { + + vector x_inv; + + x_inv.v = vec_rcp(x.v); + x_inv.v = vec_sub(x_inv.v, vec_mul(x_inv.v, (vec_fma(x.v, x_inv.v, vec_set1(-1.0f))))); + + return x_inv; +} + +/** + * @brief Computes the reciprocal square root ($1/\sqrt{x}$) of a vector: the vec_rsqrt estimate is refined with one Newton iteration to reach the required accuracy. + * + * @param x #vector whose reciprocal square root is taken. + * @return x_inv #vector holding $1/\sqrt{x}$. + */ +__attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) { + + vector x_inv; + + x_inv.v = vec_rsqrt(x.v); + x_inv.v = vec_sub(x_inv.v, vec_mul(vec_mul(vec_set1(0.5f), x_inv.v), (vec_fma(x.v, vec_mul(x_inv.v, x_inv.v), vec_set1(-1.0f))))); + + return x_inv; +} + #else /* Needed for cache alignment. */ #define VEC_SIZE 16