From 8cd6adc7905f92a3c74910930cf9badfa941e4d1 Mon Sep 17 00:00:00 2001 From: James Willis <james.s.willis@durham.ac.uk> Date: Tue, 13 Dec 2016 10:56:54 +0000 Subject: [PATCH] Added inline vector functions to calculate the inverse and inverse square root. --- src/vector.h | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/src/vector.h b/src/vector.h index 1a1190978e..03fa487557 100644 --- a/src/vector.h +++ b/src/vector.h @@ -39,18 +39,6 @@ #define VEC_MACRO(elcount, type) \ __attribute__((vector_size((elcount) * sizeof(type)))) type -/* Define vector reciprocals. vec_rcp and vec_rsqrt do not have the level of - * accuracy we need, so an extra two terms are added. */ -#define VEC_RECIPROCAL(x, x_inv) \ - x_inv = vec_rcp(x); \ - x_inv = vec_sub(x_inv, vec_mul(x_inv, (vec_fma(x, x_inv, vec_set1(-1.0f))))) - -#define VEC_RECIPROCAL_SQRT(x, x_inv) \ - x_inv = vec_rsqrt(x); \ - x_inv = vec_sub( \ - x_inv, vec_mul(vec_mul(vec_set1(0.5f), x_inv), \ - (vec_fma(x, vec_mul(x_inv, x_inv), vec_set1(-1.0f))))) - /* So what will the vector size be? */ #ifdef HAVE_AVX512_F #define VEC_HAVE_GATHER @@ -268,6 +256,38 @@ typedef union { int i[VEC_SIZE]; } vector; +/** + * @brief Calculates the reciprocal ($1/x$) of a vector: the vec_rcp estimate is refined by one Newton iteration, x_inv - x_inv * (x * x_inv - 1), because vec_rcp alone does not give the level of accuracy we need (assumes vec_fma(a, b, c) evaluates a * b + c). + * + * @param x #vector to be inverted (its .v member is read). + * @return x_inv #vector holding the refined reciprocal of x in its .v member. + */ +__attribute__((always_inline)) INLINE vector vec_reciprocal(vector x) { + + vector x_inv; + + x_inv.v = vec_rcp(x.v); + x_inv.v = vec_sub(x_inv.v, vec_mul(x_inv.v, (vec_fma(x.v, x_inv.v, vec_set1(-1.0f))))); + + return x_inv; +} + +/** + * @brief Calculates the inverse square root ($1/\sqrt{x}$) of a vector: the vec_rsqrt estimate is refined by one Newton iteration, x_inv - 0.5 * x_inv * (x * x_inv * x_inv - 1), because vec_rsqrt alone does not give the level of accuracy we need (assumes vec_fma(a, b, c) evaluates a * b + c). + * + * @param x #vector whose inverse square root is taken (its .v member is read). + * @return x_inv #vector holding the refined inverse square root of x in its .v member. 
+ */ +__attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) { + + vector x_inv; + + x_inv.v = vec_rsqrt(x.v); + x_inv.v = vec_sub(x_inv.v, vec_mul(vec_mul(vec_set1(0.5f), x_inv.v), (vec_fma(x.v, vec_mul(x_inv.v, x_inv.v), vec_set1(-1.0f))))); + + return x_inv; +} + #else /* Needed for cache alignment. */ #define VEC_SIZE 16 -- GitLab