Commit 8cd6adc7 authored by James Willis's avatar James Willis
Browse files

Added inline vector functions to calculate the inverse and inverse square root.

parent a6e392c5
......@@ -39,18 +39,6 @@
/* Spells a vector-extension type: the element type qualified with a
 * vector_size attribute of (element count) * sizeof(element type) bytes. */
#define VEC_MACRO(n_elem, elem_type) \
  __attribute__((vector_size((n_elem) * sizeof(elem_type)))) elem_type
/* Define vector reciprocals. vec_rcp and vec_rsqrt do not have the level of
 * accuracy we need, so their estimates are refined with one Newton iteration.
 *
 * VEC_RECIPROCAL(x, x_inv) stores the refined 1/x in x_inv:
 *   x_inv <- x_inv - x_inv * (x * x_inv - 1)
 * (vec_fma(a, b, c) is taken to compute a * b + c.)
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement and stays correct under an unbraced if/else or loop. Both
 * arguments are evaluated more than once, so they must have no side effects;
 * x_inv must be an assignable lvalue. */
#define VEC_RECIPROCAL(x, x_inv) \
  do { \
    (x_inv) = vec_rcp(x); \
    (x_inv) = vec_sub( \
        (x_inv), vec_mul((x_inv), vec_fma((x), (x_inv), vec_set1(-1.0f)))); \
  } while (0)
/* VEC_RECIPROCAL_SQRT(x, x_inv) stores 1/sqrt(x) in x_inv: the vec_rsqrt
 * estimate refined with one Newton iteration,
 *   x_inv <- x_inv - 0.5 * x_inv * (x * x_inv^2 - 1)
 * (vec_fma(a, b, c) is taken to compute a * b + c.)
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement and stays correct under an unbraced if/else or loop. Both
 * arguments are evaluated more than once, so they must have no side effects;
 * x_inv must be an assignable lvalue. */
#define VEC_RECIPROCAL_SQRT(x, x_inv) \
  do { \
    (x_inv) = vec_rsqrt(x); \
    (x_inv) = vec_sub( \
        (x_inv), \
        vec_mul(vec_mul(vec_set1(0.5f), (x_inv)), \
                vec_fma((x), vec_mul((x_inv), (x_inv)), vec_set1(-1.0f)))); \
  } while (0)
/* So what will the vector size be? */
#ifdef HAVE_AVX512_F
#define VEC_HAVE_GATHER
......@@ -268,6 +256,38 @@ typedef union {
int i[VEC_SIZE];
} vector;
/**
 * @brief Computes the inverse ($1/x$) of a vector: the estimate returned by
 * vec_rcp is refined with one Newton iteration to reach the required accuracy.
 *
 * @param x #vector to be inverted.
 * @return #vector holding the refined inverse of x.
 */
__attribute__((always_inline)) INLINE vector vec_reciprocal(vector x) {

  /* Starting estimate of 1/x from the intrinsic. */
  vector estimate;
  estimate.v = vec_rcp(x.v);

  /* Residual of the estimate; vec_fma(a, b, c) is taken to be a * b + c,
   * i.e. x * estimate - 1. */
  vector residual;
  residual.v = vec_fma(x.v, estimate.v, vec_set1(-1.0f));

  /* Newton correction: estimate - estimate * residual. */
  vector refined;
  refined.v = vec_sub(estimate.v, vec_mul(estimate.v, residual.v));

  return refined;
}
/**
 * @brief Computes the inverse square root ($1/\sqrt{x}$) of a vector: the
 * estimate returned by vec_rsqrt is refined with one Newton iteration to reach
 * the required accuracy.
 *
 * @param x #vector whose inverse square root is wanted.
 * @return #vector holding the refined $1/\sqrt{x}$.
 */
__attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) {

  /* Starting estimate of 1/sqrt(x) from the intrinsic. */
  vector estimate;
  estimate.v = vec_rsqrt(x.v);

  /* Half of the estimate, the prefactor of the correction term. */
  vector half_estimate;
  half_estimate.v = vec_mul(vec_set1(0.5f), estimate.v);

  /* Residual of the estimate; vec_fma(a, b, c) is taken to be a * b + c,
   * i.e. x * estimate^2 - 1. */
  vector residual;
  residual.v = vec_fma(x.v, vec_mul(estimate.v, estimate.v), vec_set1(-1.0f));

  /* Newton correction: estimate - (0.5 * estimate) * residual. */
  vector refined;
  refined.v = vec_sub(estimate.v, vec_mul(half_estimate.v, residual.v));

  return refined;
}
#else
/* Needed for cache alignment. */
#define VEC_SIZE 16
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment