From 8cd6adc7905f92a3c74910930cf9badfa941e4d1 Mon Sep 17 00:00:00 2001
From: James Willis <james.s.willis@durham.ac.uk>
Date: Tue, 13 Dec 2016 10:56:54 +0000
Subject: [PATCH] Added inline vector functions to calculate the inverse and
 inverse square root.

---
 src/vector.h | 44 ++++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/src/vector.h b/src/vector.h
index 1a1190978e..03fa487557 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -39,18 +39,6 @@
 #define VEC_MACRO(elcount, type) \
   __attribute__((vector_size((elcount) * sizeof(type)))) type
 
-/* Define vector reciprocals. vec_rcp and vec_rsqrt do not have the level of
- * accuracy we need, so an extra two terms are added. */
-#define VEC_RECIPROCAL(x, x_inv) \
-  x_inv = vec_rcp(x);            \
-  x_inv = vec_sub(x_inv, vec_mul(x_inv, (vec_fma(x, x_inv, vec_set1(-1.0f)))))
-
-#define VEC_RECIPROCAL_SQRT(x, x_inv)                \
-  x_inv = vec_rsqrt(x);                              \
-  x_inv = vec_sub(                                   \
-      x_inv, vec_mul(vec_mul(vec_set1(0.5f), x_inv), \
-                     (vec_fma(x, vec_mul(x_inv, x_inv), vec_set1(-1.0f)))))
-
 /* So what will the vector size be? */
 #ifdef HAVE_AVX512_F
 #define VEC_HAVE_GATHER
@@ -268,6 +256,38 @@ typedef union {
   int i[VEC_SIZE];
 } vector;
 
+/**
+ * @brief Calculates the inverse ($1/x$) of a vector: the vec_rcp() estimate y0 is refined by one Newton iteration, y1 = y0 - y0 * (x * y0 - 1), to obtain the correct level of accuracy (assumes vec_fma(a, b, c) is a * b + c; TODO confirm).
+ *
+ * @param x #vector to be inverted.
+ * @return #vector holding the refined approximation of $1/x$.
+ */
+__attribute__((always_inline)) INLINE vector vec_reciprocal(vector x) {
+
+  vector x_inv;
+
+  x_inv.v = vec_rcp(x.v);
+  x_inv.v = vec_sub(x_inv.v, vec_mul(x_inv.v, (vec_fma(x.v, x_inv.v, vec_set1(-1.0f)))));
+
+  return x_inv;
+}
+
+/**
+ * @brief Calculates the inverse square root ($1/\sqrt{x}$) of a vector: the vec_rsqrt() estimate y0 is refined by one Newton iteration, y1 = y0 - 0.5 * y0 * (x * y0 * y0 - 1), to obtain the correct level of accuracy (assumes vec_fma(a, b, c) is a * b + c; TODO confirm).
+ *
+ * @param x #vector whose inverse square root is required.
+ * @return #vector holding the refined approximation of $1/\sqrt{x}$.
+ */
+__attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) {
+
+  vector x_inv;
+
+  x_inv.v = vec_rsqrt(x.v);
+  x_inv.v = vec_sub(x_inv.v, vec_mul(vec_mul(vec_set1(0.5f), x_inv.v), (vec_fma(x.v, vec_mul(x_inv.v, x_inv.v), vec_set1(-1.0f)))));
+  
+  return x_inv;
+}
+
 #else
 /* Needed for cache alignment. */
 #define VEC_SIZE 16
-- 
GitLab