Merge branch 'vectorise_kernel' into 'master'

Vectorise kernel Should fix issue #154. Includes updates to `testKernel.c` that test `kernel_deval_vec` against the serial `kernel_deval`. I have tested it with Cubic Spline, Wendland C2 + C6 kernels. See merge request !150

Merge branch 'vectorise_kernel' into 'master'
efdf7330 · Matthieu Schaller · ef91f6c8 · e78b25f5 · efdf7330 · efdf7330
Commit efdf7330 authored 9 years ago by Matthieu Schaller
--- a/src/kernel_hydro.h
+++ b/src/kernel_hydro.h
@@ -212,6 +212,61 @@ __attribute__((always_inline)) INLINE static void kernel_eval(float u,
  *W = w * (float)kernel_constant * (float)kernel_igamma3;
 }

+#ifdef VECTORIZE
+
+static const vector kernel_igamma_vec = FILL_VEC((float)kernel_igamma);
+
+static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
+
+static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
+
+static const vector kernel_igamma3_vec = FILL_VEC((float)kernel_igamma3);
+
+static const vector kernel_igamma4_vec = FILL_VEC((float)kernel_igamma4);
+
+/**
+ * @brief Computes the kernel function and its derivative (Vectorised version).
+ *
+ * Return 0 if $u > \\gamma = H/h$
+ *
+ * @param u The ratio of the distance to the smoothing length $u = x/h$.
+ * @param w (return) The value of the kernel function $W(x,h)$.
+ * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
+ */
+__attribute__((always_inline))
+    INLINE static void kernel_deval_vec(vector *u, vector *w, vector *dw_dx) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  vector x;
+  x.v = u->v * kernel_igamma_vec.v;
+
+  /* Load x and get the interval id. */
+  vector ind;
+  ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v));
+
+  /* load the coefficients. */
+  vector c[kernel_degree + 1];
+  for (int k = 0; k < VEC_SIZE; k++)
+    for (int j = 0; j < kernel_degree + 1; j++)
+      c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
+
+  /* Init the iteration for Horner's scheme. */
+  w->v = (c[0].v * x.v) + c[1].v;
+  dw_dx->v = c[0].v;
+
+  /* And we're off! */
+  for (int k = 2; k <= kernel_degree; k++) {
+    dw_dx->v = (dw_dx->v * x.v) + w->v;
+    w->v = (x.v * w->v) + c[k].v;
+  }
+
+  /* Return everything */
+  w->v = w->v * kernel_constant_vec.v * kernel_igamma3_vec.v;
+  dw_dx->v = dw_dx->v * kernel_constant_vec.v * kernel_igamma4_vec.v;
+}
+
+#endif
+
 /* Some cross-check functions */
 void hydro_kernel_dump(int N);


--- a/src/vector.h
+++ b/src/vector.h
@@ -79,6 +79,12 @@
                            _mm512_set1_epi64(ptrs[0])),                    \
      1)
 #define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
+#define FILL_VEC(a)                                                     \
+  {                                                                     \
+    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a,   \
+    .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
+    .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a                      \
+  }
 #elif defined(NO__AVX__)
 #define VECTORIZE
 #define VEC_SIZE 8
@@ -107,6 +113,11 @@
 #define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
 #define vec_dbl_fmin(a, b) _mm256_min_pd(a, b)
 #define vec_dbl_fmax(a, b) _mm256_max_pd(a, b)
+#define FILL_VEC(a)                                                   \
+  {                                                                   \
+    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
+    .f[6] = a, .f[7] = a                                              \
+  }
 #ifdef __AVX2__
 #define VEC_HAVE_GATHER
 #define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
@@ -139,6 +150,8 @@
 #define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
 #define vec_dbl_fmin(a, b) _mm_min_pd(a, b)
 #define vec_dbl_fmax(a, b) _mm_max_pd(a, b)
+#define FILL_VEC(a) \
+  { .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a }
 #else
 #define VEC_SIZE 4
 #endif

--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -22,7 +22,7 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS)

 # List of programs and scripts to run in the test suite
 TESTS = testGreetings testReading.sh testSingle testPair.sh testPairPerturbed.sh \
-	test27cells.sh test27cellsPerturbed.sh testParser.sh
+	test27cells.sh test27cellsPerturbed.sh testParser.sh testKernel

 # List of test programs to compile
 check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \

--- a/tests/testKernel.c
+++ b/tests/testKernel.c
 /*******************************************************************************
 * This file is part of SWIFT.
- * Copyright (C) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
+ * Copyright (C) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
+ *                    James Willis (james.s.willis@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
@@ -17,21 +18,64 @@
 *
 ******************************************************************************/

+#define NO__AVX__
+#include "vector.h"
+
 #include "swift.h"
+#include "kernel_hydro.h"
+
+#define numPoints 64

 int main() {

  const float h = const_eta_kernel;
-  const int numPoints = 30;
+  float W[numPoints] = {0.f};
+  float dW[numPoints] = {0.f};
+
+  printf("\nSerial Output\n");
+  printf("-------------\n");

  for (int i = 0; i < numPoints; ++i) {

-    const float x = i * 3.f / numPoints;
-    float W, dW;
-    kernel_deval(x / h, &W, &dW);
+    const float x = i * 2.5f / numPoints;
+    kernel_deval(x / h, &W[i], &dW[i]);
+
+    printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i, h,
+           h * kernel_gamma, x, W[i], dW[i]);
+  }
+
+  printf("\nVector Output for VEC_SIZE=%d\n", VEC_SIZE);
+  printf("-------------\n");
+  for (int i = 0; i < numPoints; i += VEC_SIZE) {
+
+    vector vx, vx_h;
+    vector W_vec, dW_vec;
+
+    for (int j = 0; j < VEC_SIZE; j++) {
+      vx.f[j] = (i + j) * 2.5f / numPoints;
+    }
+
+    vx_h.v = vx.v / vec_set1(h);
+
+    kernel_deval_vec(&vx_h, &W_vec, &dW_vec);
+
+    for (int j = 0; j < VEC_SIZE; j++) {
+      printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i + j, h,
+             h * kernel_gamma, vx.f[j], W_vec.f[j], dW_vec.f[j]);

-    printf("h= %f H= %f x=%f W(x,h)=%f\n", h, h * kernel_gamma, x, W);
+      if (fabsf(W_vec.f[j] - W[i + j]) > 2e-7) {
+        printf("Invalid value ! scalar= %e, vector= %e\n", W[i + j],
+               W_vec.f[j]);
+        return 1;
+      }
+      if (fabsf(dW_vec.f[j] - dW[i + j]) > 2e-7) {
+        printf("Invalid value ! scalar= %e, vector= %e\n", dW[i + j],
+               dW_vec.f[j]);
+        return 1;
+      }
+    }
  }

+  printf("\nAll values are consistent\n");
  return 0;
 }