Commit efdf7330 authored by Matthieu Schaller's avatar Matthieu Schaller
Browse files

Merge branch 'vectorise_kernel' into 'master'

Vectorise kernel

Should fix issue #154. Includes updates to `testKernel.c` that test `kernel_deval_vec` against the serial `kernel_deval`.
I have tested it with Cubic Spline, Wendland C2 + C6 kernels.

See merge request !150
parents ef91f6c8 e78b25f5
......@@ -212,6 +212,61 @@ __attribute__((always_inline)) INLINE static void kernel_eval(float u,
*W = w * (float)kernel_constant * (float)kernel_igamma3;
}
#ifdef VECTORIZE
static const vector kernel_igamma_vec = FILL_VEC((float)kernel_igamma);
static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
static const vector kernel_igamma3_vec = FILL_VEC((float)kernel_igamma3);
static const vector kernel_igamma4_vec = FILL_VEC((float)kernel_igamma4);
/**
* @brief Computes the kernel function and its derivative (Vectorised version).
*
* Return 0 if $u > \\gamma = H/h$
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
* @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
*/
__attribute__((always_inline))
INLINE static void kernel_deval_vec(vector *u, vector *w, vector *dw_dx) {
/* Go to the range [0,1[ from [0,H[ */
vector x;
x.v = u->v * kernel_igamma_vec.v;
/* Load x and get the interval id. */
vector ind;
ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v));
/* load the coefficients. */
vector c[kernel_degree + 1];
for (int k = 0; k < VEC_SIZE; k++)
for (int j = 0; j < kernel_degree + 1; j++)
c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
/* Init the iteration for Horner's scheme. */
w->v = (c[0].v * x.v) + c[1].v;
dw_dx->v = c[0].v;
/* And we're off! */
for (int k = 2; k <= kernel_degree; k++) {
dw_dx->v = (dw_dx->v * x.v) + w->v;
w->v = (x.v * w->v) + c[k].v;
}
/* Return everything */
w->v = w->v * kernel_constant_vec.v * kernel_igamma3_vec.v;
dw_dx->v = dw_dx->v * kernel_constant_vec.v * kernel_igamma4_vec.v;
}
#endif
/* Some cross-check functions */
void hydro_kernel_dump(int N);
......
......@@ -79,6 +79,12 @@
_mm512_set1_epi64(ptrs[0])), \
1)
#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
#define FILL_VEC(a) \
{ \
.f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
.f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
.f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a \
}
#elif defined(NO__AVX__)
#define VECTORIZE
#define VEC_SIZE 8
......@@ -107,6 +113,11 @@
#define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm256_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm256_max_pd(a, b)
#define FILL_VEC(a) \
{ \
.f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
.f[6] = a, .f[7] = a \
}
#ifdef __AVX2__
#define VEC_HAVE_GATHER
#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
......@@ -139,6 +150,8 @@
#define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm_max_pd(a, b)
#define FILL_VEC(a) \
{ .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a }
#else
#define VEC_SIZE 4
#endif
......
......@@ -22,7 +22,7 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS)
# List of programs and scripts to run in the test suite
TESTS = testGreetings testReading.sh testSingle testPair.sh testPairPerturbed.sh \
test27cells.sh test27cellsPerturbed.sh testParser.sh
test27cells.sh test27cellsPerturbed.sh testParser.sh testKernel
# List of test programs to compile
check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \
......
/*******************************************************************************
* This file is part of SWIFT.
* Copyright (C) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk).
* Copyright (C) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
* James Willis (james.s.willis@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
......@@ -17,21 +18,64 @@
*
******************************************************************************/
#define NO__AVX__
#include "vector.h"
#include "swift.h"
#include "kernel_hydro.h"
#define numPoints 64
int main() {
const float h = const_eta_kernel;
const int numPoints = 30;
float W[numPoints] = {0.f};
float dW[numPoints] = {0.f};
printf("\nSerial Output\n");
printf("-------------\n");
for (int i = 0; i < numPoints; ++i) {
const float x = i * 3.f / numPoints;
float W, dW;
kernel_deval(x / h, &W, &dW);
const float x = i * 2.5f / numPoints;
kernel_deval(x / h, &W[i], &dW[i]);
printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i, h,
h * kernel_gamma, x, W[i], dW[i]);
}
printf("\nVector Output for VEC_SIZE=%d\n", VEC_SIZE);
printf("-------------\n");
for (int i = 0; i < numPoints; i += VEC_SIZE) {
vector vx, vx_h;
vector W_vec, dW_vec;
for (int j = 0; j < VEC_SIZE; j++) {
vx.f[j] = (i + j) * 2.5f / numPoints;
}
vx_h.v = vx.v / vec_set1(h);
kernel_deval_vec(&vx_h, &W_vec, &dW_vec);
for (int j = 0; j < VEC_SIZE; j++) {
printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i + j, h,
h * kernel_gamma, vx.f[j], W_vec.f[j], dW_vec.f[j]);
printf("h= %f H= %f x=%f W(x,h)=%f\n", h, h * kernel_gamma, x, W);
if (fabsf(W_vec.f[j] - W[i + j]) > 2e-7) {
printf("Invalid value ! scalar= %e, vector= %e\n", W[i + j],
W_vec.f[j]);
return 1;
}
if (fabsf(dW_vec.f[j] - dW[i + j]) > 2e-7) {
printf("Invalid value ! scalar= %e, vector= %e\n", dW[i + j],
dW_vec.f[j]);
return 1;
}
}
}
printf("\nAll values are consistent\n");
return 0;
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment