diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index 66f51391fb9504ba30363b1980aaad1fcc9174b7..3662510fbcd3817d338d76bc89b1334e3c0cd395 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -212,6 +212,61 @@ __attribute__((always_inline)) INLINE static void kernel_eval(float u, *W = w * (float)kernel_constant * (float)kernel_igamma3; } +#ifdef VECTORIZE + +static const vector kernel_igamma_vec = FILL_VEC((float)kernel_igamma); + +static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals); + +static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant); + +static const vector kernel_igamma3_vec = FILL_VEC((float)kernel_igamma3); + +static const vector kernel_igamma4_vec = FILL_VEC((float)kernel_igamma4); + +/** + * @brief Computes the kernel function and its derivative (Vectorised version). + * + * Return 0 if $u > \\gamma = H/h$ + * + * @param u The ratio of the distance to the smoothing length $u = x/h$. + * @param w (return) The value of the kernel function $W(x,h)$. + * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$. + */ +__attribute__((always_inline)) + INLINE static void kernel_deval_vec(vector *u, vector *w, vector *dw_dx) { + + /* Go to the range [0,1[ from [0,H[ */ + vector x; + x.v = u->v * kernel_igamma_vec.v; + + /* Load x and get the interval id. */ + vector ind; + ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v)); + + /* load the coefficients. */ + vector c[kernel_degree + 1]; + for (int k = 0; k < VEC_SIZE; k++) + for (int j = 0; j < kernel_degree + 1; j++) + c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j]; + + /* Init the iteration for Horner's scheme. */ + w->v = (c[0].v * x.v) + c[1].v; + dw_dx->v = c[0].v; + + /* And we're off! */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx->v = (dw_dx->v * x.v) + w->v; + w->v = (x.v * w->v) + c[k].v; + } + + /* Return everything */ + w->v = w->v * kernel_constant_vec.v * kernel_igamma3_vec.v; + dw_dx->v = dw_dx->v * kernel_constant_vec.v * kernel_igamma4_vec.v; +} + +#endif + /* Some cross-check functions */ void hydro_kernel_dump(int N); diff --git a/src/vector.h b/src/vector.h index ef2b7c4b9e42ceb61dc38c3196c1819be652926f..fa311f121f7b702f2288be0d561e520b52330457 100644 --- a/src/vector.h +++ b/src/vector.h @@ -79,6 +79,12 @@ _mm512_set1_epi64(ptrs[0])), \ 1) #define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1) +#define FILL_VEC(a) \ + { \ + .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \ + .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \ + .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a \ + } #elif defined(NO__AVX__) #define VECTORIZE #define VEC_SIZE 8 @@ -107,6 +113,11 @@ #define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a) #define vec_dbl_fmin(a, b) _mm256_min_pd(a, b) #define vec_dbl_fmax(a, b) _mm256_max_pd(a, b) +#define FILL_VEC(a) \ + { \ + .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \ + .f[6] = a, .f[7] = a \ + } #ifdef __AVX2__ #define VEC_HAVE_GATHER #define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1) @@ -139,6 +150,8 @@ #define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a) #define vec_dbl_fmin(a, b) _mm_min_pd(a, b) #define vec_dbl_fmax(a, b) _mm_max_pd(a, b) +#define FILL_VEC(a) \ + { .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a } #else #define VEC_SIZE 4 #endif diff --git a/tests/Makefile.am b/tests/Makefile.am index b53a08615c5a8c7c2c31475bf7207522f8b9a58c..d0c132ad1b6dadd749a389fb71b873120b48139a 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -22,7 +22,7 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) # List of programs and scripts to run in the test suite TESTS = testGreetings testReading.sh testSingle testPair.sh testPairPerturbed.sh \ - test27cells.sh test27cellsPerturbed.sh testParser.sh + test27cells.sh test27cellsPerturbed.sh testParser.sh testKernel # List of test programs to compile check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \ diff --git a/tests/testKernel.c b/tests/testKernel.c index 5ad9cc81ea92e6ef9487489c5d560abf414e38df..5a5df57f293348cf865eb4c9fd88f22e906f089b 100644 --- a/tests/testKernel.c +++ b/tests/testKernel.c @@ -1,6 +1,7 @@ /******************************************************************************* * This file is part of SWIFT. - * Copyright (C) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk). + * Copyright (C) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * James Willis (james.s.willis@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -17,21 +18,64 @@ * ******************************************************************************/ +#define NO__AVX__ +#include "vector.h" + #include "swift.h" +#include "kernel_hydro.h" + +#define numPoints 64 int main() { const float h = const_eta_kernel; - const int numPoints = 30; + float W[numPoints] = {0.f}; + float dW[numPoints] = {0.f}; + + printf("\nSerial Output\n"); + printf("-------------\n"); for (int i = 0; i < numPoints; ++i) { - const float x = i * 3.f / numPoints; - float W, dW; - kernel_deval(x / h, &W, &dW); + const float x = i * 2.5f / numPoints; + kernel_deval(x / h, &W[i], &dW[i]); + + printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i, h, + h * kernel_gamma, x, W[i], dW[i]); + } + + printf("\nVector Output for VEC_SIZE=%d\n", VEC_SIZE); + printf("-------------\n"); + for (int i = 0; i < numPoints; i += VEC_SIZE) { + + vector vx, vx_h; + vector W_vec, dW_vec; + + for (int j = 0; j < VEC_SIZE; j++) { + vx.f[j] = (i + j) * 2.5f / numPoints; + } + + vx_h.v = vx.v / vec_set1(h); + + kernel_deval_vec(&vx_h, &W_vec, &dW_vec); + + for (int j = 0; j < VEC_SIZE; j++) { + printf("%2d: h= %f H= %f x=%f W(x,h)=%f dW(x,h)=%f\n", i + j, h, + h * kernel_gamma, vx.f[j], W_vec.f[j], dW_vec.f[j]); - printf("h= %f H= %f x=%f W(x,h)=%f\n", h, h * kernel_gamma, x, W); + if (fabsf(W_vec.f[j] - W[i + j]) > 2e-7) { + printf("Invalid value ! scalar= %e, vector= %e\n", W[i + j], + W_vec.f[j]); + return 1; + } + if (fabsf(dW_vec.f[j] - dW[i + j]) > 2e-7) { + printf("Invalid value ! scalar= %e, vector= %e\n", dW[i + j], + dW_vec.f[j]); + return 1; + } + } } + printf("\nAll values are consistent\n"); return 0; }