Commit 0bb4b74f authored by Matthieu Schaller

Merge branch 'AVX512-Fixes' into 'master'

AVX512 fixes

* Adds a generic mask type, `mask_t`, for each instruction set (AVX, AVX2, AVX512); a minimal sketch of the idea is given after this list
* Removes testPair and testPairPerturbed, which are superseded by test27cells and test27cellsPerturbed
* Adds testActivePair, which computes the density between a pair of cells in various configurations of active particles (face, edge, corner)
* Replaces all infix arithmetic vector operations with explicit `vec_*` calls to support AVX512, as there is no compiler support for overloading the operators
* Computes `max_index_i[]` for each particle in `runner_dopair1_density_vec`, following the ParCo paper
* testInteractions now calls the correct vectorised density interaction functions and is run as part of the test suite
* Solves #327
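
A minimal sketch of the `mask_t` idea (illustrative only, not the exact contents of `vector.h`): under AVX-512 a mask is a dedicated register type produced directly by the comparison intrinsics, while under AVX/AVX2 it remains an ordinary float vector of all-ones/all-zeros lanes, and `vec_create_mask` gives both cases one calling convention.

```c
#include <immintrin.h>

#ifdef HAVE_AVX512_F
/* AVX-512: comparisons yield a dedicated 16-bit mask register. */
typedef __mmask16 mask_t;
#define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
#else
/* AVX/AVX2: a "mask" is a float vector whose lanes are all-ones
 * or all-zeros, as produced by a vector comparison. */
typedef __m256 mask_t;
#define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#endif

/* One calling convention for both cases. */
#define vec_create_mask(mask, cond) ((mask) = (cond))
```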


See merge request !396
parents bac1b1dd 0971ff35
@@ -34,7 +34,7 @@ examples/*/*/*.txt
examples/*/*/used_parameters.yml
examples/*/gravity_checks_*.dat
tests/testPair
tests/testActivePair
tests/brute_force_periodic_BC_standard.dat
tests/swift_periodic_BC_standard.dat
tests/brute_force_periodic_BC_pertrubed.dat
@@ -54,6 +54,11 @@ tests/brute_force_125_standard.dat
tests/swift_dopair_125_standard.dat
tests/brute_force_125_perturbed.dat
tests/swift_dopair_125_perturbed.dat
tests/brute_force_active.dat
tests/brute_force_periodic_BC_perturbed.dat
tests/swift_dopair_active.dat
tests/test_nonsym_density_serial.dat
tests/test_nonsym_density_vec.dat
tests/testGreetings
tests/testReading
tests/input.hdf5
@@ -75,8 +80,6 @@ tests/test27cells.sh
tests/test27cellsPerturbed.sh
tests/test125cells.sh
tests/test125cellsPerturbed.sh
tests/testPair.sh
tests/testPairPerturbed.sh
tests/testParser.sh
tests/testReading.sh
tests/testAdiabaticIndex
......
@@ -861,14 +861,14 @@ AM_CONDITIONAL([HAVE_DOXYGEN], [test "$ac_cv_path_ac_pt_DX_DOXYGEN" != ""])
# Handle .in files.
AC_CONFIG_FILES([Makefile src/Makefile examples/Makefile doc/Makefile doc/Doxyfile tests/Makefile])
AC_CONFIG_FILES([tests/testReading.sh], [chmod +x tests/testReading.sh])
AC_CONFIG_FILES([tests/testPair.sh], [chmod +x tests/testPair.sh])
AC_CONFIG_FILES([tests/testPairPerturbed.sh], [chmod +x tests/testPairPerturbed.sh])
AC_CONFIG_FILES([tests/testActivePair.sh], [chmod +x tests/testActivePair.sh])
AC_CONFIG_FILES([tests/test27cells.sh], [chmod +x tests/test27cells.sh])
AC_CONFIG_FILES([tests/test27cellsPerturbed.sh], [chmod +x tests/test27cellsPerturbed.sh])
AC_CONFIG_FILES([tests/test125cells.sh], [chmod +x tests/test125cells.sh])
AC_CONFIG_FILES([tests/test125cellsPerturbed.sh], [chmod +x tests/test125cellsPerturbed.sh])
AC_CONFIG_FILES([tests/testPeriodicBC.sh], [chmod +x tests/testPeriodicBC.sh])
AC_CONFIG_FILES([tests/testPeriodicBCPerturbed.sh], [chmod +x tests/testPeriodicBCPerturbed.sh])
AC_CONFIG_FILES([tests/testInteractions.sh], [chmod +x tests/testInteractions.sh])
AC_CONFIG_FILES([tests/testParser.sh], [chmod +x tests/testParser.sh])
# Save the compilation options
......
@@ -63,8 +63,8 @@ struct cache {
/* Particle z velocity. */
float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
/* Maximum distance of particles into neighbouring cell. */
float *restrict max_d __attribute__((aligned(CACHE_ALIGN)));
/* Maximum index into neighbouring cell for particles that are in range. */
int *restrict max_index __attribute__((aligned(CACHE_ALIGN)));
/* Cache size. */
int count;
@@ -111,9 +111,10 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
/* Align the cache on the correct byte boundary, pad the cache size to a
 * multiple of the vector size and include two extra vector lengths for
 * remainder operations. */
unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
size_t pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
if (rem > 0) pad += VEC_SIZE - rem;
unsigned int sizeBytes = (count + pad) * sizeof(float);
size_t sizeBytes = (count + pad) * sizeof(float);
size_t sizeIntBytes = (count + pad) * sizeof(int);
int error = 0;
/* Free memory if cache has already been allocated. */
@@ -126,7 +127,7 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
free(c->vy);
free(c->vz);
free(c->h);
free(c->max_d);
free(c->max_index);
}
error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes);
@@ -137,7 +138,7 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->max_index, CACHE_ALIGN, sizeIntBytes);
if (error != 0)
error("Couldn't allocate cache, no. of particles: %d", (int)count);
@@ -430,7 +431,7 @@ static INLINE void cache_clean(struct cache *c) {
free(c->vy);
free(c->vz);
free(c->h);
free(c->max_d);
free(c->max_index);
}
}
......
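
The `max_index` array replacing `max_d` above supports the ParCo-style early exit mentioned in the description. A hedged sketch of the idea (names and signature are illustrative, not the SWIFT code): with both cells sorted along the pair axis, one monotone pass records, for every particle of the first cell, the last particle of the second cell it can possibly reach, so the interaction loop can stop there instead of scanning the whole cell.

```c
/* Illustrative only: d_i[]/d_j[] are positions projected on the pair
 * axis (sorted ascending) and h_max is the largest compact-support
 * radius, which makes the bound conservative but monotone. */
void compute_max_index(const float *d_i, int count_i, const float *d_j,
                       int count_j, float h_max, int *max_index) {
  int j = 0;
  for (int i = 0; i < count_i; i++) {
    /* Advance while the next j-particle is still within reach of i. */
    while (j < count_j - 1 && d_j[j + 1] < d_i[i] + h_max) j++;
    max_index[i] = j;
  }
}
```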
@@ -275,11 +275,11 @@ __attribute__((always_inline)) INLINE static vector pow_dimension_vec(
#if defined(HYDRO_DIMENSION_3D)
return (vector)(x.v * x.v * x.v);
return (vector)(vec_mul(vec_mul(x.v, x.v), x.v));
#elif defined(HYDRO_DIMENSION_2D)
return (vector)(x.v * x.v);
return (vector)(vec_mul(x.v, x.v));
#elif defined(HYDRO_DIMENSION_1D)
@@ -304,16 +304,16 @@ __attribute__((always_inline)) INLINE static vector pow_dimension_plus_one_vec(
#if defined(HYDRO_DIMENSION_3D)
const vector x2 = (vector)(x.v * x.v);
return (vector)(x2.v * x2.v);
const vector x2 = (vector)(vec_mul(x.v, x.v));
return (vector)(vec_mul(x2.v, x2.v));
#elif defined(HYDRO_DIMENSION_2D)
return (vector)(x.v * x.v * x.v);
return (vector)(vec_mul(x.v, vec_mul(x.v, x.v)));
#elif defined(HYDRO_DIMENSION_1D)
return (vector)(x.v * x.v);
return (vector)(vec_mul(x.v, x.v));
#else
......
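
The `x.v * x.v` → `vec_mul(x.v, x.v)` rewrites above (and throughout this merge) exist because the infix operators on vector types are a compiler extension that is not available for the 512-bit AVX-512 types, so each arithmetic operation is wrapped once per instruction set. A minimal sketch of such wrappers, assuming FMA hardware on the 256-bit path:

```c
#include <immintrin.h>

#ifdef HAVE_AVX512_F
#define vec_mul(a, b) _mm512_mul_ps(a, b)
#define vec_add(a, b) _mm512_add_ps(a, b)
#define vec_sub(a, b) _mm512_sub_ps(a, b)
#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c) /* a * b + c */
#else
#define vec_mul(a, b) _mm256_mul_ps(a, b)
#define vec_add(a, b) _mm256_add_ps(a, b)
#define vec_sub(a, b) _mm256_sub_ps(a, b)
#define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c) /* requires FMA */
#endif
```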
@@ -388,7 +388,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum,
vector *curlvySum, vector *curlvzSum,
vector mask, int knlMask) {
mask_t mask) {
vector r, ri, ui, wi, wi_dx;
vector mj;
@@ -432,49 +432,23 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
curlvry.v = vec_mul(curlvry.v, ri.v);
curlvrz.v = vec_mul(curlvrz.v, ri.v);
/* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F
rhoSum->v =
_mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
rho_dhSum->v =
_mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(ui.v, wi_dx.v))));
wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
wcount_dhSum->v = _mm512_mask_sub_ps(
rho_dhSum->v, knlMask, rho_dhSum->v,
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)));
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
curlvxSum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
curlvySum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
curlvzSum->v);
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(ui.v, wi_dx.v))),
mask.v);
wcountSum->v += vec_and(wi.v, mask.v);
wcount_dhSum->v -= vec_and(
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
#endif
vector wcount_dh_update;
wcount_dh_update.v =
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v));
/* Mask updates to intermediate vector sums for particle pi. */
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask);
wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask);
div_vSum->v =
vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
curlvxSum->v = vec_mask_add(curlvxSum->v,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvySum->v = vec_mask_add(curlvySum->v,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(curlvzSum->v,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
}
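
A hedged sketch of the `vec_mask_add`/`vec_mask_sub` helpers used above (consistent with the `mask_t` sketch in the description, not the exact SWIFT macros): AVX-512 has native masked operations that leave inactive lanes untouched, while AVX/AVX2 emulate them by zeroing the inactive lanes of the addend first.

```c
#ifdef HAVE_AVX512_F
/* Lanes whose mask bit is clear keep the old value of `sum`. */
#define vec_mask_add(sum, a, mask) _mm512_mask_add_ps(sum, mask, sum, a)
#define vec_mask_sub(sum, a, mask) _mm512_mask_sub_ps(sum, mask, sum, a)
#else
/* Zero the inactive lanes of `a`, then add/sub as usual. */
#define vec_mask_add(sum, a, mask) _mm256_add_ps(sum, _mm256_and_ps(a, mask))
#define vec_mask_sub(sum, a, mask) _mm256_sub_ps(sum, _mm256_and_ps(a, mask))
#endif
```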
/**
@@ -482,12 +456,14 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
* (non-symmetric vectorized version).
*/
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_2_vec_density(
float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
vector mask, vector mask2, int knlMask, int knlMask2) {
runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
vector hi_inv, vector vix, vector viy,
vector viz, float *Vjx, float *Vjy, float *Vjz,
float *Mj, vector *rhoSum, vector *rho_dhSum,
vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum,
vector *curlvySum, vector *curlvzSum,
mask_t mask, mask_t mask2, short mask_cond) {
vector r, ri, r2, ui, wi, wi_dx;
vector mj;
@@ -567,88 +543,66 @@ runner_iact_nonsym_2_vec_density(
curlvrz.v = vec_mul(curlvrz.v, ri.v);
curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);
/* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F
rhoSum->v =
_mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
rhoSum->v =
_mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v);
rho_dhSum->v =
_mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(ui.v, wi_dx.v))));
rho_dhSum->v =
_mm512_mask_sub_ps(rho_dhSum->v, knlMask2, rho_dhSum->v,
vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(ui2.v, wi_dx2.v))));
wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
wcountSum->v =
_mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v);
wcount_dhSum->v = _mm512_mask_sub_ps(
rho_dhSum->v, knlMask, rho_dhSum->v,
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)));
wcount_dhSum->v = _mm512_mask_sub_ps(
rho_dhSum->v, knlMask2, rho_dhSum->v,
vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)));
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v,
vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
curlvxSum->v);
curlvxSum->v = _mm512_mask_add_ps(
curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)),
curlvxSum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
curlvySum->v);
curlvySum->v = _mm512_mask_add_ps(
curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)),
curlvySum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
curlvzSum->v);
curlvzSum->v = _mm512_mask_add_ps(
curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)),
curlvzSum->v);
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(ui.v, wi_dx.v))),
mask.v);
rho_dhSum->v -=
vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
vec_mul(ui2.v, wi_dx2.v))),
mask2.v);
wcountSum->v += vec_and(wi.v, mask.v);
wcountSum->v += vec_and(wi2.v, mask2.v);
wcount_dhSum->v -= vec_and(
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v);
wcount_dhSum->v -= vec_and(
vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)),
mask2.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v);
curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
curlvxSum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v);
curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
curlvySum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v);
curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
curlvzSum->v +=
vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v);
#endif
vector wcount_dh_update, wcount_dh_update2;
wcount_dh_update.v =
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v));
wcount_dh_update2.v =
vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v));
/* Mask updates to intermediate vector sums for particle pi. */
/* Mask only when needed. */
if (mask_cond) {
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
rho_dhSum->v =
vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask);
rho_dhSum->v =
vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v), mask2);
wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask);
wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update2.v, mask2);
div_vSum->v = vec_mask_sub(div_vSum->v,
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
div_vSum->v = vec_mask_sub(
div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
curlvxSum->v = vec_mask_add(
curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
curlvxSum->v = vec_mask_add(
curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
curlvySum->v = vec_mask_add(
curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
curlvySum->v = vec_mask_add(
curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
curlvzSum->v = vec_mask_add(
curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
curlvzSum->v = vec_mask_add(
curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
} else {
rhoSum->v = vec_add(rhoSum->v, vec_mul(mj.v, wi.v));
rhoSum->v = vec_add(rhoSum->v, vec_mul(mj2.v, wi2.v));
rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v));
rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v));
wcountSum->v = vec_add(wcountSum->v, wi.v);
wcountSum->v = vec_add(wcountSum->v, wi2.v);
wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update.v);
wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update2.v);
div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
div_vSum->v =
vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));
curlvxSum->v =
vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)));
curlvxSum->v =
vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)));
curlvySum->v =
vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)));
curlvySum->v =
vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)));
curlvzSum->v =
vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)));
curlvzSum->v =
vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)));
}
}
#endif
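
The new `mask_cond` argument moves the mask/no-mask decision to one branch per call rather than paying the masking cost on every lane: interior iterations, where every lane holds a valid neighbour, pass 0 and take the plain `vec_add`/`vec_sub` path, while remainder iterations with padded lanes pass 1. A self-contained sketch of the pattern (AVX shown, names illustrative):

```c
#include <immintrin.h>

static inline __m256 masked_accumulate(__m256 sum, __m256 value,
                                       __m256 mask, int mask_cond) {
  if (mask_cond) /* remainder call: some lanes are padding */
    return _mm256_add_ps(sum, _mm256_and_ps(value, mask));
  else /* interior call: all lanes valid, skip the AND */
    return _mm256_add_ps(sum, value);
}
```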
@@ -755,7 +709,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
struct part **pj) {
#ifdef WITH_VECTORIZATION
#ifdef WITH_OLD_VECTORIZATION
vector r, r2, ri;
vector xi, xj;
@@ -1037,7 +991,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
struct part **pj) {
#ifdef WITH_VECTORIZATION
#ifdef WITH_OLD_VECTORIZATION
vector r, r2, ri;
vector xi, xj;
......
@@ -341,20 +341,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
/* ------------------------------------------------------------------------- */
#ifdef WITH_VECTORIZATION
static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv);
static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
static const vector kernel_gamma_inv_dim_vec =
FILL_VEC((float)kernel_gamma_inv_dim);
static const vector kernel_gamma_inv_dim_plus_one_vec =
FILL_VEC((float)kernel_gamma_inv_dim_plus_one);
#ifdef WITH_OLD_VECTORIZATION
/**
* @brief Computes the kernel function and its derivative (Vectorised version).
*
@@ -369,11 +356,12 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
/* Go to the range [0,1[ from [0,H[ */
vector x;
x.v = u->v * kernel_gamma_inv_vec.v;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
/* Load x and get the interval id. */
vector ind;
ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v));
ind.m =
vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v));
/* load the coefficients. */
vector c[kernel_degree + 1];
@@ -382,20 +370,36 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
/* Init the iteration for Horner's scheme. */
w->v = (c[0].v * x.v) + c[1].v;
w->v = vec_fma(c[0].v, x.v, c[1].v);
dw_dx->v = c[0].v;
/* And we're off! */
for (int k = 2; k <= kernel_degree; k++) {
dw_dx->v = (dw_dx->v * x.v) + w->v;
w->v = (x.v * w->v) + c[k].v;
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
w->v = vec_fma(x.v, w->v, c[k].v);
}
/* Return everything */
w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
dw_dx->v =
dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
w->v =
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
}
#endif
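
For reference, the scalar form of the Horner scheme vectorised in `kernel_deval_vec` above (a sketch; `c[]` holds the coefficients of the selected kernel interval in decreasing-power order):

```c
/* Evaluate w(x) and dw/dx in one pass: the derivative recurrence
 * reuses the partially built value of w at each step. */
static void horner_eval(const float *c, int degree, float x, float *w,
                        float *dw_dx) {
  *w = c[0] * x + c[1];
  *dw_dx = c[0];
  for (int k = 2; k <= degree; k++) {
    *dw_dx = *dw_dx * x + *w;
    *w = *w * x + c[k];
  }
}
```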
#ifdef WITH_VECTORIZATION
static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv);
static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
static const vector kernel_gamma_inv_dim_vec =
FILL_VEC((float)kernel_gamma_inv_dim);
static const vector kernel_gamma_inv_dim_plus_one_vec =
FILL_VEC((float)kernel_gamma_inv_dim_plus_one);
/* Define constant vectors for the Wendland C2 and Cubic Spline kernel
* coefficients. */
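
`FILL_VEC`, used for the constants above, broadcasts one scalar into every lane at compile time, which is what lets these kernel constants be `static const`. A minimal sketch for an 8-lane (AVX) build; the real macro is defined once per `VEC_SIZE`:

```c
#define VEC_SIZE 8
#define FILL_VEC(a) {a, a, a, a, a, a, a, a}

/* Illustrative union standing in for SWIFT's vector type. */
typedef union { float f[VEC_SIZE]; } vec_sketch;
static const vec_sketch half = {.f = FILL_VEC(0.5f)};
```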
@@ -468,15 +472,14 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector w2, dw_dx2;
vector mask_reg1, mask_reg2;
mask_t mask_reg;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
/* Form a mask for one part of the kernel. */
/* Only one region mask is needed: vec_blend keeps the first vector
 * wherever the mask is 0. */
vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
* using a mask. */
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
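
A hedged sketch of the `vec_blend` used in this function (illustrative, matching the comment above): lane i of the result is taken from the second operand where the mask selects it and from the first otherwise, which is why a single region mask suffices.

```c
#include <immintrin.h>

#ifdef HAVE_AVX512_F
#define vec_blend(mask, a, b) _mm512_mask_blend_ps(mask, a, b)
#else
/* blendv picks b in lanes where the mask sign bit is set, else a. */
#define vec_blend(mask, a, b) _mm256_blendv_ps(a, b, mask)
#endif
```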
@@ -495,20 +498,16 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
w->v = vec_and(w->v, mask_reg1.v);
w2.v = vec_and(w2.v, mask_reg2.v);
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v);
/* Added both w and w2 together to form complete result. */
w->v = vec_add(w->v, w2.v);
dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
/* Blend both kernel regions into one vector (mask out unneeded values). */
/* Only one region mask is needed: vec_blend keeps the first vector
 * wherever the mask is 0. */
w->v = vec_blend(mask_reg, w->v, w2.v);
dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
#else
#error
#error "Vectorisation not supported for this kernel!!!"
#endif
/* Return everything */
w->v =
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
@@ -580,15 +579,12 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
#elif defined(CUBIC_SPLINE_KERNEL)
vector w_2, dw_dx_2;
vector w2_2, dw_dx2_2;
vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
mask_t mask_reg, mask_reg_v2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */
;
/* Form a mask for one part of the kernel for each vector. */
/* Only one region mask is needed: vec_blend keeps the first vector
 * wherever the mask is 0. */
vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
vec_create_mask(mask_reg_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
@@ -622,29 +618,22 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c3.v);
w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
w->v = vec_and(w->v, mask_reg1.v);
w2->v = vec_and(w2->v, mask_reg1_v2.v);
w_2.v = vec_and(w_2.v, mask_reg2.v);
w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v);
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v);
dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v);
dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v);
/* Added both w and w2 together to form complete result. */
w->v = vec_add(w->v, w_2.v);
w2->v = vec_add(w2->v, w2_2.v);
dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v);
dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v);
/* Blend both kernel regions into one vector (mask out unneeded values). */
/* Only one region mask is needed: vec_blend keeps the first vector
 * wherever the mask is 0. */
w->v = vec_blend(mask_reg, w->v, w_2.v);
w2->v = vec_blend(mask_reg_v2, w2->v, w2_2.v);
dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx_2.v);
dw_dx2->v = vec_blend(mask_reg_v2, dw_dx2->v, dw_dx2_2.v);
/* Return everything */
w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
w2->v = w2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
dw_dx->v =
dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
dw_dx2->v =
dw_dx2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
w->v =
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
w2->v = vec_mul(w2->v,
vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v,