Commit 6079a299 authored by Matthieu Schaller

Merge branch 'master' of gitlab.cosma.dur.ac.uk:swift/swiftsim

parents 2d34a8e0 ff64197b
@@ -26,9 +26,11 @@
#include "cell.h"
#include "error.h"
#include "part.h"
#include "sort_part.h"
#include "vector.h"
#define NUM_VEC_PROC 2
#define CACHE_ALIGN 64
#define C2_CACHE_SIZE ((NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE))
#define C2_CACHE_ALIGN (sizeof(float) * VEC_SIZE)
@@ -37,28 +39,31 @@
struct cache {
/* Particle x position. */
float *restrict x __attribute__((aligned(sizeof(float) * VEC_SIZE)));
float *restrict x __attribute__((aligned(CACHE_ALIGN)));
/* Particle y position. */
float *restrict y __attribute__((aligned(sizeof(float) * VEC_SIZE)));
float *restrict y __attribute__((aligned(CACHE_ALIGN)));
/* Particle z position. */
float *restrict z __attribute__((aligned(sizeof(float) * VEC_SIZE)));
float *restrict z __attribute__((aligned(CACHE_ALIGN)));
/* Particle smoothing length. */
float *restrict h __attribute__((aligned(sizeof(float) * VEC_SIZE)));
float *restrict h __attribute__((aligned(CACHE_ALIGN)));
/* Particle mass. */
float *restrict m __attribute__((aligned(sizeof(float) * VEC_SIZE)));
float *restrict m __attribute__((aligned(CACHE_ALIGN)));
/* Particle x velocity. */
float *restrict vx __attribute__((aligned(sizeof(float) * VEC_SIZE)));
float *restrict vx __attribute__((aligned(CACHE_ALIGN)));
/* Particle y velocity. */
float *restrict vy __attribute__((aligned(sizeof(float) * VEC_SIZE)));
float *restrict vy __attribute__((aligned(CACHE_ALIGN)));
/* Particle z velocity. */
float *restrict vz __attribute__((aligned(sizeof(float) * VEC_SIZE)));
float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
/* Maximum distance of particles into neighbouring cell. */
float *restrict max_d __attribute__((aligned(CACHE_ALIGN)));
/* Cache size. */
int count;
@@ -102,10 +107,12 @@ struct c2_cache {
__attribute__((always_inline)) INLINE void cache_init(struct cache *c,
size_t count) {
/* Align cache on correct byte boundary and pad cache size to include 2 vector
* lengths for remainder operations. */
unsigned long alignment = sizeof(float) * VEC_SIZE;
unsigned int sizeBytes = (count + (2 * VEC_SIZE)) * sizeof(float);
/* Align cache on correct byte boundary and pad cache size to be a multiple
* of the vector size, with 2 extra vector lengths for remainder operations. */
unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
if (rem > 0) pad += VEC_SIZE - rem;
unsigned int sizeBytes = (count + pad) * sizeof(float);
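/* Worked example (assuming VEC_SIZE == 8): for count == 100, rem == 4, so
* pad == 16 + (8 - 4) == 20 and sizeBytes covers 120 floats -- a multiple of
* the vector length with two spare vectors for the remainder operations. */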
int error = 0;
/* Free memory if cache has already been allocated. */
@@ -118,16 +125,18 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
free(c->vy);
free(c->vz);
free(c->h);
free(c->max_d);
}
error += posix_memalign((void **)&c->x, alignment, sizeBytes);
error += posix_memalign((void **)&c->y, alignment, sizeBytes);
error += posix_memalign((void **)&c->z, alignment, sizeBytes);
error += posix_memalign((void **)&c->m, alignment, sizeBytes);
error += posix_memalign((void **)&c->vx, alignment, sizeBytes);
error += posix_memalign((void **)&c->vy, alignment, sizeBytes);
error += posix_memalign((void **)&c->vz, alignment, sizeBytes);
error += posix_memalign((void **)&c->h, alignment, sizeBytes);
error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes);
if (error != 0)
error("Couldn't allocate cache, no. of particles: %d", (int)count);
@@ -145,8 +154,11 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
#if defined(GADGET2_SPH)
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma vector aligned
#endif
for (int i = 0; i < ci->count; i++) {
ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
@@ -163,7 +175,247 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
}
/**
* @brief Clean the memory allocated by a #cache object.
* @brief Populate cache by reading in the particles from two cells in unsorted
* order.
*
* @param ci The i #cell.
* @param cj The j #cell.
* @param ci_cache The cache for cell ci.
* @param cj_cache The cache for cell cj.
* @param shift The amount to shift the particle positions to account for BCs.
*/
__attribute__((always_inline)) INLINE void cache_read_two_cells(
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const double *const shift) {
/* Shift the particle positions to a local frame (the ci frame) so single
* precision can be used instead of double precision. Also shift the particles
* of cell ci to account for the periodic boundary conditions, but leave cell
* cj as is. */
for (int i = 0; i < ci->count; i++) {
ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0] - shift[0];
ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1] - shift[1];
ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2] - shift[2];
ci_cache->h[i] = ci->parts[i].h;
ci_cache->m[i] = ci->parts[i].mass;
ci_cache->vx[i] = ci->parts[i].v[0];
ci_cache->vy[i] = ci->parts[i].v[1];
ci_cache->vz[i] = ci->parts[i].v[2];
}
for (int i = 0; i < cj->count; i++) {
cj_cache->x[i] = cj->parts[i].x[0] - ci->loc[0];
cj_cache->y[i] = cj->parts[i].x[1] - ci->loc[1];
cj_cache->z[i] = cj->parts[i].x[2] - ci->loc[2];
cj_cache->h[i] = cj->parts[i].h;
cj_cache->m[i] = cj->parts[i].mass;
cj_cache->vx[i] = cj->parts[i].v[0];
cj_cache->vy[i] = cj->parts[i].v[1];
cj_cache->vz[i] = cj->parts[i].v[2];
}
}
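/* Usage sketch (illustrative only, not part of this patch): a pair task
* would fill both caches up front and then iterate over the cached arrays.
* The guard macro is hypothetical and never defined; the zero shift assumes
* no periodic wrap between the two cells. */
#ifdef CACHE_USAGE_EXAMPLE
static void example_fill_pair_caches(const struct cell *ci,
                                     const struct cell *cj,
                                     struct cache *ci_cache,
                                     struct cache *cj_cache) {
  const double shift[3] = {0.0, 0.0, 0.0}; /* No periodic correction here. */
  cache_read_two_cells(ci, cj, ci_cache, cj_cache, shift);
}
#endif
/**
* @brief Populate cache by reading in the particles from one cell in sorted
* order.
*
* @param ci The #cell.
* @param ci_cache The #cache for cell ci.
* @param sort_i The array of sorted particle indices for cell ci.
* @param loc The cell location used to shift the positions to a local frame.
* @param shift The amount to shift the particle positions to account for BCs.
*/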
__attribute__((always_inline)) INLINE void cache_read_cell_sorted(
const struct cell *const ci, struct cache *const ci_cache,
const struct entry *restrict sort_i, double *const loc,
double *const shift) {
int idx;
/* Shift the particle positions to a local frame so single precision can be
* used instead of double precision. Also shift the positions to account for
* the periodic boundary conditions. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma simd
#endif
for (int i = 0; i < ci->count; i++) {
idx = sort_i[i].i;
ci_cache->x[i] = ci->parts[idx].x[0] - loc[0] - shift[0];
ci_cache->y[i] = ci->parts[idx].x[1] - loc[1] - shift[1];
ci_cache->z[i] = ci->parts[idx].x[2] - loc[2] - shift[2];
ci_cache->h[i] = ci->parts[idx].h;
ci_cache->m[i] = ci->parts[idx].mass;
ci_cache->vx[i] = ci->parts[idx].v[0];
ci_cache->vy[i] = ci->parts[idx].v[1];
ci_cache->vz[i] = ci->parts[idx].v[2];
}
}
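/* Note: unlike cache_read_two_cells() above, this variant takes loc and
* shift explicitly, so the caller chooses the reference frame instead of
* implicitly using ci->loc. */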
/**
* @brief Populate cache by reading in the particles from two cells in sorted
* order.
*
* @param ci The i #cell.
* @param cj The j #cell.
* @param ci_cache The #cache for cell ci.
* @param cj_cache The #cache for cell cj.
* @param sort_i The array of sorted particle indices for cell ci.
* @param sort_j The array of sorted particle indices for cell cj.
* @param shift The amount to shift the particle positions to account for BCs.
*/
__attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const struct entry *restrict sort_i, const struct entry *restrict sort_j,
const double *const shift) {
int idx;
/* Shift the particle positions to a local frame (the ci frame) so single
* precision can be used instead of double precision. Also shift the particles
* of cell ci to account for the periodic boundary conditions, but leave cell
* cj as is. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma simd
#endif
for (int i = 0; i < ci->count; i++) {
idx = sort_i[i].i;
ci_cache->x[i] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
ci_cache->y[i] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
ci_cache->z[i] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
ci_cache->h[i] = ci->parts[idx].h;
ci_cache->m[i] = ci->parts[idx].mass;
ci_cache->vx[i] = ci->parts[idx].v[0];
ci_cache->vy[i] = ci->parts[idx].v[1];
ci_cache->vz[i] = ci->parts[idx].v[2];
}
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma simd
#endif
for (int i = 0; i < cj->count; i++) {
idx = sort_j[i].i;
cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
cj_cache->h[i] = cj->parts[idx].h;
cj_cache->m[i] = cj->parts[idx].mass;
cj_cache->vx[i] = cj->parts[idx].v[0];
cj_cache->vy[i] = cj->parts[idx].v[1];
cj_cache->vz[i] = cj->parts[idx].v[2];
}
}
/**
* @brief Populate caches by only reading particles that are within range of
* each other in the adjoining cell. Also read the particles into the cache
* in sorted order.
*
* @param ci The i #cell.
* @param cj The j #cell.
* @param ci_cache The #cache for cell ci.
* @param cj_cache The #cache for cell cj.
* @param sort_i The array of sorted particle indices for cell ci.
* @param sort_j The array of sorted particle indices for cell cj.
* @param shift The amount to shift the particle positions to account for BCs.
* @param first_pi The first particle in cell ci that is in range.
* @param last_pj The last particle in cell cj that is in range.
* @param num_vec_proc Number of vectors that will be used to process the
* interaction.
*/
__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const struct entry *restrict sort_i, const struct entry *restrict sort_j,
const double *const shift, int *first_pi, int *last_pj,
const int num_vec_proc) {
int idx, ci_cache_idx;
/* Pad number of particles read to the vector size. */
int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
if (rem != 0) {
int pad = (num_vec_proc * VEC_SIZE) - rem;
if (*first_pi - pad >= 0) *first_pi -= pad;
}
rem = *last_pj % (num_vec_proc * VEC_SIZE);
if (rem != 0) {
int pad = (num_vec_proc * VEC_SIZE) - rem;
if (*last_pj + pad < cj->count) *last_pj += pad;
}
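/* Worked example (assuming VEC_SIZE == 8 and num_vec_proc == 2): with
* ci->count == 100 and *first_pi == 23 we have (100 - 23) % 16 == 13, so
* *first_pi is lowered by 3 to 20, leaving 80 particles -- exactly five
* double-vector strides -- to be read into the cache. */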
int first_pi_align = *first_pi;
int last_pj_align = *last_pj;
/* Shift the particle positions to a local frame (the ci frame) so single
* precision can be used instead of double precision. Also shift the particles
* of cell ci to account for the periodic boundary conditions, but leave cell
* cj as is. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma vector aligned
#endif
for (int i = first_pi_align; i < ci->count; i++) {
/* Make sure ci_cache is filled from the first element. */
ci_cache_idx = i - first_pi_align;
idx = sort_i[i].i;
ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
}
/* Pad the cache with fake particles that exist outside the cell so they will
* not interact: their x-position is set to twice that of the last particle in
* the sorted list. */
float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0];
for (int i = ci->count - first_pi_align;
i < ci->count - first_pi_align + VEC_SIZE; i++) {
ci_cache->x[i] = fake_pix;
ci_cache->y[i] = 1.f;
ci_cache->z[i] = 1.f;
ci_cache->h[i] = 1.f;
ci_cache->m[i] = 1.f;
ci_cache->vx[i] = 1.f;
ci_cache->vy[i] = 1.f;
ci_cache->vz[i] = 1.f;
}
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma vector aligned
#endif
for (int i = 0; i <= last_pj_align; i++) {
idx = sort_j[i].i;
cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
cj_cache->h[i] = cj->parts[idx].h;
cj_cache->m[i] = cj->parts[idx].mass;
cj_cache->vx[i] = cj->parts[idx].v[0];
cj_cache->vy[i] = cj->parts[idx].v[1];
cj_cache->vz[i] = cj->parts[idx].v[2];
}
/* Pad the cache with fake particles that exist outside the cell so they will
* not interact: their x-position is set to twice that of the last particle in
* the sorted list. */
float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
cj_cache->x[i] = fake_pjx;
cj_cache->y[i] = 1.f;
cj_cache->z[i] = 1.f;
cj_cache->h[i] = 1.f;
cj_cache->m[i] = 1.f;
cj_cache->vx[i] = 1.f;
cj_cache->vy[i] = 1.f;
cj_cache->vz[i] = 1.f;
}
}
/**
* @brief Clean the memory allocated by a #cache object.
*
* @param c The #cache to clean.
*/
@@ -177,6 +429,7 @@ static INLINE void cache_clean(struct cache *c) {
free(c->vy);
free(c->vz);
free(c->h);
free(c->max_d);
}
}
@@ -4264,9 +4264,11 @@ void engine_init(struct engine *e, struct space *s,
e->runners[k].qid = k * nr_queues / e->nr_threads;
}
/* Allocate particle cache. */
e->runners[k].par_cache.count = 0;
cache_init(&e->runners[k].par_cache, CACHE_SIZE);
/* Allocate particle caches. */
e->runners[k].ci_cache.count = 0;
e->runners[k].cj_cache.count = 0;
cache_init(&e->runners[k].ci_cache, CACHE_SIZE);
cache_init(&e->runners[k].cj_cache, CACHE_SIZE);
if (verbose) {
if (with_aff)
@@ -4353,7 +4355,8 @@ void engine_compute_next_snapshot_time(struct engine *e) {
*/
void engine_clean(struct engine *e) {
for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].par_cache);
for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].ci_cache);
for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].cj_cache);
free(e->runners);
free(e->snapshotUnits);
free(e->links);
@@ -32,6 +32,7 @@
* Gadget-2 tree-code neighbours search.
*/
#include "cache.h"
#include "minmax.h"
/**
@@ -150,7 +151,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density(
for (k = 0; k < 3; k++)
dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
#else
error("Unknown vector size.")
error("Unknown vector size.");
#endif
/* Get the radius and inverse radius. */
@@ -317,7 +318,7 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
for (k = 0; k < 3; k++)
dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
#else
error("Unknown vector size.")
error("Unknown vector size.");
#endif
/* Get the radius and inverse radius. */
@@ -374,6 +375,106 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
}
#ifdef WITH_VECTORIZATION
/**
* @brief Density interaction computed using 1 vector
* (non-symmetric vectorized version).
*/
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
vector hi_inv, vector vix, vector viy,
vector viz, float *Vjx, float *Vjy, float *Vjz,
float *Mj, vector *rhoSum, vector *rho_dhSum,
vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum,
vector *curlvySum, vector *curlvzSum,
vector mask, int knlMask) {
vector r, ri, xi, wi, wi_dx;
vector mj;
vector dvx, dvy, dvz;
vector vjx, vjy, vjz;
vector dvdr;
vector curlvrx, curlvry, curlvrz;
/* Fill the vectors. */
mj.v = vec_load(Mj);
vjx.v = vec_load(Vjx);
vjy.v = vec_load(Vjy);
vjz.v = vec_load(Vjz);
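/* Note: vec_load() is an aligned load, so Vjx, Vjy, Vjz and Mj must point
* into arrays allocated with the cache alignment from cache.h (an assumption
* based on the aligned allocations there; misaligned pointers would fault on
* most targets). */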
/* Get the radius and inverse radius. */
ri = vec_reciprocal_sqrt(*r2);
r.v = vec_mul(r2->v, ri.v);
xi.v = vec_mul(r.v, hi_inv.v);
/* Calculate the kernel for two particles. */
kernel_deval_1_vec(&xi, &wi, &wi_dx);
/* Compute dv. */
dvx.v = vec_sub(vix.v, vjx.v);
dvy.v = vec_sub(viy.v, vjy.v);
dvz.v = vec_sub(viz.v, vjz.v);
/* Compute dv dot r */
dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v)));
dvdr.v = vec_mul(dvdr.v, ri.v);
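/* dvdr now holds (dv . dx) / |dx|, i.e. the velocity difference projected
* onto the unit separation vector. */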
/* Compute dv cross r */
curlvrx.v =
vec_fma(dvy.v, dz->v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy->v)));
curlvry.v =
vec_fma(dvz.v, dx->v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz->v)));
curlvrz.v =
vec_fma(dvx.v, dy->v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx->v)));
curlvrx.v = vec_mul(curlvrx.v, ri.v);
curlvry.v = vec_mul(curlvry.v, ri.v);
curlvrz.v = vec_mul(curlvrz.v, ri.v);
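/* (curlvrx, curlvry, curlvrz) is dv x dx scaled by 1/|dx|, i.e. the cross
* product of the velocity difference with the unit separation vector. */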
/* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F
rhoSum->v =
_mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
rho_dhSum->v =
_mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))));
wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));
div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
curlvxSum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
curlvySum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
curlvzSum->v);
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))),
mask.v);
wcountSum->v += vec_and(wi.v, mask.v);
wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
#endif
}
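/* Masking note: on targets without AVX-512 the mask holds all-ones bits in
* active lanes and zeros elsewhere, so the vec_and() calls above zero the
* contributions of inactive lanes before accumulation; on AVX-512 the same
* selection is done through knlMask and the _mm512_mask_* intrinsics. */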
/**
* @brief Density interaction computed using 2 interleaved vectors
* (non-symmetric vectorized version).
@@ -744,7 +845,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force(
vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
pj[2]->force.balsara, pj[3]->force.balsara);
#else
error("Unknown vector size.")
error("Unknown vector size.");
#endif
/* Get the radius and inverse radius. */
@@ -1023,7 +1124,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
pj[2]->force.balsara, pj[3]->force.balsara);
#else
error("Unknown vector size.")
error("Unknown vector size.");
#endif
/* Get the radius and inverse radius. */
@@ -304,6 +304,41 @@ __attribute__((always_inline)) INLINE static void kernel_eval(
*W = w * kernel_constant * kernel_gamma_inv_dim;
}
/**
* @brief Computes the kernel function derivative.
*
* The kernel function needs to be multiplied by \f$h^{-d}\f$ and the gradient
* by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem.
*
* Returns 0 if \f$u > \gamma = H/h\f$.
*
* @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
* @param dW_dx (return) The norm of the gradient, \f$|\nabla W(x,h)|\f$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
float u, float *restrict dW_dx) {
/* Go to the range [0,1[ from [0,H[ */
const float x = u * kernel_gamma_inv;
/* Pick the correct branch of the kernel */
const int temp = (int)(x * kernel_ivals_f);
const int ind = temp > kernel_ivals ? kernel_ivals : temp;
const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
/* First two terms of the polynomial ... */
float dw_dx = ((float)kernel_degree * coeffs[0] * x) +
(float)(kernel_degree - 1) * coeffs[1];
/* ... and the rest of them */
for (int k = 2; k < kernel_degree; k++) {
dw_dx = dw_dx * x + (float)(kernel_degree - k) * coeffs[k];
}
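/* The seed statement and the loop above evaluate the derivative of the
* interval polynomial w(x) = sum_k coeffs[k] * x^(degree - k) by Horner's
* scheme: dw/dx = sum_{k < degree} (degree - k) * coeffs[k] * x^(degree-k-1). */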
/* Return everything */
*dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one;
}
/* ------------------------------------------------------------------------- */
#ifdef WITH_VECTORIZATION
@@ -362,7 +397,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
}
/* Define constant vectors for the Wendland C2 kernel coefficients. */
/* Define constant vectors for the Wendland C2 and Cubic Spline kernel
* coefficients. */
#ifdef WENDLAND_C2_KERNEL
static const vector wendland_const_c0 = FILL_VEC(4.f);
static const vector wendland_const_c1 = FILL_VEC(-15.f);
@@ -370,8 +406,115 @@ static const vector wendland_const_c2 = FILL_VEC(20.f);
static const vector wendland_const_c3 = FILL_VEC(-10.f);
static const vector wendland_const_c4 = FILL_VEC(0.f);
static const vector wendland_const_c5 = FILL_VEC(1.f);